From 25e74b479a42ce6301e37c4f09c433aba5f129a7 Mon Sep 17 00:00:00 2001 From: Kristen Laricchia Date: Mon, 7 Oct 2024 14:27:38 -0400 Subject: [PATCH 1/8] add federated validity check --- .../federated_validity_checks.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 gnomad_qc/v5/data_ingestion/federated_validity_checks.py diff --git a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py new file mode 100644 index 000000000..365a78160 --- /dev/null +++ b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py @@ -0,0 +1,50 @@ +"""Script to perform validity checks on federated data.""" + +import argparse +import logging + +import hail as hl + +from gnomad.assessment.validity_checks import summarize_variants +from gnomad.resources.grch38.gnomad import public_release + + +logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") +logger = logging.getLogger("federated_validity_checks") +logger.setLevel(logging.INFO) + + +def main(args): + """Perform validity checks for federated data.""" + hl.init( + log="/federated_validity_checks.log", + tmp_dir="gs://gnomad-tmp-4day", + ) + hl.default_reference("GRCh38") + test = args.test + + # TODO: Add resources to intake federated data once obtained. 
+ if test: + ht = public_release(data_type="exomes").ht() + ht = ht._filter_partitions(range(2)) + + expected_contigs = [ + i + for i in hl.get_reference("GRCh38").contigs + if i in [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY"] + ] + logger.info("Summarizing variants and checking contigs.") + summarize_variants(ht, expected_contigs=expected_contigs) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--test", + help="Use the first two parititons of the gnomAD v4 exomes public release for testing.", + action="store_true", + ) + + args = parser.parse_args() + main(args) From 7abcd7bd3c01126f0c8c5ebf251130ba5e47af33 Mon Sep 17 00:00:00 2001 From: Kristen Laricchia Date: Tue, 8 Oct 2024 13:26:50 -0400 Subject: [PATCH 2/8] PR suggestions --- .../federated_validity_checks.py | 40 ++++++++++++------- gnomad_qc/v5/resources/basics.py | 23 +++++++++++ 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py index 365a78160..847b31235 100644 --- a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py +++ b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py @@ -7,6 +7,7 @@ from gnomad.assessment.validity_checks import summarize_variants from gnomad.resources.grch38.gnomad import public_release +from gnomad_qc.v5.resources.basics import get_logging_path logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") @@ -21,29 +22,40 @@ def main(args): tmp_dir="gs://gnomad-tmp-4day", ) hl.default_reference("GRCh38") - test = args.test + test_n_partitions = args.test_n_partitions - # TODO: Add resources to intake federated data once obtained. - if test: + try: + # TODO: Add resources to intake federated data once obtained. 
+ ht = public_release(data_type="exomes").ht() - ht = ht._filter_partitions(range(2)) - expected_contigs = [ - i - for i in hl.get_reference("GRCh38").contigs - if i in [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY"] - ] - logger.info("Summarizing variants and checking contigs.") - summarize_variants(ht, expected_contigs=expected_contigs) + if test_n_partitions: + ht = ht._filter_partitions(range(test_n_partitions)) + + expected_contigs = [ + i + for i in hl.get_reference("GRCh38").contigs + if i in [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY"] + ] + logger.info("Summarizing variants and checking contigs.") + summarize_variants(ht, expected_contigs=expected_contigs) + + finally: + logger.info("Copying hail log to logging bucket...") + hl.copy_log(get_logging_path("federated_validity_checks")) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--test", - help="Use the first two parititons of the gnomAD v4 exomes public release for testing.", - action="store_true", + "--test-n-partitions", + help=( + "Use only N partitions of the input for testing purposes. Defaults " + "to 2 if passed without a value." + ), + nargs="?", + const=2, + type=int, ) args = parser.parse_args() diff --git a/gnomad_qc/v5/resources/basics.py b/gnomad_qc/v5/resources/basics.py index 5a4053ede..70b633226 100644 --- a/gnomad_qc/v5/resources/basics.py +++ b/gnomad_qc/v5/resources/basics.py @@ -6,3 +6,26 @@ dragen_tgp_vds = VariantDatasetResource( "gs://gnomad/v5.0/testing/genomes/dragen_tgp_v5.0_test.vds" ) + + +# TODO: Change default to CURRENT_VERSION. +def qc_temp_prefix(version: str = "5.0") -> str: + """ + Return path to temporary QC bucket. + + :param version: Version of annotation path to return. + :return: Path to bucket with temporary QC data. + """ + return f"gs://gnomad-tmp/gnomad.genomes.v{version}.qc_data/" + + +# TODO: Change default to CURRENT_VERSION. 
+def get_logging_path(name: str, version: str = "5.0") -> str: + """ + Create a path for Hail log files. + + :param name: Name of log file. + :param version: Version of annotation path to return. + :return: Output log path. + """ + return f"{qc_temp_prefix(version)}{name}.log" From 9450302a1e81dcfaf616d5c02352804847264c7d Mon Sep 17 00:00:00 2001 From: Kristen Laricchia Date: Tue, 8 Oct 2024 13:32:52 -0400 Subject: [PATCH 3/8] small edit --- gnomad_qc/v5/data_ingestion/federated_validity_checks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py index 847b31235..4c71afb23 100644 --- a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py +++ b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py @@ -4,7 +4,6 @@ import logging import hail as hl - from gnomad.assessment.validity_checks import summarize_variants from gnomad.resources.grch38.gnomad import public_release from gnomad_qc.v5.resources.basics import get_logging_path From 6a8f7c6e407236328542e6b7d6efc51d866938d8 Mon Sep 17 00:00:00 2001 From: Kristen Laricchia Date: Tue, 8 Oct 2024 13:40:09 -0400 Subject: [PATCH 4/8] small edit --- gnomad_qc/v5/data_ingestion/federated_validity_checks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py index 4c71afb23..beeefdc19 100644 --- a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py +++ b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py @@ -6,6 +6,7 @@ import hail as hl from gnomad.assessment.validity_checks import summarize_variants from gnomad.resources.grch38.gnomad import public_release + from gnomad_qc.v5.resources.basics import get_logging_path From 69eaf3f85fbadc9581964de9fe44829971ed278e Mon Sep 17 00:00:00 2001 From: Kristen Laricchia Date: Tue, 8 Oct 2024 13:47:59 -0400 Subject: [PATCH 5/8] small edit --- 
gnomad_qc/v5/data_ingestion/federated_validity_checks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py index beeefdc19..734e77ce3 100644 --- a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py +++ b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py @@ -6,10 +6,8 @@ import hail as hl from gnomad.assessment.validity_checks import summarize_variants from gnomad.resources.grch38.gnomad import public_release - from gnomad_qc.v5.resources.basics import get_logging_path - logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger("federated_validity_checks") logger.setLevel(logging.INFO) From aa3f97ff22b879409b1da389e73b2837b0b46989 Mon Sep 17 00:00:00 2001 From: Kristen Laricchia Date: Tue, 8 Oct 2024 13:59:18 -0400 Subject: [PATCH 6/8] isort --- gnomad_qc/v5/data_ingestion/federated_validity_checks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py index 734e77ce3..5da8403b0 100644 --- a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py +++ b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py @@ -6,6 +6,7 @@ import hail as hl from gnomad.assessment.validity_checks import summarize_variants from gnomad.resources.grch38.gnomad import public_release + from gnomad_qc.v5.resources.basics import get_logging_path logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") From 7467ef73903042b8d6977a2159a3b379631206f1 Mon Sep 17 00:00:00 2001 From: Kristen Laricchia Date: Tue, 8 Oct 2024 16:03:13 -0400 Subject: [PATCH 7/8] add logger and update default --- gnomad_qc/v5/data_ingestion/federated_validity_checks.py | 1 + gnomad_qc/v5/resources/basics.py | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py index 5da8403b0..807e4ec47 100644 --- a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py +++ b/gnomad_qc/v5/data_ingestion/federated_validity_checks.py @@ -28,6 +28,7 @@ def main(args): ht = public_release(data_type="exomes").ht() if test_n_partitions: + logger.info("Filtering to %d partitions.", test_n_partitions) ht = ht._filter_partitions(range(test_n_partitions)) expected_contigs = [ diff --git a/gnomad_qc/v5/resources/basics.py b/gnomad_qc/v5/resources/basics.py index 70b633226..fb63e3bc8 100644 --- a/gnomad_qc/v5/resources/basics.py +++ b/gnomad_qc/v5/resources/basics.py @@ -1,6 +1,7 @@ """Script containing generic resources.""" from gnomad.resources.resource_utils import VariantDatasetResource +from gnomad_qc.v5.resources.constants import CURRENT_VERSION # v5 DRAGEN TGP test VDS. dragen_tgp_vds = VariantDatasetResource( @@ -8,8 +9,7 @@ ) -# TODO: Change default to CURRENT_VERSION. -def qc_temp_prefix(version: str = "5.0") -> str: +def qc_temp_prefix(version: str = CURRENT_VERSION) -> str: """ Return path to temporary QC bucket. @@ -19,8 +19,7 @@ def qc_temp_prefix(version: str = "5.0") -> str: return f"gs://gnomad-tmp/gnomad.genomes.v{version}.qc_data/" -# TODO: Change default to CURRENT_VERSION. -def get_logging_path(name: str, version: str = "5.0") -> str: +def get_logging_path(name: str, version: str = CURRENT_VERSION) -> str: """ Create a path for Hail log files. 
From 7f1f77cbfaea0ea4cc34ded39447b2258b34dbca Mon Sep 17 00:00:00 2001 From: Kristen Laricchia Date: Tue, 8 Oct 2024 16:06:04 -0400 Subject: [PATCH 8/8] isort --- gnomad_qc/v5/resources/basics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gnomad_qc/v5/resources/basics.py b/gnomad_qc/v5/resources/basics.py index fb63e3bc8..73c40f9fb 100644 --- a/gnomad_qc/v5/resources/basics.py +++ b/gnomad_qc/v5/resources/basics.py @@ -1,6 +1,7 @@ """Script containing generic resources.""" from gnomad.resources.resource_utils import VariantDatasetResource + from gnomad_qc.v5.resources.constants import CURRENT_VERSION # v5 DRAGEN TGP test VDS.