Skip to content

Commit

Permalink
Merge pull request #640 from broadinstitute/kl/validity_fed
Browse files Browse the repository at this point in the history
Add federated validity check
  • Loading branch information
klaricch authored Oct 15, 2024
2 parents 27fbe56 + 7f1f77c commit 6fa7895
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 0 deletions.
62 changes: 62 additions & 0 deletions gnomad_qc/v5/data_ingestion/federated_validity_checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""Script to perform validity checks on federated data for gnomAD v5."""

import argparse
import logging

import hail as hl
from gnomad.assessment.validity_checks import summarize_variants
from gnomad.resources.grch38.gnomad import public_release

from gnomad_qc.v5.resources.basics import get_logging_path

# Set the log line format globally, then create this script's named logger and
# let it emit INFO-level messages and above.
logging.basicConfig(
    format="%(levelname)s (%(name)s %(lineno)s): %(message)s",
)
logger = logging.getLogger("federated_validity_checks")
logger.setLevel(logging.INFO)


def main(args):
    """
    Perform validity checks for federated data.

    Initializes Hail with GRCh38 as the default reference, loads the input
    Table, optionally restricts it to the first N partitions for testing, and
    runs ``summarize_variants`` against the expected contigs. The Hail log is
    copied to the logging path whether or not the checks succeed.

    :param args: Parsed command-line arguments. Only ``test_n_partitions`` is
        read.
    :return: None.
    """
    hl.init(
        log="/federated_validity_checks.log",
        tmp_dir="gs://gnomad-tmp-4day",
    )
    hl.default_reference("GRCh38")
    test_n_partitions = args.test_n_partitions

    try:
        # TODO: Add resources to intake federated data once obtained.
        # Until then, the public exomes release Table stands in as the input.
        ht = public_release(data_type="exomes").ht()

        if test_n_partitions:
            logger.info("Filtering to %d partitions.", test_n_partitions)
            ht = ht._filter_partitions(range(test_n_partitions))

        # Build the allowed contig names once (chr1-chr22, chrX, chrY) instead
        # of rebuilding the list for every contig in the reference.
        allowed_contigs = {f"chr{n}" for n in range(1, 23)} | {"chrX", "chrY"}
        # Iterate the reference's contigs so the result keeps reference order.
        expected_contigs = [
            contig
            for contig in hl.get_reference("GRCh38").contigs
            if contig in allowed_contigs
        ]
        logger.info("Summarizing variants and checking contigs.")
        summarize_variants(ht, expected_contigs=expected_contigs)

    finally:
        # Always preserve the Hail log, even when a check above raises.
        logger.info("Copying hail log to logging bucket...")
        hl.copy_log(get_logging_path("federated_validity_checks"))


if __name__ == "__main__":
    # Command-line entry point: parse the optional testing flag and run the
    # validity checks.
    parser = argparse.ArgumentParser()

    # With nargs="?" the flag may be given bare (value becomes const=2), given
    # with an explicit integer, or omitted entirely (value is None).
    parser.add_argument(
        "--test-n-partitions",
        help=(
            "Use only N partitions of the input for testing purposes. Defaults "
            "to 2 if passed without a value."
        ),
        nargs="?",
        const=2,
        type=int,
    )

    args = parser.parse_args()
    main(args)
23 changes: 23 additions & 0 deletions gnomad_qc/v5/resources/basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,30 @@

from gnomad.resources.resource_utils import VariantDatasetResource

from gnomad_qc.v5.resources.constants import CURRENT_VERSION

# Resource for the v5 DRAGEN TGP test VDS, located under the v5.0
# testing/genomes path of the gnomad bucket.
dragen_tgp_vds = VariantDatasetResource(
    "gs://gnomad/v5.0/testing/genomes/dragen_tgp_v5.0_test.vds"
)


def qc_temp_prefix(version: str = CURRENT_VERSION) -> str:
    """
    Build the temporary QC bucket prefix for a given version.

    :param version: Version string interpolated into the path. Defaults to
        ``CURRENT_VERSION``.
    :return: Prefix, ending in a slash, of the bucket location holding
        temporary QC data.
    """
    qc_dir = f"gnomad.genomes.v{version}.qc_data"
    return f"gs://gnomad-tmp/{qc_dir}/"


def get_logging_path(name: str, version: str = CURRENT_VERSION) -> str:
    """
    Build the destination path for a Hail log file.

    The log is placed directly under the temporary QC prefix for ``version``.

    :param name: Base name of the log file, without the ``.log`` extension.
    :param version: Version used to select the temporary QC prefix. Defaults
        to ``CURRENT_VERSION``.
    :return: Full path of the log file.
    """
    log_prefix = qc_temp_prefix(version)
    return f"{log_prefix}{name}.log"

0 comments on commit 6fa7895

Please sign in to comment.