From 52a4eaac27ced3ee8f15ee8c2a1ac0034c74860f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 30 Oct 2023 16:50:33 +0000 Subject: [PATCH 01/40] draft streaming with generators --- pgscatalog_utils/download/GenomeBuild.py | 12 ++ .../scorefile/combine_scorefiles.py | 157 +++--------------- pgscatalog_utils/scorefile/config.py | 9 + pgscatalog_utils/scorefile/effect_type.py | 34 ---- pgscatalog_utils/scorefile/effect_weight.py | 49 ------ pgscatalog_utils/scorefile/genome_build.py | 24 --- pgscatalog_utils/scorefile/harmonised.py | 30 ---- pgscatalog_utils/scorefile/header.py | 80 +++++++++ pgscatalog_utils/scorefile/liftover.py | 103 ------------ pgscatalog_utils/scorefile/qc.py | 103 +++--------- pgscatalog_utils/scorefile/read.py | 79 --------- pgscatalog_utils/scorefile/scoringfile.py | 144 ++++++++++++++++ pgscatalog_utils/scorefile/write.py | 43 ----- 13 files changed, 292 insertions(+), 575 deletions(-) create mode 100644 pgscatalog_utils/scorefile/config.py delete mode 100644 pgscatalog_utils/scorefile/effect_type.py delete mode 100644 pgscatalog_utils/scorefile/effect_weight.py delete mode 100644 pgscatalog_utils/scorefile/genome_build.py delete mode 100644 pgscatalog_utils/scorefile/harmonised.py create mode 100644 pgscatalog_utils/scorefile/header.py delete mode 100644 pgscatalog_utils/scorefile/liftover.py delete mode 100644 pgscatalog_utils/scorefile/read.py create mode 100644 pgscatalog_utils/scorefile/scoringfile.py delete mode 100644 pgscatalog_utils/scorefile/write.py diff --git a/pgscatalog_utils/download/GenomeBuild.py b/pgscatalog_utils/download/GenomeBuild.py index 419c3f2..893bf97 100644 --- a/pgscatalog_utils/download/GenomeBuild.py +++ b/pgscatalog_utils/download/GenomeBuild.py @@ -4,3 +4,15 @@ class GenomeBuild(Enum): GRCh37 = auto() GRCh38 = auto() + + @classmethod + def from_string(cls, build): + match build: + case 'GRCh37' | 'hg18': + return cls(GenomeBuild.GRCh37) + case 'GRCh38' | 'hg19': + return cls(GenomeBuild.GRCh38) + 
case 'NR': + return None + case _: + raise Exception \ No newline at end of file diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 9465484..7dd5fc8 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -1,39 +1,13 @@ import argparse import logging -import os import sys import textwrap -import json +import time from pgscatalog_utils.config import set_logging_level -from pgscatalog_utils.scorefile.effect_type import set_effect_type -from pgscatalog_utils.scorefile.effect_weight import melt_effect_weights -from pgscatalog_utils.scorefile.genome_build import build2GRC -from pgscatalog_utils.scorefile.harmonised import remap_harmonised -from pgscatalog_utils.scorefile.liftover import liftover -from pgscatalog_utils.scorefile.qc import quality_control -from pgscatalog_utils.scorefile.read import load_scorefile, get_scorefile_basename -from pgscatalog_utils.scorefile.write import write_scorefile - - -headers2logs = [ - 'pgs_id', - 'pgp_id', - 'pgs_name', - 'genome_build', - 'variants_number', - 'trait_reported', - 'trait_efo', - 'trait_mapped', - 'weight_type', - 'citation' -] -headers2logs_harmonisation = [ - 'HmPOS_build', - 'HmPOS_date', - 'HmPOS_match_chr', - 'HmPOS_match_pos' -] +from pgscatalog_utils.download.GenomeBuild import GenomeBuild +from pgscatalog_utils.scorefile.scoringfile import ScoringFile + def combine_scorefiles(): args = _parse_args() @@ -44,103 +18,21 @@ def combine_scorefiles(): paths: list[str] = list(set(args.scorefiles)) # unique paths only logger.debug(f"Input scorefiles: {paths}") - if os.path.exists(args.outfile): - logger.critical(f"Output file {args.outfile} already exists") + start_time = time.time() + sfs = [ScoringFile.from_path(x) for x in paths] + + target_build = GenomeBuild.from_string(args.target_build) + bad_builds = [x.name for x in sfs if x.genome_build != target_build] + for bad_file in 
bad_builds: + logger.critical(f"{bad_file} doesn't match {target_build}, can't combine") raise Exception + else: + logger.info(f"All builds match target build {target_build}") - # Score header logs - init - score_logs = {} - dir_output = os.path.dirname(args.outfile) - if dir_output == '': - dir_output = './' - elif dir_output.endswith('/') is False: - dir_output += '/' - json_logs_file = dir_output + args.logfile - - for x in paths: - # Read scorefile df and header - h, score = load_scorefile(x) - score_shape_original = score.shape - - if score.empty: - logger.critical(f"Empty scorefile {x} detected! Please check the input data") - raise Exception - - # Check if we should use the harmonized positions - use_harmonised = False - current_build = None - if h.get('HmPOS_build') is not None: - if h.get('HmPOS_build') == args.target_build: - use_harmonised = True - current_build = h.get('HmPOS_build') - else: - logger.error( - f"Cannot combine {x} (harmonized to {h.get('HmPOS_build')}) in target build {args.target_build}") - raise Exception - - # Process/QC score and check variant columns - score = (score.pipe(remap_harmonised, use_harmonised=use_harmonised) - .pipe(quality_control, drop_missing=args.drop_missing) - .pipe(melt_effect_weights) - .pipe(set_effect_type)) - - # Annotate score with the genome_build (in GRCh notation) - if current_build is None: - current_build = build2GRC(h.get('genome_build')) - if current_build is None: - logger.error("Scorefile has no build information, " - "please add the build to the header with " - "('#genome_build=[insert variant build]") - raise Exception - - score = score.assign(genome_build=current_build) - - if (current_build != args.target_build) and (args.liftover is False): - logger.error( - f"Cannot combine {x} (build={h.get('genome_build')}) with target build {args.target_build} without liftover") - logger.error("Try running with --liftover and specifying the --chain_dir") - raise Exception - - if args.liftover: - 
logger.debug("Annotating scorefile with liftover parameters") - score = liftover(score, args.chain_dir, args.min_lift, args.target_build) - - if score.empty and (args.drop_missing is False): - logger.critical("Empty output score detected, something went wrong while combining") - raise Exception - - write_scorefile(score, args.outfile) - - # Build Score header logs - score_id = get_scorefile_basename(x) - score_header = score_logs[score_id] = {} - # Scoring file header information - for header in headers2logs: - header_val = h.get(header) - if (header in ['trait_efo', 'trait_mapped']) and (header_val is not None): - header_val = header_val.split('|') - score_header[header] = header_val - # Other header information - score_header['columns'] = list(score.columns) - score_header['use_liftover'] = False - if args.liftover: - score_header['use_liftover'] = True - # Harmonized header information - score_header['use_harmonised'] = use_harmonised - if use_harmonised: - score_header['sources'] = sorted(score['hm_source'].unique().tolist()) - for hm_header in headers2logs_harmonisation: - hm_header_val = h.get(hm_header) - if hm_header_val: - if hm_header.startswith('HmPOS_match'): - hm_header_val = json.loads(hm_header_val) - score_header[hm_header] = hm_header_val - if score_header['variants_number'] is None: - score_header['variants_number'] = score_shape_original[0] - - # Write Score header logs file - with open(json_logs_file, 'w') as fp: - json.dump(score_logs, fp, indent=4) + ScoringFile.write_combined(sfs, args.outfile) + end_time = time.time() + elapsed_time = end_time - start_time + print(f"Elapsed time: {elapsed_time} seconds") def _description_text() -> str: @@ -164,16 +56,21 @@ def _epilog_text() -> str: def _parse_args(args=None) -> argparse.Namespace: - parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(), + parser = argparse.ArgumentParser(description=_description_text(), + epilog=_epilog_text(), 
formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('-s', '--scorefiles', dest='scorefiles', nargs='+', - help=' Scorefile path (wildcard * is OK)', required=True) + help=' Scorefile path (wildcard * is OK)', + required=True) parser.add_argument('--liftover', dest='liftover', - help=' Convert scoring file variants to target genome build?', action='store_true') + help=' Convert scoring file variants to target genome build?', + action='store_true') parser.add_argument('-t', '--target_build', dest='target_build', - choices=['GRCh37', 'GRCh38'], help=' Build of target genome', + choices=['GRCh37', 'GRCh38'], + help=' Build of target genome', required=True) - parser.add_argument('-c', '--chain_dir', dest='chain_dir', help='Path to directory containing chain files', + parser.add_argument('-c', '--chain_dir', dest='chain_dir', + help='Path to directory containing chain files', required="--liftover" in sys.argv) parser.add_argument('-m', '--min_lift', dest='min_lift', help=' If liftover, minimum proportion of variants lifted over', diff --git a/pgscatalog_utils/scorefile/config.py b/pgscatalog_utils/scorefile/config.py new file mode 100644 index 0000000..8dee364 --- /dev/null +++ b/pgscatalog_utils/scorefile/config.py @@ -0,0 +1,9 @@ +from dataclasses import dataclass + + +@dataclass +class Config: + drop_missing: bool + liftover: bool + chain_dir: str + min_lift: float \ No newline at end of file diff --git a/pgscatalog_utils/scorefile/effect_type.py b/pgscatalog_utils/scorefile/effect_type.py deleted file mode 100644 index 50c8c73..0000000 --- a/pgscatalog_utils/scorefile/effect_type.py +++ /dev/null @@ -1,34 +0,0 @@ -import logging - -import pandas as pd - -logger = logging.getLogger(__name__) - - -def set_effect_type(df: pd.DataFrame) -> pd.DataFrame: - if {'is_recessive', 'is_dominant'}.issubset(df.columns): - _check_effect_types(df) - return (df.assign(additive=lambda x: ~x["is_recessive"] & ~x["is_dominant"]) - .assign(effect_type=lambda x: 
x[["is_recessive", "is_dominant", "additive"]].idxmax(1))) - else: - return _set_default_effect_type(df) - - -def _check_effect_types(df: pd.DataFrame): - """ Check that only one effect type is set per variant """ - bad_rows: pd.DataFrame = df[['is_dominant', 'is_recessive']].all(axis=1).any() - - error = ''' ERROR: Bad variants in scorefile - is_recessive and is_dominant columns are both TRUE for a variant - These columns are mutually exclusive (both can't be true) - However, both can be FALSE for additive variant scores - ''' - if bad_rows: - logger.error(error) - logger.error(bad_rows) - raise Exception - - -def _set_default_effect_type(df: pd.DataFrame, effect_type: str = "additive") -> pd.DataFrame: - logger.debug(f'No effect types set, using default ({effect_type})') - return df.assign(effect_type=effect_type) diff --git a/pgscatalog_utils/scorefile/effect_weight.py b/pgscatalog_utils/scorefile/effect_weight.py deleted file mode 100644 index 4b95e0f..0000000 --- a/pgscatalog_utils/scorefile/effect_weight.py +++ /dev/null @@ -1,49 +0,0 @@ -import logging -import re - -import pandas as pd - -logger = logging.getLogger(__name__) - - -def melt_effect_weights(df: pd.DataFrame) -> pd.DataFrame: - """ Ensure all dataframes are in long format, with one effect weight column and a score accession column """ - elongate = _detect_multiple_weight_columns(df) - - if elongate: - logger.debug("Melting effect weights") - return _melt(df) - else: - logger.debug("Skipping melt") - df['accession'] = df['filename'] - return df - - -def _detect_multiple_weight_columns(df: pd.DataFrame) -> bool: - """ Detect if multiple effect weight columns are present - - Single weight format: - | chr_name | chr_pos | effect_allele | effect_weight - - Multiple weight format: - | chr_name | chr_pos | effect_allele | effect_weight_score_1 | ... 
| effect_weight_score_n - """ - columns: list[re.match | None] = [re.search("^effect_weight$", x) for x in df.columns.to_list()] - columns_suffix: list[re.match | None] = [re.search("^effect_weight_[A-Za-z0-9]+$", x) for x - in df.columns.to_list()] - - if any([col for col in columns]): - logger.debug("Single effect weight column detected") - return False - elif any([col for col in columns_suffix]): - logger.debug("Multiple weight weight columns detected") - return True - else: - logger.error("ERROR: Missing valid effect weight columns") - raise Exception("Bad effect weights") - - -def _melt(df: pd.DataFrame) -> pd.DataFrame: - """ Melt a multiple effect weight format """ - ew_cols: list[str] = df.filter(regex="effect_weight_*").columns.to_list() - return df.melt(value_vars=ew_cols, value_name="effect_weight", var_name="accession") diff --git a/pgscatalog_utils/scorefile/genome_build.py b/pgscatalog_utils/scorefile/genome_build.py deleted file mode 100644 index 7ea4f09..0000000 --- a/pgscatalog_utils/scorefile/genome_build.py +++ /dev/null @@ -1,24 +0,0 @@ -import logging - -import pandas as pd - -logger = logging.getLogger(__name__) - - -def annotate_build(df: pd.DataFrame, target_build: str) -> pd.DataFrame: - """ Annotate the dataframe with genome build data """ - logger.debug(f"Annotating target build: {target_build}") - build_dict: dict = {'GRCh37': 'hg19', 'GRCh38': 'hg38', 'hg19': 'hg19', 'hg38': 'hg38'} # standardise build names - df['chain_target_build'] = build_dict[target_build] - df = df.assign(chain_genome_build=[build_dict[x] for x in df['genome_build']]) - return df - - -def build2GRC(build): - """Map build names so they can be compared with GRCh37 and 38""" - build_2_GRC_dict = {'GRCh37': 'GRCh37', 'GRCh38': 'GRCh38', 'hg19': 'GRCh37', - 'hg38': 'GRCh38'} # standardise build names - if pd.isnull(build): - return None - else: - return build_2_GRC_dict.get(build) diff --git a/pgscatalog_utils/scorefile/harmonised.py 
b/pgscatalog_utils/scorefile/harmonised.py deleted file mode 100644 index b56fb93..0000000 --- a/pgscatalog_utils/scorefile/harmonised.py +++ /dev/null @@ -1,30 +0,0 @@ -import logging -import re - -import pandas as pd - -logger = logging.getLogger(__name__) - - -def remap_harmonised(df: pd.DataFrame, use_harmonised) -> pd.DataFrame: - """ Replace original columns with harmonised data, if available and appropriate """ - - if any([re.match("hm_\\w+", x) for x in df.columns]) and use_harmonised: - logger.debug("Harmonised columns detected and used") - hm_colnames: dict[str: str] = {'hm_chr': 'chr_name', 'hm_pos': 'chr_position', - 'hm_inferOtherAllele': 'other_allele'} - - if 'other_allele' not in df or all(df['other_allele'].isnull()): - logger.debug("other_allele column contains no information, replacing with hm_inferOtherAllele") - return (df.drop(['chr_name', 'chr_position', 'other_allele'], axis=1, errors='ignore') - .rename(hm_colnames, axis=1)) - else: - logger.debug("other_allele column contains information, dropping hm_inferOtherAllele") - return (df.drop(['chr_name', 'chr_position', 'hm_inferOtherAllele'], axis=1, errors='ignore') - .rename(hm_colnames, axis=1)) - elif any([re.match("hm_\\w+", x) for x in df.columns]) and not use_harmonised: - logger.debug(f"Harmonised columns detected but not used (use_harmonised={use_harmonised})") - return df - else: - logger.debug("Harmonised columns not detected") - return df diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py new file mode 100644 index 0000000..7fc0e4e --- /dev/null +++ b/pgscatalog_utils/scorefile/header.py @@ -0,0 +1,80 @@ +import gzip +import pathlib +from dataclasses import dataclass + +from pgscatalog_utils.download.GenomeBuild import GenomeBuild + + +@dataclass +class ScoringFileHeader: + pgs_id: str + pgp_id: str + trait_efo: str + trait_reported: str + trait_mapped: str + pgs_name: str + genome_build: GenomeBuild + HmPOS_build: GenomeBuild + 
variants_number: int + format_version: str + citation: str + + def __post_init__(self): + self.variants_number = int(self.variants_number) + self.genome_build = GenomeBuild.from_string(self.genome_build) + if self.HmPOS_build: + self.HmPOS_build = GenomeBuild.from_string(self.HmPOS_build) + + if self.format_version != '2.0': + raise Exception("Only support v2 format") + + @classmethod + def from_path(cls, path: pathlib.Path): + raw_header: dict = raw_header_to_dict(read_header(path)) + # only keep keys needed by class (intersect) + keep_keys = ScoringFileHeader.__annotations__.keys() + header_dict = {k: raw_header[k] for k in raw_header.keys() & keep_keys} + # ... so we can unpack the dict into a dataclass + + if len(header_dict) > 1 and 'HmPOS_build' not in header_dict: + # working with pgs catalog formatted header but unharmonised data + header_dict['HmPOS_build'] = None + + if header_dict: + return ScoringFileHeader(**header_dict) + else: + # no header available + return None + + +def raw_header_to_dict(header): + d = {} + for item in header: + key, value = item.split('=') + d[key[1:]] = value # drop # character from key + return d + + +def read_header(path: pathlib.Path): + """Parses the header of a PGS Catalog format scorefile into a dictionary""" + open_function = auto_open(path) + with open_function(path, 'rt') as f: + yield from _gen_header_lines(f) + + +def _gen_header_lines(f): + for line in f: + if line.startswith('#'): + if '=' in line: + yield line.strip() + else: + # stop reading lines + break + + +def auto_open(filepath): + with open(filepath, 'rb') as test_f: + if test_f.read(2) == b'\x1f\x8b': + return gzip.open + else: + return open diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py deleted file mode 100644 index 45258b1..0000000 --- a/pgscatalog_utils/scorefile/liftover.py +++ /dev/null @@ -1,103 +0,0 @@ -import logging -import os - -import pandas as pd -import pyliftover - -from 
pgscatalog_utils.scorefile.genome_build import annotate_build - -logger = logging.getLogger(__name__) - - -def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: str) -> pd.DataFrame: - """ Liftover genomic coordinates to a different genome build """ - df = annotate_build(df, target_build) # get chain_target_build (e.g. in hg notation to match chain files) - - mapped, unmapped = pd.DataFrame(), pd.DataFrame() - no_liftover: pd.DataFrame = df.query('chain_target_build == chain_genome_build') - to_liftover: pd.DataFrame = df.query('chain_target_build != chain_genome_build') - - if no_liftover.empty: - logger.debug("Liftover required for all scorefile variants") - else: - logger.debug("Skipping liftover for scorefiles with same build as target genome") - no_liftover.loc[:, ['lifted_chr', 'lifted_pos']] = no_liftover[ - ['chr_name', 'chr_position']] # assume col structure - no_liftover.assign(liftover=None) - - if to_liftover.empty: - logger.debug("Liftover skipped because no variants required it") - else: - lo: dict[str, pyliftover.LiftOver] = _create_liftover(chain_dir) # loads chain files - logger.debug("Lifting over scoring files") - lifted: pd.DataFrame = to_liftover.apply(_convert_coordinates, axis=1, lo_dict=lo) - to_liftover = pd.concat([to_liftover, lifted], axis=1) - logger.debug("Liftover complete") - - mapped: pd.DataFrame = (to_liftover[~to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] - .assign(liftover=True)) - unmapped: pd.DataFrame = (to_liftover[to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] \ - .assign(liftover=False)) - _check_min_liftover(mapped, unmapped, min_lift) - - return pd.concat([mapped, unmapped, no_liftover]) - - -def _check_min_liftover(mapped: pd.DataFrame, unmapped: pd.DataFrame, min_lift: float) -> None: - """ Check that liftover process met minimum parameters""" - df = pd.concat([mapped, unmapped]) - n_variants: pd.DataFrame = 
(pd.DataFrame(df.groupby('accession')['liftover'].count()) - .reset_index() - .rename({'liftover': 'n_var'}, axis=1)) - lo_counts = (pd.DataFrame(df.groupby(['accession', 'liftover'])['liftover'].count()) \ - .rename_axis(['accession', 'liftover_status']) - .reset_index()) - summary: pd.DataFrame = lo_counts.merge(n_variants, on='accession') - summary['proportion'] = summary['liftover'] / summary['n_var'] - - for row in summary.query('liftover_status == True')[['accession', 'proportion']].itertuples(): - if row.proportion < min_lift: - logger.error(f'Liftover failed for scorefile {row.accession}') - logger.error(f'{row.proportion} of variants lifted over, less than min_lift parameter ({min_lift})') - raise Exception - else: - logger.debug(f'Minimum liftover threshold passed for scorefile {row.accession}') - - -def _convert_coordinates(df: pd.Series, lo_dict: dict[str, pyliftover.LiftOver]) -> pd.Series: - """ Convert genomic coordinates to different build """ - converted: list[tuple[str, int, str, int]] | None - - if df[['chr_name', 'chr_position']].isnull().values.any(): - converted = None - else: - lo = lo_dict[df['chain_genome_build'] + df['chain_target_build']] # extract lo object from dict - chrom: str = 'chr' + str(df['chr_name']) - pos: int = int(df['chr_position']) - 1 # liftOver is 0 indexed, VCF is 1 indexed - # converted example: [('chr22', 15460378, '+', 3320966530)] or None - converted = lo.convert_coordinate(chrom, pos) - - if converted: - lifted_chrom: str = _parse_lifted_chrom(converted[0][0][3:]) # return first matching liftover - lifted_pos: int = int(converted[0][1]) + 1 # reverse 0 indexing - return pd.Series([lifted_chrom, lifted_pos], index=['lifted_chr', 'lifted_pos']) - else: - return pd.Series([None, None], index=['lifted_chr', 'lifted_pos']) - - -def _parse_lifted_chrom(i: str) -> str: - """ Convert lifted chromosomes to tidy integers - - liftover needs chr suffix for chromosome input (1 -> chr1), and it also - returns weird chromosomes 
sometimes (chr22 -> 22_KI270879v1_alt) - """ - return i.split('_')[0] - - -def _create_liftover(chain_dir: str) -> dict['str': pyliftover.LiftOver]: - """ Create LiftOver objects that can remap genomic coordinates """ - builds: list[str] = ["hg19hg38", "hg38hg19"] - chains: list[str] = [os.path.join(chain_dir, x) for x in ["hg19ToHg38.over.chain.gz", "hg38ToHg19.over.chain.gz"]] - lo: list[pyliftover.LiftOver] = [pyliftover.LiftOver(x) for x in chains] - logger.debug("Chain files loaded for liftover") - return dict(zip(builds, lo)) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 68e511c..d40808f 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -1,92 +1,29 @@ import logging -import pandas as pd - logger = logging.getLogger(__name__) - -def quality_control(df: pd.DataFrame, drop_missing: bool) -> pd.DataFrame: - """ Do quality control checks on a scorefile """ - _check_shape(df) - _check_columns(df) - logger.debug("Quality control: checking for bad variants") - if drop_missing is True: - return (df.pipe(_drop_hla) - .pipe(_drop_missing_variants) - .pipe(_check_duplicate_identifiers) - .pipe(_drop_multiple_oa)) - else: - return (df.pipe(_check_duplicate_identifiers) - .pipe(_drop_multiple_oa)) - - -def _drop_multiple_oa(df: pd.DataFrame) -> pd.DataFrame: - """ Set alleles to None in hm_inferOtherAllele if they contain multiple alleles - - e.g. 
A / C / T -> None; A -> A; A / C -> None - """ - if 'other_allele' in df: - if df['other_allele'].str.contains('/').any(): - logger.debug("Multiple inferred other alleles detected, dropping other alleles for ambiguous variants") - df['other_allele'] = df['other_allele'].replace(regex='.+\\/.+', value=None) - return df +def drop_hla(variants): + logger.info("Checking for HLA alleles") + for variant in variants: + if variant['effect_allele'] != 'P' or variant['effect_allele'] != 'N': + yield variant else: - logger.debug("Only single other alleles detected.") - return df - else: - logger.warning("No other allele data detected, skipping QC of other allele") - return df - - -def _drop_missing_variants(df: pd.DataFrame) -> pd.DataFrame: - no_na: pd.DataFrame = df.dropna(subset=['chr_name', 'chr_position', 'effect_weight']) - n_dropped = df.shape[0] - no_na.shape[0] - - if n_dropped > 0: - logger.warning(f"{n_dropped} variants with missing values detected and dropped from scoring file") - - return no_na - - -def _drop_hla(df: pd.DataFrame) -> pd.DataFrame: - """ Drop HLA effect alleles with present / absent encoding """ - - no_hla: pd.DataFrame = df.query('effect_allele != "P" | effect_allele != "N"') - - if df.shape[0] > no_hla.shape[0]: - logger.debug("HLA alleles detected and dropped") - - return no_hla - - -def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: - if 'other_allele' in df: - logger.debug("Other allele column detected, including other_allele in variant identifier") - group_cols = ['chr_name', 'chr_position', 'effect_allele', 'other_allele'] - else: - logger.warning("Other allele column not detected, dropping other_allele from variant identifier.") - group_cols = ['chr_name', 'chr_position', 'effect_allele'] - - u_count: pd.Series = df.groupby(group_cols).size() - - if all(u_count == 1): - return df.assign(is_duplicated=False) - else: - logger.warning("Duplicate variants in scoring file: {}".format(df['filename_prefix'].unique())) - u_count = 
u_count > 1 - u_count.name = 'is_duplicated' - df = pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) - df.loc[df.is_duplicated.isnull(), 'is_duplicated'] = False # handles variants with null chr/pos - return df + logger.warning("HLA alleles detected and dropped") -def _check_shape(df: pd.DataFrame) -> None: - assert len(df.columns) > 1, "ERROR: scorefile not formatted correctly (0 columns)" - assert df.shape[0] > 0, "ERROR: No variants detected in input file (0 rows)" +def check_effect_weight(variants): + # don't actually use converted value + for variant in variants: + try: + float(variant['effect_weight']) + except ValueError: + logger.critical(f"{variant} has bad effect weight") + raise ValueError + yield variant -def _check_columns(df: pd.DataFrame) -> None: - assert {'chr_name', 'chr_position'}.issubset(df.columns), "Missing chromosomal positions. If you're " \ - "using PGS Catalog files with rsIDs you should request " \ - "harmonised data files (HmPOS) instead." 
- assert 'effect_allele' in df, "ERROR: Missing effect allele column" +def assign_other_allele(variants): + for variant in variants: + if 'other_allele' not in variant: + variant['other_allele'] = None + yield variant diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py deleted file mode 100644 index dbd559b..0000000 --- a/pgscatalog_utils/scorefile/read.py +++ /dev/null @@ -1,79 +0,0 @@ -import gzip -import io -import logging -import os - -import pandas as pd - -logger = logging.getLogger(__name__) - - -def load_scorefile(path: str) -> tuple[dict, pd.DataFrame]: - logger.debug(f'Reading scorefile {path}') - df = pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) - return (_read_header(path), - df.assign(filename_prefix=get_scorefile_basename(path), filename=path, row_nr=df.index)) - - -def _read_header(path: str) -> dict: - """Parses the header of a PGS Catalog format scorefle into a dictionary""" - f = io.TextIOWrapper(gzip.open(path, 'r')) - try: - f.readline() - except gzip.BadGzipFile: - f = open(path, 'r') - - header = {} - lastline = '#' - while lastline.startswith('#'): - lastline = f.readline() - line = lastline.strip() - if line.startswith('#'): - if '=' in line: - line = line[1:].split('=') - field, val = [x.strip() for x in line] - if field in remap_header: - header[remap_header[field]] = val - else: - header[field] = val - - if ('genome_build' in header) and (header['genome_build'] == 'NR'): - header['genome_build'] = None - f.close() - return header - - -def _scorefile_dtypes() -> dict[str]: - """ Data types for columns that might be found in a scorefile """ - return {'rsID': str, 'chr_name': str, 'chr_position': pd.UInt64Dtype(), 'effect_allele': 'str', - 'effect_weight': float, 'locus_name': str, 'OR': float, 'hm_source': str, 'hm_rsID': str, - 'hm_chr': str, 'hm_pos': pd.UInt64Dtype(), 'hm_inferOtherAllele': str} - - -def get_scorefile_basename(path: str) -> str: - """ 
Return the basename of a scoring file without extension """ - filename = os.path.basename(path) - if filename.endswith('.txt.gz'): - filename = filename.replace('.txt.gz', '') - elif filename.endswith('.txt'): - filename = filename.replace('.txt', '') - return filename - - -remap_header = { - 'PGS ID': 'pgs_id', - 'PGS Name': 'pgs_name', - 'Reported Trait': 'trait_reported', - 'Original Genome Build': 'genome_build', - 'Number of Variants': 'variants_number', - 'PGP ID': 'pgp_id', - 'Citation': 'citation', - 'LICENSE': 'license', - # Harmonization related - 'HmPOS Build': 'HmPOS_build', - 'HmPOS Date': 'HmPOS_date', - 'HmVCF Reference': 'HmVCF_ref', - 'HmVCF Date': 'HmVCF_date', - 'HmVCF N Matched Variants': 'HmVCF_n_matched', - 'HmVCF N Unmapped Variants': 'HmVCF_n_unmapped' -} # Used to maintain reverse compatibility to old scoring files diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py new file mode 100644 index 0000000..7f9c864 --- /dev/null +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -0,0 +1,144 @@ +import csv +import gzip +import logging +import os +import pathlib +import typing +from dataclasses import dataclass + +from pgscatalog_utils.download.GenomeBuild import GenomeBuild +from pgscatalog_utils.scorefile.header import ScoringFileHeader, auto_open +from pgscatalog_utils.scorefile.qc import drop_hla, check_effect_weight, \ + assign_other_allele + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +@dataclass +class ScoringFile: + path: pathlib.Path + name: str + header: typing.Union[ScoringFileHeader, None] + genome_build: typing.Union[GenomeBuild, None] + harmonised: bool + fields: list[str] + variants: typing.Generator + + def __post_init__(self): + if self.header.HmPOS_build: + logger.info( + f"{self.path} harmonised data detected: {self.header.HmPOS_build}") + self.genome_build = self.header.HmPOS_build + + mandatory_columns = {'chr_name', 'effect_allele', 
'effect_weight'} + if not mandatory_columns.issubset(self.fields) not in self.fields: + err_msg = f"{self.path} missing fields" + raise Exception(err_msg) + + @classmethod + def from_path(cls, path: pathlib.Path): + header = ScoringFileHeader.from_path(path) + if header: + name = header.pgs_id + if header.HmPOS_build: + harmonised = True + genome_build = header.HmPOS_build + else: + harmonised = False + genome_build = header.genome_build + else: + harmonised = False + genome_build = None + name = os.path.basename(path).split('.')[0] + + start_line, cols = get_columns(path) + + # generate variants (a list of dicts, one for each variants) + variants = ScoringFile.read_variants(path=path, start_line=start_line, + fields=cols, name=name) + + # note: these generator expressions aren't doing a bunch of iterations + # it's just a data processing pipeline + variants = remap_harmonised(variants) + + # quality control + variants = drop_hla(variants) + variants = check_effect_weight(variants) + variants = assign_other_allele(variants) + + return cls(path=path, header=header, genome_build=genome_build, + harmonised=harmonised, + fields=cols, + variants=variants, + name=name) + + @staticmethod + def read_variants(path, fields, start_line, name: str): + open_function = auto_open(path) + with open_function(path, 'rt') as f: + logger.info(f"Generating variants from {path}") + csv_reader = csv.reader(f, delimiter='\t') + for i, row in enumerate(csv_reader): + if i > start_line: + variant = dict(zip(fields, row)) | {'name': name} + keys = ["chr_name", "chr_position", "effect_allele", "other_allele", + "effect_weight", "hm_chr", "hm_pos", "hm_inferOtherAllele", + "name", "is_dominant", "is_recessive"] + yield {k: variant[k] for k in keys if k in variant} + + @staticmethod + def write_combined(scoring_files, out_path): + if out_path.endswith("gz"): + open_function = gzip.open + else: + open_function = open + + with open_function(out_path, 'wt') as f: + fieldnames = ["name", 
"chr_name", "chr_position", "effect_allele", + "other_allele", + "effect_weight"] + writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t') + writer.writeheader() + + # write out in chunks for compression efficiency and speed + chunk_size = 10000 + chunk = [] + for scoring_file in scoring_files: + logger.info(f"Writing variants from scoring file {scoring_file}") + for variant in scoring_file.variants: + chunk.append(variant) + if len(chunk) == chunk_size: + writer.writerows(chunk) + chunk = [] + # handle last chunk + if chunk: + writer.writerows(chunk) + + +def remap_harmonised(variants): + logger.info("Using harmonised data if available") + for variant in variants: + # _always_ use harmonised information, even if missing + if 'hm_chr' in variant: + variant['chr_name'] = variant['hm_chr'] + + if 'hm_pos' in variant: + variant['chr_position'] = variant['hm_pos'] + + if 'hm_inferOtherAllele' in variant and variant.get('other_allele') is None: + logger.warning("Replacing missing other_allele with inferred other allele") + variant['other_allele'] = variant['hm_inferOtherAllele'] + + yield {k: v for k, v in variant.items() if not k.startswith("hm")} + + +def get_columns(path) -> tuple[int, list[str]]: + open_function = auto_open(path) + with open_function(path, 'rt') as f: + for i, line in enumerate(f): + if line.startswith('#'): + continue + return i, line.strip().split('\t') + + diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py deleted file mode 100644 index 8a3233b..0000000 --- a/pgscatalog_utils/scorefile/write.py +++ /dev/null @@ -1,43 +0,0 @@ -import logging -import os - -import pandas as pd - -logger = logging.getLogger(__name__) - - -def write_scorefile(df: pd.DataFrame, path: str) -> None: - cols: list[str] = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', - 'is_duplicated', 'accession', 'row_nr'] - - if os.path.exists(path): - logger.debug("Output file exists: setting 
write mode to append") - write_mode = 'a' - header = False - else: - logger.debug("Output file doesn't exist: setting write mode to write (create new file)") - write_mode = 'w' - header = True - - out_df: pd.DataFrame = (df.drop('accession', axis=1) - .rename({'filename_prefix': 'accession'}, axis=1) - .pipe(_filter_failed_liftover)) - - if 'other_allele' not in out_df: - logger.warning("No other allele information detected, writing out as missing data") - out_df['other_allele'] = None - - if path.endswith('.gz'): - logger.debug("Writing out gzip-compressed combined scorefile") - out_df[cols].to_csv(path, index=False, sep="\t", compression='gzip', mode=write_mode, header=header) - else: - logger.debug("Writing out combined scorefile") - out_df[cols].to_csv(path, index=False, sep="\t", mode=write_mode, header=header) - - -def _filter_failed_liftover(df: pd.DataFrame) -> pd.DataFrame: - if 'liftover' in df: - logger.debug("Filtering variants that failed liftover") - return df.query('liftover == True') - else: - return df From 65b2b798e8187e5de7b478a91c4c28db591c4945 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 30 Oct 2023 17:16:14 +0000 Subject: [PATCH 02/40] set up effect types --- pgscatalog_utils/scorefile/qc.py | 46 ++++++++++++++++++++++- pgscatalog_utils/scorefile/scoringfile.py | 31 +++------------ 2 files changed, 50 insertions(+), 27 deletions(-) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index d40808f..2e51008 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -1,5 +1,6 @@ import logging + logger = logging.getLogger(__name__) def drop_hla(variants): @@ -12,10 +13,10 @@ def drop_hla(variants): def check_effect_weight(variants): - # don't actually use converted value + logger.info("Checking effect weights") for variant in variants: try: - float(variant['effect_weight']) + variant['effect_weight'] = float(variant['effect_weight']) except ValueError: 
logger.critical(f"{variant} has bad effect weight") raise ValueError @@ -27,3 +28,44 @@ def assign_other_allele(variants): if 'other_allele' not in variant: variant['other_allele'] = None yield variant + +def assign_effect_type(variants): + logger.info("Assigning effect types") + for variant in variants: + if 'is_recessive' not in variant and 'is_dominant' not in variant: + variant['effect_type'] = 'additive' + + if 'is_recessive' in variant or 'is_dominant' in variant: + logger.info("Recessive or dominant variant detected") + if variant['is_recessive']: + variant['effect_type'] = 'recessive' + elif variant['is_dominant']: + variant['effect_type'] = 'dominant' + elif variant['is_recessive'] and variant['is_dominant']: + logger.critical(f"Bad effect type setting: {variant}") + raise Exception + + variant.pop('is_recessive') + variant.pop('is_dominant') + + yield variant + + +def remap_harmonised(variants, harmonised: bool): + if harmonised: + logger.info("Using harmonised data fields") + else: + logger.info("Harmonised data fields not available") + + for variant in variants: + if harmonised: + variant['chr_name'] = variant['hm_chr'] + variant['chr_position'] = variant['hm_pos'] + + if 'hm_inferOtherAllele' in variant and variant.get('other_allele') is None: + logger.debug("Replacing missing other_allele with inferred other allele") + variant['other_allele'] = variant['hm_inferOtherAllele'] + + yield {k: v for k, v in variant.items() if not k.startswith("hm")} + else: + yield variant diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index 7f9c864..db573e9 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -9,7 +9,7 @@ from pgscatalog_utils.download.GenomeBuild import GenomeBuild from pgscatalog_utils.scorefile.header import ScoringFileHeader, auto_open from pgscatalog_utils.scorefile.qc import drop_hla, check_effect_weight, \ - assign_other_allele + 
assign_other_allele, assign_effect_type, remap_harmonised logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -55,15 +55,15 @@ def from_path(cls, path: pathlib.Path): start_line, cols = get_columns(path) # generate variants (a list of dicts, one for each variants) + logger.info(f"Lazily reading variants from {path}") variants = ScoringFile.read_variants(path=path, start_line=start_line, fields=cols, name=name) # note: these generator expressions aren't doing a bunch of iterations # it's just a data processing pipeline - variants = remap_harmonised(variants) - - # quality control + variants = remap_harmonised(variants, harmonised) variants = drop_hla(variants) + variants = assign_effect_type(variants) variants = check_effect_weight(variants) variants = assign_other_allele(variants) @@ -77,7 +77,6 @@ def from_path(cls, path: pathlib.Path): def read_variants(path, fields, start_line, name: str): open_function = auto_open(path) with open_function(path, 'rt') as f: - logger.info(f"Generating variants from {path}") csv_reader = csv.reader(f, delimiter='\t') for i, row in enumerate(csv_reader): if i > start_line: @@ -96,8 +95,7 @@ def write_combined(scoring_files, out_path): with open_function(out_path, 'wt') as f: fieldnames = ["name", "chr_name", "chr_position", "effect_allele", - "other_allele", - "effect_weight"] + "other_allele", "effect_weight", "effect_type"] writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t') writer.writeheader() @@ -105,7 +103,7 @@ def write_combined(scoring_files, out_path): chunk_size = 10000 chunk = [] for scoring_file in scoring_files: - logger.info(f"Writing variants from scoring file {scoring_file}") + logger.info(f"Writing {scoring_file.name} variants") for variant in scoring_file.variants: chunk.append(variant) if len(chunk) == chunk_size: @@ -116,23 +114,6 @@ def write_combined(scoring_files, out_path): writer.writerows(chunk) -def remap_harmonised(variants): - logger.info("Using harmonised data 
if available") - for variant in variants: - # _always_ use harmonised information, even if missing - if 'hm_chr' in variant: - variant['chr_name'] = variant['hm_chr'] - - if 'hm_pos' in variant: - variant['chr_position'] = variant['hm_pos'] - - if 'hm_inferOtherAllele' in variant and variant.get('other_allele') is None: - logger.warning("Replacing missing other_allele with inferred other allele") - variant['other_allele'] = variant['hm_inferOtherAllele'] - - yield {k: v for k, v in variant.items() if not k.startswith("hm")} - - def get_columns(path) -> tuple[int, list[str]]: open_function = auto_open(path) with open_function(path, 'rt') as f: From 227f32f31611b03f81596908ae01d669cbd1e17f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 31 Oct 2023 14:14:42 +0000 Subject: [PATCH 03/40] profiling improvements --- .../scorefile/combine_scorefiles.py | 10 +++- pgscatalog_utils/scorefile/config.py | 4 +- pgscatalog_utils/scorefile/header.py | 14 ++++- pgscatalog_utils/scorefile/qc.py | 22 ++++---- pgscatalog_utils/scorefile/scoringfile.py | 56 ++++++------------- pgscatalog_utils/scorefile/write.py | 42 ++++++++++++++ 6 files changed, 92 insertions(+), 56 deletions(-) create mode 100644 pgscatalog_utils/scorefile/write.py diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 7dd5fc8..82ed66e 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -6,7 +6,9 @@ from pgscatalog_utils.config import set_logging_level from pgscatalog_utils.download.GenomeBuild import GenomeBuild +from pgscatalog_utils.scorefile.config import Config from pgscatalog_utils.scorefile.scoringfile import ScoringFile +from pgscatalog_utils.scorefile.write import write_combined def combine_scorefiles(): @@ -15,6 +17,9 @@ def combine_scorefiles(): logger = logging.getLogger(__name__) set_logging_level(args.verbose) + Config.threads = args.threads + 
Config.batch_size = 20000 + paths: list[str] = list(set(args.scorefiles)) # unique paths only logger.debug(f"Input scorefiles: {paths}") @@ -29,7 +34,8 @@ def combine_scorefiles(): else: logger.info(f"All builds match target build {target_build}") - ScoringFile.write_combined(sfs, args.outfile) + write_combined(sfs, args.outfile) + end_time = time.time() elapsed_time = end_time - start_time print(f"Elapsed time: {elapsed_time} seconds") @@ -75,6 +81,8 @@ def _parse_args(args=None) -> argparse.Namespace: parser.add_argument('-m', '--min_lift', dest='min_lift', help=' If liftover, minimum proportion of variants lifted over', required="--liftover" in sys.argv, default=0.95, type=float) + parser.add_argument('--threads', dest='threads', required=False, + default=1, type=int) parser.add_argument('--drop_missing', dest='drop_missing', action='store_true', help=' Drop variants with missing information (chr/pos) and ' 'non-standard alleles (e.g. HLA=P/N) from the output file.') diff --git a/pgscatalog_utils/scorefile/config.py b/pgscatalog_utils/scorefile/config.py index 8dee364..10bc5b3 100644 --- a/pgscatalog_utils/scorefile/config.py +++ b/pgscatalog_utils/scorefile/config.py @@ -3,7 +3,9 @@ @dataclass class Config: + threads: int drop_missing: bool liftover: bool chain_dir: str - min_lift: float \ No newline at end of file + min_lift: float + batch_size: int \ No newline at end of file diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py index 7fc0e4e..bf9447b 100644 --- a/pgscatalog_utils/scorefile/header.py +++ b/pgscatalog_utils/scorefile/header.py @@ -2,6 +2,9 @@ import pathlib from dataclasses import dataclass +from pgscatalog_utils.scorefile.config import Config +from pgzip import pgzip + from pgscatalog_utils.download.GenomeBuild import GenomeBuild @@ -75,6 +78,13 @@ def _gen_header_lines(f): def auto_open(filepath): with open(filepath, 'rb') as test_f: if test_f.read(2) == b'\x1f\x8b': - return gzip.open + gzipped = True 
else: - return open + gzipped = False + + if gzipped and Config.threads > 1: + return gzip.open + elif gzipped: + return pgzip.open + elif not gzipped: + return open \ No newline at end of file diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 2e51008..69eb660 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -3,8 +3,16 @@ logger = logging.getLogger(__name__) +def quality_control(variants, harmonised): + variants = remap_harmonised(variants, harmonised) + variants = drop_hla(variants) + variants = assign_effect_type(variants) + variants = check_effect_weight(variants) + variants = assign_other_allele(variants) + return variants + + def drop_hla(variants): - logger.info("Checking for HLA alleles") for variant in variants: if variant['effect_allele'] != 'P' or variant['effect_allele'] != 'N': yield variant @@ -13,7 +21,6 @@ def drop_hla(variants): def check_effect_weight(variants): - logger.info("Checking effect weights") for variant in variants: try: variant['effect_weight'] = float(variant['effect_weight']) @@ -30,7 +37,6 @@ def assign_other_allele(variants): yield variant def assign_effect_type(variants): - logger.info("Assigning effect types") for variant in variants: if 'is_recessive' not in variant and 'is_dominant' not in variant: variant['effect_type'] = 'additive' @@ -45,18 +51,10 @@ def assign_effect_type(variants): logger.critical(f"Bad effect type setting: {variant}") raise Exception - variant.pop('is_recessive') - variant.pop('is_dominant') - yield variant def remap_harmonised(variants, harmonised: bool): - if harmonised: - logger.info("Using harmonised data fields") - else: - logger.info("Harmonised data fields not available") - for variant in variants: if harmonised: variant['chr_name'] = variant['hm_chr'] @@ -66,6 +64,6 @@ def remap_harmonised(variants, harmonised: bool): logger.debug("Replacing missing other_allele with inferred other allele") variant['other_allele'] = 
variant['hm_inferOtherAllele'] - yield {k: v for k, v in variant.items() if not k.startswith("hm")} + yield variant else: yield variant diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index db573e9..54fed17 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -1,15 +1,16 @@ import csv -import gzip import logging import os import pathlib import typing from dataclasses import dataclass +from itertools import islice + +from pgscatalog_utils.scorefile.config import Config from pgscatalog_utils.download.GenomeBuild import GenomeBuild from pgscatalog_utils.scorefile.header import ScoringFileHeader, auto_open -from pgscatalog_utils.scorefile.qc import drop_hla, check_effect_weight, \ - assign_other_allele, assign_effect_type, remap_harmonised +from pgscatalog_utils.scorefile.qc import quality_control logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -61,11 +62,7 @@ def from_path(cls, path: pathlib.Path): # note: these generator expressions aren't doing a bunch of iterations # it's just a data processing pipeline - variants = remap_harmonised(variants, harmonised) - variants = drop_hla(variants) - variants = assign_effect_type(variants) - variants = check_effect_weight(variants) - variants = assign_other_allele(variants) + variants = quality_control(variants, harmonised) return cls(path=path, header=header, genome_build=genome_build, harmonised=harmonised, @@ -77,42 +74,23 @@ def from_path(cls, path: pathlib.Path): def read_variants(path, fields, start_line, name: str): open_function = auto_open(path) with open_function(path, 'rt') as f: - csv_reader = csv.reader(f, delimiter='\t') - for i, row in enumerate(csv_reader): - if i > start_line: + for _ in range(start_line + 1): + # skip header + next(f) + + while True: + batch = list(islice(f, Config.batch_size)) + if not batch: + break + + csv_reader = csv.reader(batch, delimiter='\t') + for i, 
row in enumerate(csv_reader): variant = dict(zip(fields, row)) | {'name': name} keys = ["chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "hm_chr", "hm_pos", "hm_inferOtherAllele", "name", "is_dominant", "is_recessive"] yield {k: variant[k] for k in keys if k in variant} - @staticmethod - def write_combined(scoring_files, out_path): - if out_path.endswith("gz"): - open_function = gzip.open - else: - open_function = open - - with open_function(out_path, 'wt') as f: - fieldnames = ["name", "chr_name", "chr_position", "effect_allele", - "other_allele", "effect_weight", "effect_type"] - writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t') - writer.writeheader() - - # write out in chunks for compression efficiency and speed - chunk_size = 10000 - chunk = [] - for scoring_file in scoring_files: - logger.info(f"Writing {scoring_file.name} variants") - for variant in scoring_file.variants: - chunk.append(variant) - if len(chunk) == chunk_size: - writer.writerows(chunk) - chunk = [] - # handle last chunk - if chunk: - writer.writerows(chunk) - def get_columns(path) -> tuple[int, list[str]]: open_function = auto_open(path) @@ -121,5 +99,3 @@ def get_columns(path) -> tuple[int, list[str]]: if line.startswith('#'): continue return i, line.strip().split('\t') - - diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py new file mode 100644 index 0000000..df17bdc --- /dev/null +++ b/pgscatalog_utils/scorefile/write.py @@ -0,0 +1,42 @@ +import csv +import functools +import gzip +import logging +from itertools import islice + +import pgzip + +from pgscatalog_utils.scorefile.config import Config +from pgscatalog_utils.scorefile.scoringfile import ScoringFile + +logger = logging.getLogger(__name__) + + +def write_combined(scoring_files: list[ScoringFile], out_path: str): + # compresslevel can be really slow, default is 9 + if out_path.endswith("gz") and Config.threads == 1: + logger.info("Writing with gzip 
(slow)") + open_function = functools.partial(gzip.open, compresslevel=6) + elif Config.threads > 1: + logger.info("Writing with pgzip (fast)") + open_function = functools.partial(pgzip.open, compresslevel=6, + thread=Config.threads, blocksize=2 * 10 ** 8) + else: + logger.info("Writing text file (fast)") + open_function = open + + with open_function(out_path, mode='wt') as f: + fieldnames = ["name", "chr_name", "chr_position", "effect_allele", + "other_allele", "effect_weight", "effect_type"] + writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t', + extrasaction='ignore') + writer.writeheader() + + # write out in batches for compression efficiency and speed + for scoring_file in scoring_files: + logger.info(f"Writing {scoring_file.name} variants") + while True: + batch = list(islice(scoring_file.variants, Config.batch_size)) + if not batch: + break + writer.writerows(batch) From 1349d0aff27fe430b5f99685201ba7fc70c43336 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 31 Oct 2023 14:58:45 +0000 Subject: [PATCH 04/40] fix output --- pgscatalog_utils/scorefile/combine_scorefiles.py | 2 +- pgscatalog_utils/scorefile/header.py | 5 +++-- pgscatalog_utils/scorefile/scoringfile.py | 15 +++++++-------- pgscatalog_utils/scorefile/write.py | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 82ed66e..6650459 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -27,7 +27,7 @@ def combine_scorefiles(): sfs = [ScoringFile.from_path(x) for x in paths] target_build = GenomeBuild.from_string(args.target_build) - bad_builds = [x.name for x in sfs if x.genome_build != target_build] + bad_builds = [x.accession for x in sfs if x.genome_build != target_build] for bad_file in bad_builds: logger.critical(f"{bad_file} doesn't match {target_build}, can't combine") raise 
Exception diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py index bf9447b..78259c6 100644 --- a/pgscatalog_utils/scorefile/header.py +++ b/pgscatalog_utils/scorefile/header.py @@ -1,3 +1,4 @@ +import functools import gzip import pathlib from dataclasses import dataclass @@ -83,8 +84,8 @@ def auto_open(filepath): gzipped = False if gzipped and Config.threads > 1: - return gzip.open + return functools.partial(pgzip.open, thread=Config.threads) elif gzipped: - return pgzip.open + return gzip.open elif not gzipped: return open \ No newline at end of file diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index 54fed17..4122022 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -19,7 +19,7 @@ @dataclass class ScoringFile: path: pathlib.Path - name: str + accession: str header: typing.Union[ScoringFileHeader, None] genome_build: typing.Union[GenomeBuild, None] harmonised: bool @@ -40,8 +40,8 @@ def __post_init__(self): @classmethod def from_path(cls, path: pathlib.Path): header = ScoringFileHeader.from_path(path) + name = os.path.basename(path).split('.')[0] if header: - name = header.pgs_id if header.HmPOS_build: harmonised = True genome_build = header.HmPOS_build @@ -51,7 +51,6 @@ def from_path(cls, path: pathlib.Path): else: harmonised = False genome_build = None - name = os.path.basename(path).split('.')[0] start_line, cols = get_columns(path) @@ -68,12 +67,12 @@ def from_path(cls, path: pathlib.Path): harmonised=harmonised, fields=cols, variants=variants, - name=name) + accession=name) @staticmethod def read_variants(path, fields, start_line, name: str): open_function = auto_open(path) - with open_function(path, 'rt') as f: + with open_function(path, mode='rt') as f: for _ in range(start_line + 1): # skip header next(f) @@ -85,16 +84,16 @@ def read_variants(path, fields, start_line, name: str): csv_reader = csv.reader(batch, 
delimiter='\t') for i, row in enumerate(csv_reader): - variant = dict(zip(fields, row)) | {'name': name} + variant = dict(zip(fields, row)) | {'accession': name, "row_nr": i } keys = ["chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "hm_chr", "hm_pos", "hm_inferOtherAllele", - "name", "is_dominant", "is_recessive"] + "is_dominant", "is_recessive", "accession", "row_nr"] yield {k: variant[k] for k in keys if k in variant} def get_columns(path) -> tuple[int, list[str]]: open_function = auto_open(path) - with open_function(path, 'rt') as f: + with open_function(path, mode='rt') as f: for i, line in enumerate(f): if line.startswith('#'): continue diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index df17bdc..9345a87 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -26,15 +26,15 @@ def write_combined(scoring_files: list[ScoringFile], out_path: str): open_function = open with open_function(out_path, mode='wt') as f: - fieldnames = ["name", "chr_name", "chr_position", "effect_allele", - "other_allele", "effect_weight", "effect_type"] + fieldnames = ["chr_name", "chr_position", "effect_allele", + "other_allele", "effect_weight", "effect_type", "accession", "row_nr"] writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t', extrasaction='ignore') writer.writeheader() # write out in batches for compression efficiency and speed for scoring_file in scoring_files: - logger.info(f"Writing {scoring_file.name} variants") + logger.info(f"Writing {scoring_file.accession} variants") while True: batch = list(islice(scoring_file.variants, Config.batch_size)) if not batch: From 669eb8d76bedaf4544cf21f6eead9463e5aeba6d Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 1 Nov 2023 11:10:18 +0000 Subject: [PATCH 05/40] check for duplicates --- pgscatalog_utils/scorefile/qc.py | 47 +++++++++++++++++++++-- pgscatalog_utils/scorefile/scoringfile.py | 6 ++- 2 files 
changed, 47 insertions(+), 6 deletions(-) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 69eb660..a146dcc 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -1,17 +1,45 @@ import logging - logger = logging.getLogger(__name__) + def quality_control(variants, harmonised): variants = remap_harmonised(variants, harmonised) variants = drop_hla(variants) variants = assign_effect_type(variants) variants = check_effect_weight(variants) variants = assign_other_allele(variants) + variants = check_duplicates(variants) return variants +def check_duplicates(variants): + seen_ids = set() + current_accession = None + + for variant in variants: + accession = variant['accession'] + + if accession != current_accession: + seen_ids = set() + current_accession = accession + + # None other allele -> empty string + id = ":".join([str(variant[k] or "") for k in + ['chr_name', 'chr_position', 'effect_allele', 'other_allele']]) + + if id in seen_ids: + logger.warning( + f"Duplicate variant found: {variant['accession']}: {id} {variant['row_nr']}") + variant['is_duplicated'] = True + else: + variant['is_duplicated'] = False + + seen_ids.add(id) + + yield variant + + def drop_hla(variants): for variant in variants: if variant['effect_allele'] != 'P' or variant['effect_allele'] != 'N': @@ -32,10 +60,16 @@ def check_effect_weight(variants): def assign_other_allele(variants): for variant in variants: + if 'other_allele' in variant: + if '/' in variant['other_allele']: + # drop multiple other alleles + variant['other_allele'] = None + if 'other_allele' not in variant: variant['other_allele'] = None yield variant + def assign_effect_type(variants): for variant in variants: if 'is_recessive' not in variant and 'is_dominant' not in variant: @@ -57,11 +91,16 @@ def assign_effect_type(variants): def remap_harmonised(variants, harmonised: bool): for variant in variants: if harmonised: - variant['chr_name'] = variant['hm_chr'] - 
variant['chr_position'] = variant['hm_pos'] + # if harmonised data are available, always overwrite + if variant['hm_chr']: + variant['chr_name'] = variant['hm_chr'] + + if variant['hm_pos']: + variant['chr_position'] = variant['hm_pos'] if 'hm_inferOtherAllele' in variant and variant.get('other_allele') is None: - logger.debug("Replacing missing other_allele with inferred other allele") + logger.debug( + "Replacing missing other_allele with inferred other allele") variant['other_allele'] = variant['hm_inferOtherAllele'] yield variant diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index 4122022..0d20df2 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -73,6 +73,7 @@ def from_path(cls, path: pathlib.Path): def read_variants(path, fields, start_line, name: str): open_function = auto_open(path) with open_function(path, mode='rt') as f: + row_nr = 0 # row_nr for _ in range(start_line + 1): # skip header next(f) @@ -83,12 +84,13 @@ def read_variants(path, fields, start_line, name: str): break csv_reader = csv.reader(batch, delimiter='\t') - for i, row in enumerate(csv_reader): - variant = dict(zip(fields, row)) | {'accession': name, "row_nr": i } + for row in csv_reader: + variant = dict(zip(fields, row)) | {'accession': name, "row_nr": row_nr } keys = ["chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "hm_chr", "hm_pos", "hm_inferOtherAllele", "is_dominant", "is_recessive", "accession", "row_nr"] yield {k: variant[k] for k in keys if k in variant} + row_nr += 1 def get_columns(path) -> tuple[int, list[str]]: From bbfffbce90770aac40583e42a80b3f79f67e7281 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 1 Nov 2023 16:12:22 +0000 Subject: [PATCH 06/40] add liftover --- .../scorefile/combine_scorefiles.py | 131 +++++++++++----- pgscatalog_utils/scorefile/config.py | 8 +- pgscatalog_utils/scorefile/header.py | 20 +-- 
pgscatalog_utils/scorefile/liftover.py | 64 ++++++++ pgscatalog_utils/scorefile/qc.py | 145 ++++++++++++------ pgscatalog_utils/scorefile/scoringfile.py | 64 +++++--- pgscatalog_utils/scorefile/write.py | 24 ++- 7 files changed, 333 insertions(+), 123 deletions(-) create mode 100644 pgscatalog_utils/scorefile/liftover.py diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 6650459..e3ce606 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -7,6 +7,7 @@ from pgscatalog_utils.config import set_logging_level from pgscatalog_utils.download.GenomeBuild import GenomeBuild from pgscatalog_utils.scorefile.config import Config +from pgscatalog_utils.scorefile.liftover import create_liftover from pgscatalog_utils.scorefile.scoringfile import ScoringFile from pgscatalog_utils.scorefile.write import write_combined @@ -19,6 +20,14 @@ def combine_scorefiles(): Config.threads = args.threads Config.batch_size = 20000 + Config.drop_missing = args.drop_missing + Config.target_build = GenomeBuild.from_string(args.target_build) + Config.liftover = args.liftover + Config.min_lift = args.min_lift + + if args.chain_dir: + Config.chain_dir = args.chain_dir + Config.lo = create_liftover() paths: list[str] = list(set(args.scorefiles)) # unique paths only logger.debug(f"Input scorefiles: {paths}") @@ -28,6 +37,7 @@ def combine_scorefiles(): target_build = GenomeBuild.from_string(args.target_build) bad_builds = [x.accession for x in sfs if x.genome_build != target_build] + for bad_file in bad_builds: logger.critical(f"{bad_file} doesn't match {target_build}, can't combine") raise Exception @@ -42,7 +52,8 @@ def combine_scorefiles(): def _description_text() -> str: - return textwrap.dedent('''\ + return textwrap.dedent( + """\ Combine multiple scoring files in PGS Catalog format (see https://www.pgscatalog.org/downloads/ for details) to a 'long' table of 
columns needed for variant matching and subsequent calculation. @@ -51,50 +62,96 @@ def _description_text() -> str: unharmonised and harmonised PGS Catalog data. By default all variants are output (including positions with duplicated data [often caused by rsID/liftover collions across builds]) and variants with missing positions. - ''') + """ + ) def _epilog_text() -> str: - return textwrap.dedent('''\ + return textwrap.dedent( + """\ The long table is used to simplify intersecting variants in target genotyping datasets and the scoring files with the match_variants program. - ''') + """ + ) def _parse_args(args=None) -> argparse.Namespace: - parser = argparse.ArgumentParser(description=_description_text(), - epilog=_epilog_text(), - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('-s', '--scorefiles', dest='scorefiles', nargs='+', - help=' Scorefile path (wildcard * is OK)', - required=True) - parser.add_argument('--liftover', dest='liftover', - help=' Convert scoring file variants to target genome build?', - action='store_true') - parser.add_argument('-t', '--target_build', dest='target_build', - choices=['GRCh37', 'GRCh38'], - help=' Build of target genome', - required=True) - parser.add_argument('-c', '--chain_dir', dest='chain_dir', - help='Path to directory containing chain files', - required="--liftover" in sys.argv) - parser.add_argument('-m', '--min_lift', dest='min_lift', - help=' If liftover, minimum proportion of variants lifted over', - required="--liftover" in sys.argv, default=0.95, type=float) - parser.add_argument('--threads', dest='threads', required=False, - default=1, type=int) - parser.add_argument('--drop_missing', dest='drop_missing', action='store_true', - help=' Drop variants with missing information (chr/pos) and ' - 'non-standard alleles (e.g. 
HLA=P/N) from the output file.') - parser.add_argument('-o', '--outfile', dest='outfile', required=True, - default='combined.txt', - help=' Output path to combined long scorefile ' - '[ will compress output if filename ends with .gz ]') - parser.add_argument('-l', '--logfile', dest='logfile', default='log_combined.json', - help=' Name for the log file (score metadata) for combined scores.' - '[ will write to identical directory as combined scorefile]') - parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', - help=' Extra logging information') + parser = argparse.ArgumentParser( + description=_description_text(), + epilog=_epilog_text(), + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "-s", + "--scorefiles", + dest="scorefiles", + nargs="+", + help=" Scorefile path (wildcard * is OK)", + required=True, + ) + parser.add_argument( + "--liftover", + dest="liftover", + help=" Convert scoring file variants to target genome build?", + action="store_true", + ) + parser.add_argument( + "-t", + "--target_build", + dest="target_build", + choices=["GRCh37", "GRCh38"], + help=" Build of target genome", + required=True, + ) + parser.add_argument( + "-c", + "--chain_dir", + dest="chain_dir", + help="Path to directory containing chain files", + required="--liftover" in sys.argv, + ) + parser.add_argument( + "-m", + "--min_lift", + dest="min_lift", + help=" If liftover, minimum proportion of variants lifted over", + default=0.95, + type=float, + ) + parser.add_argument( + "--threads", dest="threads", required=False, default=1, type=int + ) + parser.add_argument( + "--drop_missing", + dest="drop_missing", + action="store_true", + help=" Drop variants with missing information (chr/pos) and " + "non-standard alleles (e.g. 
HLA=P/N) from the output file.", + ) + parser.add_argument( + "-o", + "--outfile", + dest="outfile", + required=True, + default="combined.txt", + help=" Output path to combined long scorefile " + "[ will compress output if filename ends with .gz ]", + ) + parser.add_argument( + "-l", + "--logfile", + dest="logfile", + default="log_combined.json", + help=" Name for the log file (score metadata) for combined scores." + "[ will write to identical directory as combined scorefile]", + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + action="store_true", + help=" Extra logging information", + ) return parser.parse_args(args) diff --git a/pgscatalog_utils/scorefile/config.py b/pgscatalog_utils/scorefile/config.py index 10bc5b3..2725110 100644 --- a/pgscatalog_utils/scorefile/config.py +++ b/pgscatalog_utils/scorefile/config.py @@ -1,11 +1,17 @@ from dataclasses import dataclass +import pyliftover + +from pgscatalog_utils.download.GenomeBuild import GenomeBuild + @dataclass class Config: threads: int drop_missing: bool liftover: bool + lo: pyliftover.liftover chain_dir: str min_lift: float - batch_size: int \ No newline at end of file + batch_size: int + target_build: GenomeBuild diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py index 78259c6..3a5e889 100644 --- a/pgscatalog_utils/scorefile/header.py +++ b/pgscatalog_utils/scorefile/header.py @@ -29,7 +29,7 @@ def __post_init__(self): if self.HmPOS_build: self.HmPOS_build = GenomeBuild.from_string(self.HmPOS_build) - if self.format_version != '2.0': + if self.format_version != "2.0": raise Exception("Only support v2 format") @classmethod @@ -40,9 +40,9 @@ def from_path(cls, path: pathlib.Path): header_dict = {k: raw_header[k] for k in raw_header.keys() & keep_keys} # ... 
so we can unpack the dict into a dataclass - if len(header_dict) > 1 and 'HmPOS_build' not in header_dict: + if len(header_dict) > 1 and "HmPOS_build" not in header_dict: # working with pgs catalog formatted header but unharmonised data - header_dict['HmPOS_build'] = None + header_dict["HmPOS_build"] = None if header_dict: return ScoringFileHeader(**header_dict) @@ -54,7 +54,7 @@ def from_path(cls, path: pathlib.Path): def raw_header_to_dict(header): d = {} for item in header: - key, value = item.split('=') + key, value = item.split("=") d[key[1:]] = value # drop # character from key return d @@ -62,14 +62,14 @@ def raw_header_to_dict(header): def read_header(path: pathlib.Path): """Parses the header of a PGS Catalog format scorefile into a dictionary""" open_function = auto_open(path) - with open_function(path, 'rt') as f: + with open_function(path, "rt") as f: yield from _gen_header_lines(f) def _gen_header_lines(f): for line in f: - if line.startswith('#'): - if '=' in line: + if line.startswith("#"): + if "=" in line: yield line.strip() else: # stop reading lines @@ -77,8 +77,8 @@ def _gen_header_lines(f): def auto_open(filepath): - with open(filepath, 'rb') as test_f: - if test_f.read(2) == b'\x1f\x8b': + with open(filepath, "rb") as test_f: + if test_f.read(2) == b"\x1f\x8b": gzipped = True else: gzipped = False @@ -88,4 +88,4 @@ def auto_open(filepath): elif gzipped: return gzip.open elif not gzipped: - return open \ No newline at end of file + return open diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py new file mode 100644 index 0000000..9924916 --- /dev/null +++ b/pgscatalog_utils/scorefile/liftover.py @@ -0,0 +1,64 @@ +import logging +import os + +import pyliftover + +from pgscatalog_utils.download.GenomeBuild import GenomeBuild +from pgscatalog_utils.scorefile.config import Config + +logger = logging.getLogger(__name__) + + +def liftover( + variants, harmonised: bool, current_build: GenomeBuild, target_build: 
GenomeBuild +): + if harmonised: + skip_lo = True + elif target_build == current_build: + skip_lo = True + else: + skip_lo = False + + if skip_lo: + for variant in variants: + yield variant + else: + if current_build == GenomeBuild.GRCh37 and target_build == GenomeBuild.GRCh38: + lo: pyliftover.LiftOver = Config.lo["hg19hg38"] + elif current_build == GenomeBuild.GRCh38 and target_build == GenomeBuild.GRCh37: + lo: pyliftover.LiftOver = Config.lo["hg19hg38"] + else: + raise Exception("Can't get pyliftover object") + + n_lifted = 0 + n = 0 + + for variant in variants: + chrom = "chr" + variant["chr_name"] + pos = int(variant["chr_position"]) - 1 # VCF -> 1 based, UCSC -> 0 based + lifted = lo.convert_coordinate(chrom, pos) + if lifted: + variant["chr_name"] = lifted[0][0][3:].split("_")[0] + variant["chr_position"] = lifted[0][1] + 1 # reverse 0 indexing + n_lifted += 1 + yield variant + n += 1 + + if (n_lifted / n) < Config.min_lift: + logger.error(f"Liftover failed") + raise Exception + else: + logger.info("Liftover successful") + + +def create_liftover() -> dict["str" : pyliftover.LiftOver]: + """Create LiftOver objects that can remap genomic coordinates""" + chain_dir: str = Config.chain_dir + builds: list[str] = ["hg19hg38", "hg38hg19"] + chains: list[str] = [ + os.path.join(chain_dir, x) + for x in ["hg19ToHg38.over.chain.gz", "hg38ToHg19.over.chain.gz"] + ] + lo: list[pyliftover.LiftOver] = [pyliftover.LiftOver(x) for x in chains] + logger.debug("Chain files loaded for liftover") + return dict(zip(builds, lo)) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index a146dcc..44d907f 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -1,57 +1,87 @@ import logging +import typing + +from pgscatalog_utils.scorefile.config import Config +from pgscatalog_utils.scorefile.header import ScoringFileHeader +from pgscatalog_utils.scorefile.liftover import liftover logger = logging.getLogger(__name__) -def 
quality_control(variants, harmonised): +def quality_control(variants, header: ScoringFileHeader, harmonised: bool): variants = remap_harmonised(variants, harmonised) - variants = drop_hla(variants) + + if Config.drop_missing: + variants = drop_hla(variants) + variants = assign_effect_type(variants) variants = check_effect_weight(variants) variants = assign_other_allele(variants) variants = check_duplicates(variants) + + if Config.liftover: + variants = liftover( + variants, + harmonised=harmonised, + current_build=header.genome_build, + target_build=Config.target_build, + ) + return variants def check_duplicates(variants): - seen_ids = set() - current_accession = None - + seen_ids: dict = {} + current_accession: typing.Union[str, None] = None + n_duplicates: int = 0 + n_variants: int = 0 for variant in variants: - accession = variant['accession'] + accession: str = variant["accession"] if accession != current_accession: - seen_ids = set() + seen_ids = {} current_accession = accession # None other allele -> empty string - id = ":".join([str(variant[k] or "") for k in - ['chr_name', 'chr_position', 'effect_allele', 'other_allele']]) + id: str = ":".join( + [ + str(variant[k] or "") + for k in ["chr_name", "chr_position", "effect_allele", "other_allele"] + ] + ) if id in seen_ids: - logger.warning( - f"Duplicate variant found: {variant['accession']}: {id} {variant['row_nr']}") - variant['is_duplicated'] = True + variant["is_duplicated"] = True + n_duplicates += 1 else: - variant['is_duplicated'] = False + variant["is_duplicated"] = False - seen_ids.add(id) + seen_ids[id] = True yield variant + n_variants += 1 + + if n_duplicates > 0: + logger.warning( + f"{n_duplicates} of {n_variants} variants are duplicated in: {current_accession}" + ) def drop_hla(variants): + n_dropped = 0 for variant in variants: - if variant['effect_allele'] != 'P' or variant['effect_allele'] != 'N': + if variant["effect_allele"] != "P" or variant["effect_allele"] != "N": yield variant else: - 
logger.warning("HLA alleles detected and dropped") + n_dropped += 1 + + logger.warning(f"{n_dropped} HLA alleles detected and dropped") def check_effect_weight(variants): for variant in variants: try: - variant['effect_weight'] = float(variant['effect_weight']) + variant["effect_weight"] = float(variant["effect_weight"]) except ValueError: logger.critical(f"{variant} has bad effect weight") raise ValueError @@ -59,29 +89,35 @@ def check_effect_weight(variants): def assign_other_allele(variants): + n_dropped = 0 for variant in variants: - if 'other_allele' in variant: - if '/' in variant['other_allele']: + if "other_allele" in variant: + if "/" in variant["other_allele"]: # drop multiple other alleles - variant['other_allele'] = None + n_dropped += 1 + variant["other_allele"] = None + else: + variant["other_allele"] = None - if 'other_allele' not in variant: - variant['other_allele'] = None yield variant + if n_dropped > 0: + logger.warning(f"Multiple other_alleles detected in {n_dropped} variants") + logger.warning("Other allele for these variants is set to missing") + def assign_effect_type(variants): for variant in variants: - if 'is_recessive' not in variant and 'is_dominant' not in variant: - variant['effect_type'] = 'additive' + if "is_recessive" not in variant and "is_dominant" not in variant: + variant["effect_type"] = "additive" - if 'is_recessive' in variant or 'is_dominant' in variant: + if "is_recessive" in variant or "is_dominant" in variant: logger.info("Recessive or dominant variant detected") - if variant['is_recessive']: - variant['effect_type'] = 'recessive' - elif variant['is_dominant']: - variant['effect_type'] = 'dominant' - elif variant['is_recessive'] and variant['is_dominant']: + if variant["is_recessive"]: + variant["effect_type"] = "recessive" + elif variant["is_dominant"]: + variant["effect_type"] = "dominant" + elif variant["is_recessive"] and variant["is_dominant"]: logger.critical(f"Bad effect type setting: {variant}") raise Exception 
@@ -89,20 +125,37 @@ def assign_effect_type(variants): def remap_harmonised(variants, harmonised: bool): - for variant in variants: - if harmonised: - # if harmonised data are available, always overwrite - if variant['hm_chr']: - variant['chr_name'] = variant['hm_chr'] - - if variant['hm_pos']: - variant['chr_position'] = variant['hm_pos'] - - if 'hm_inferOtherAllele' in variant and variant.get('other_allele') is None: - logger.debug( - "Replacing missing other_allele with inferred other allele") - variant['other_allele'] = variant['hm_inferOtherAllele'] - - yield variant - else: + n_bad = 0 + if harmonised: + for variant in variants: + if variant["hm_chr"]: + variant["chr_name"] = variant["hm_chr"] + + if variant["hm_pos"]: + variant["chr_position"] = variant["hm_pos"] + + if "hm_inferOtherAllele" in variant and variant.get("other_allele") is None: + variant["other_allele"] = variant["hm_inferOtherAllele"] + + if ( + "chr_name" in variant + and "chr_position" in variant + and "effect_weight" in variant + ): + yield variant + elif Config.drop_missing: + continue + # (don't yield anything, filtering out missing variants) + else: + # assume a bad harmonisation with no genomic coordinates + # these will get labelled as duplicates eventually (probably) + variant["chr_name"] = None + variant["chr_position"] = None + yield variant + n_bad += 1 + else: + for variant in variants: yield variant + + if n_bad > 1: + logger.warning(f"{n_bad} variants failed harmonisation") diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index 0d20df2..e1aea8f 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -29,10 +29,11 @@ class ScoringFile: def __post_init__(self): if self.header.HmPOS_build: logger.info( - f"{self.path} harmonised data detected: {self.header.HmPOS_build}") + f"{self.path} harmonised data detected: {self.header.HmPOS_build}" + ) self.genome_build = 
self.header.HmPOS_build - mandatory_columns = {'chr_name', 'effect_allele', 'effect_weight'} + mandatory_columns = {"chr_name", "effect_allele", "effect_weight"} if not mandatory_columns.issubset(self.fields) not in self.fields: err_msg = f"{self.path} missing fields" raise Exception(err_msg) @@ -40,7 +41,7 @@ def __post_init__(self): @classmethod def from_path(cls, path: pathlib.Path): header = ScoringFileHeader.from_path(path) - name = os.path.basename(path).split('.')[0] + name = os.path.basename(path).split(".")[0] if header: if header.HmPOS_build: harmonised = True @@ -56,24 +57,29 @@ def from_path(cls, path: pathlib.Path): # generate variants (a list of dicts, one for each variants) logger.info(f"Lazily reading variants from {path}") - variants = ScoringFile.read_variants(path=path, start_line=start_line, - fields=cols, name=name) + variants = ScoringFile.read_variants( + path=path, start_line=start_line, fields=cols, name=name + ) # note: these generator expressions aren't doing a bunch of iterations # it's just a data processing pipeline - variants = quality_control(variants, harmonised) - - return cls(path=path, header=header, genome_build=genome_build, - harmonised=harmonised, - fields=cols, - variants=variants, - accession=name) + variants = quality_control(variants, header=header, harmonised=harmonised) + + return cls( + path=path, + header=header, + genome_build=genome_build, + harmonised=harmonised, + fields=cols, + variants=variants, + accession=name, + ) @staticmethod def read_variants(path, fields, start_line, name: str): open_function = auto_open(path) - with open_function(path, mode='rt') as f: - row_nr = 0 # row_nr + with open_function(path, mode="rt") as f: + row_nr = 0 # row_nr for _ in range(start_line + 1): # skip header next(f) @@ -83,20 +89,34 @@ def read_variants(path, fields, start_line, name: str): if not batch: break - csv_reader = csv.reader(batch, delimiter='\t') + csv_reader = csv.reader(batch, delimiter="\t") for row in csv_reader: 
- variant = dict(zip(fields, row)) | {'accession': name, "row_nr": row_nr } - keys = ["chr_name", "chr_position", "effect_allele", "other_allele", - "effect_weight", "hm_chr", "hm_pos", "hm_inferOtherAllele", - "is_dominant", "is_recessive", "accession", "row_nr"] + variant = dict(zip(fields, row)) | { + "accession": name, + "row_nr": row_nr, + } + keys = [ + "chr_name", + "chr_position", + "effect_allele", + "other_allele", + "effect_weight", + "hm_chr", + "hm_pos", + "hm_inferOtherAllele", + "is_dominant", + "is_recessive", + "accession", + "row_nr", + ] yield {k: variant[k] for k in keys if k in variant} row_nr += 1 def get_columns(path) -> tuple[int, list[str]]: open_function = auto_open(path) - with open_function(path, mode='rt') as f: + with open_function(path, mode="rt") as f: for i, line in enumerate(f): - if line.startswith('#'): + if line.startswith("#"): continue - return i, line.strip().split('\t') + return i, line.strip().split("\t") diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 9345a87..8a31fb6 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -19,17 +19,27 @@ def write_combined(scoring_files: list[ScoringFile], out_path: str): open_function = functools.partial(gzip.open, compresslevel=6) elif Config.threads > 1: logger.info("Writing with pgzip (fast)") - open_function = functools.partial(pgzip.open, compresslevel=6, - thread=Config.threads, blocksize=2 * 10 ** 8) + open_function = functools.partial( + pgzip.open, compresslevel=6, thread=Config.threads, blocksize=2 * 10**8 + ) else: logger.info("Writing text file (fast)") open_function = open - with open_function(out_path, mode='wt') as f: - fieldnames = ["chr_name", "chr_position", "effect_allele", - "other_allele", "effect_weight", "effect_type", "accession", "row_nr"] - writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t', - extrasaction='ignore') + with open_function(out_path, mode="wt") as f: + 
fieldnames = [ + "chr_name", + "chr_position", + "effect_allele", + "other_allele", + "effect_weight", + "effect_type", + "accession", + "row_nr", + ] + writer = csv.DictWriter( + f, fieldnames=fieldnames, delimiter="\t", extrasaction="ignore" + ) writer.writeheader() # write out in batches for compression efficiency and speed From 43046c7f1141bd63a52ef37aae75d1cf2eda7f26 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 1 Nov 2023 16:20:27 +0000 Subject: [PATCH 07/40] update dependencies and set up pre-commit --- .pre-commit-config.yaml | 6 +++ poetry.lock | 110 ++++++++++++++++++++++++++++++++++++++-- pyproject.toml | 1 + 3 files changed, 114 insertions(+), 3 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..6e4ae2c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,6 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.1.3 + hooks: + - id: ruff-format \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index dac6b3d..4c1fda4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "anyio" @@ -297,6 +297,17 @@ files = [ [package.dependencies] pycparser = "*" +[[package]] +name = "cfgv" +version = "3.4.0" +description = "Validate configuration and produce human readable error messages." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, + {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, +] + [[package]] name = "charset-normalizer" version = "3.2.0" @@ -671,6 +682,17 @@ files = [ {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, ] +[[package]] +name = "distlib" +version = "0.3.7" +description = "Distribution utilities" +optional = false +python-versions = "*" +files = [ + {file = "distlib-0.3.7-py2.py3-none-any.whl", hash = "sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057"}, + {file = "distlib-0.3.7.tar.gz", hash = "sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8"}, +] + [[package]] name = "exceptiongroup" version = "1.1.3" @@ -713,6 +735,22 @@ files = [ [package.extras] devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] +[[package]] +name = "filelock" +version = "3.13.1" +description = "A platform independent file lock." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "filelock-3.13.1-py3-none-any.whl", hash = "sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c"}, + {file = "filelock-3.13.1.tar.gz", hash = "sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e"}, +] + +[package.extras] +docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.24)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"] +typing = ["typing-extensions (>=4.8)"] + [[package]] name = "fonttools" version = "4.42.1" @@ -781,6 +819,20 @@ files = [ {file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"}, ] +[[package]] +name = "identify" +version = "2.5.31" +description = "File identification library for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "identify-2.5.31-py2.py3-none-any.whl", hash = "sha256:90199cb9e7bd3c5407a9b7e81b4abec4bb9d249991c79439ec8af740afc6293d"}, + {file = "identify-2.5.31.tar.gz", hash = "sha256:7736b3c7a28233637e3c36550646fc6389bedd74ae84cb788200cc8e2dd60b75"}, +] + +[package.extras] +license = ["ukkonen"] + [[package]] name = "idna" version = "3.4" @@ -1737,6 +1789,20 @@ files = [ {file = "nest_asyncio-1.5.8.tar.gz", hash = "sha256:25aa2ca0d2a5b5531956b9e273b45cf664cae2b145101d73b86b199978d48fdb"}, ] +[[package]] +name = "nodeenv" +version = "1.8.0" +description = "Node.js virtual environment builder" +optional = false +python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*" +files = [ + {file = "nodeenv-1.8.0-py2.py3-none-any.whl", hash = "sha256:df865724bb3c3adc86b3876fa209771517b0cfe596beff01a92700e0e8be4cec"}, + {file = "nodeenv-1.8.0.tar.gz", hash = "sha256:d51e0c37e64fbf47d017feac3145cdbb58836d7eee8c6f6d3b6880c5456227d2"}, +] + +[package.dependencies] +setuptools = "*" + 
[[package]] name = "notebook" version = "7.0.4" @@ -1871,8 +1937,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, ] python-dateutil = ">=2.8.1" pytz = ">=2020.1" @@ -2082,6 +2148,24 @@ pyarrow = ["pyarrow (>=4.0.0)"] timezone = ["backports.zoneinfo", "tzdata"] xlsx2csv = ["xlsx2csv (>=0.8.0)"] +[[package]] +name = "pre-commit" +version = "3.5.0" +description = "A framework for managing and maintaining multi-language pre-commit hooks." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pre_commit-3.5.0-py2.py3-none-any.whl", hash = "sha256:841dc9aef25daba9a0238cd27984041fa0467b4199fc4852e27950664919f660"}, + {file = "pre_commit-3.5.0.tar.gz", hash = "sha256:5804465c675b659b0862f07907f96295d490822a450c4c40e747d0b1c6ebcb32"}, +] + +[package.dependencies] +cfgv = ">=2.0.0" +identify = ">=1.0.0" +nodeenv = ">=0.11.1" +pyyaml = ">=5.1" +virtualenv = ">=20.10.0" + [[package]] name = "prometheus-client" version = "0.17.1" @@ -3031,6 +3115,26 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17. 
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "virtualenv" +version = "20.24.6" +description = "Virtual Python Environment builder" +optional = false +python-versions = ">=3.7" +files = [ + {file = "virtualenv-20.24.6-py3-none-any.whl", hash = "sha256:520d056652454c5098a00c0f073611ccbea4c79089331f60bf9d7ba247bb7381"}, + {file = "virtualenv-20.24.6.tar.gz", hash = "sha256:02ece4f56fbf939dbbc33c0715159951d6bf14aaf5457b092e4548e1382455af"}, +] + +[package.dependencies] +distlib = ">=0.3.7,<1" +filelock = ">=3.12.2,<4" +platformdirs = ">=3.9.1,<4" + +[package.extras] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] + [[package]] name = "wcwidth" version = "0.2.6" @@ -3157,4 +3261,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "2859497817dfd52518f4fa2ba527c716a5bb5e4354175f791b314e80a033edf2" +content-hash = "b9985d182b0c350a39e12aeae274f2e809d1454f47b58b2d2a5fe8b8264418b7" diff --git a/pyproject.toml b/pyproject.toml index 9d8a99c..15a3b9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ polars = "^0.15.0" zstandard = "^0.18.0" pgzip = "^0.3.2" scikit-learn = "^1.2.1" +pre-commit = "^3.5.0" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" From a947a39a87f801eb116bd33b20ea5a96261052f5 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 1 Nov 2023 16:27:20 +0000 Subject: [PATCH 08/40] complain when linting fails --- .pre-commit-config.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff 
--git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6e4ae2c..98d8851 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,4 +3,6 @@ repos: # Ruff version. rev: v0.1.3 hooks: - - id: ruff-format \ No newline at end of file + - id: ruff-format + - id: ruff + args: [--fix, --exit-non-zero-on-fix] \ No newline at end of file From 32ef39c3befb3b3cb0b9986024ee7c5467b88a37 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 1 Nov 2023 16:30:03 +0000 Subject: [PATCH 09/40] fix linting --- .pre-commit-config.yaml | 4 ++-- pgscatalog_utils/scorefile/liftover.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 98d8851..f7d0c74 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,6 +3,6 @@ repos: # Ruff version. rev: v0.1.3 hooks: - - id: ruff-format - id: ruff - args: [--fix, --exit-non-zero-on-fix] \ No newline at end of file + args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index 9924916..7e35b23 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -45,7 +45,7 @@ def liftover( n += 1 if (n_lifted / n) < Config.min_lift: - logger.error(f"Liftover failed") + logger.error("Liftover failed") raise Exception else: logger.info("Liftover successful") From eb9d362b788b7095d251c2ee668a93f642d453e7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 2 Nov 2023 12:30:17 +0000 Subject: [PATCH 10/40] support wide files --- pgscatalog_utils/scorefile/header.py | 17 ++--- pgscatalog_utils/scorefile/qc.py | 9 ++- pgscatalog_utils/scorefile/scoringfile.py | 93 ++++++++++++++++------- 3 files changed, 82 insertions(+), 37 deletions(-) diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py index 3a5e889..0397754 100644 --- a/pgscatalog_utils/scorefile/header.py +++ 
b/pgscatalog_utils/scorefile/header.py @@ -24,31 +24,30 @@ class ScoringFileHeader: citation: str def __post_init__(self): - self.variants_number = int(self.variants_number) + if self.variants_number: + self.variants_number = int(self.variants_number) + self.genome_build = GenomeBuild.from_string(self.genome_build) if self.HmPOS_build: self.HmPOS_build = GenomeBuild.from_string(self.HmPOS_build) - if self.format_version != "2.0": - raise Exception("Only support v2 format") - @classmethod def from_path(cls, path: pathlib.Path): raw_header: dict = raw_header_to_dict(read_header(path)) - # only keep keys needed by class (intersect) + # only keep keys needed by class but support partial headers with None values keep_keys = ScoringFileHeader.__annotations__.keys() - header_dict = {k: raw_header[k] for k in raw_header.keys() & keep_keys} + header_dict = {k: raw_header.get(k) for k in keep_keys} # ... so we can unpack the dict into a dataclass - if len(header_dict) > 1 and "HmPOS_build" not in header_dict: + if "HmPOS_build" not in header_dict: # working with pgs catalog formatted header but unharmonised data header_dict["HmPOS_build"] = None - if header_dict: + if not all([v is None for _, v in header_dict.items()]): return ScoringFileHeader(**header_dict) else: # no header available - return None + raise Exception("No header detected in scoring file") def raw_header_to_dict(header): diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 44d907f..3513903 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -8,7 +8,7 @@ logger = logging.getLogger(__name__) -def quality_control(variants, header: ScoringFileHeader, harmonised: bool): +def quality_control(variants, header: ScoringFileHeader, harmonised: bool, wide: bool): variants = remap_harmonised(variants, harmonised) if Config.drop_missing: @@ -17,6 +17,13 @@ def quality_control(variants, header: ScoringFileHeader, harmonised: bool): variants = 
assign_effect_type(variants) variants = check_effect_weight(variants) variants = assign_other_allele(variants) + + if wide: + # wide data must be sorted because: + # - check_duplicates requires sorted input + # - output would be unsorted, which looks a little bit messy + variants = (x for x in sorted(variants, key=lambda x: x["accession"])) + variants = check_duplicates(variants) if Config.liftover: diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index e1aea8f..0908576 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -54,16 +54,18 @@ def from_path(cls, path: pathlib.Path): genome_build = None start_line, cols = get_columns(path) + is_wide = detect_wide(cols) - # generate variants (a list of dicts, one for each variants) logger.info(f"Lazily reading variants from {path}") variants = ScoringFile.read_variants( - path=path, start_line=start_line, fields=cols, name=name + path=path, start_line=start_line, fields=cols, name=name, is_wide=is_wide ) # note: these generator expressions aren't doing a bunch of iterations # it's just a data processing pipeline - variants = quality_control(variants, header=header, harmonised=harmonised) + variants = quality_control( + variants, header=header, harmonised=harmonised, wide=is_wide + ) return cls( path=path, @@ -76,10 +78,10 @@ def from_path(cls, path: pathlib.Path): ) @staticmethod - def read_variants(path, fields, start_line, name: str): + def read_variants(path, fields, start_line, name: str, is_wide: bool): open_function = auto_open(path) + row_nr = 0 with open_function(path, mode="rt") as f: - row_nr = 0 # row_nr for _ in range(start_line + 1): # skip header next(f) @@ -90,27 +92,46 @@ def read_variants(path, fields, start_line, name: str): break csv_reader = csv.reader(batch, delimiter="\t") - for row in csv_reader: - variant = dict(zip(fields, row)) | { - "accession": name, - "row_nr": row_nr, - } - keys = [ - 
"chr_name", - "chr_position", - "effect_allele", - "other_allele", - "effect_weight", - "hm_chr", - "hm_pos", - "hm_inferOtherAllele", - "is_dominant", - "is_recessive", - "accession", - "row_nr", - ] - yield {k: variant[k] for k in keys if k in variant} - row_nr += 1 + yield from read_rows(csv_reader, fields, name, row_nr, is_wide) + + +def read_rows(csv_reader, fields: list[str], name: str, row_nr: int, wide: bool): + for row in csv_reader: + variant = dict(zip(fields, row)) + + if wide: + ew_col_idxs: list[int] = [ + i for i, x in enumerate(["effect_weight_" in x for x in fields]) if x + ] + for i, weight_name in zip(ew_col_idxs, [fields[i] for i in ew_col_idxs]): + keys = ["chr_name", "chr_position", "effect_allele", "other_allele"] + yield {k: variant[k] for k in keys if k in variant} | { + "accession": weight_name, + "row_nr": row_nr, + "effect_weight": variant[weight_name], + } + else: + keys = [ + "chr_name", + "chr_position", + "effect_allele", + "other_allele", + "effect_weight", + "hm_chr", + "hm_pos", + "hm_inferOtherAllele", + "is_dominant", + "is_recessive", + "accession", + "row_nr", + ] + + yield {k: variant[k] for k in keys if k in variant} | { + "accession": name, + "row_nr": row_nr, + } + + row_nr += 1 def get_columns(path) -> tuple[int, list[str]]: @@ -119,4 +140,22 @@ def get_columns(path) -> tuple[int, list[str]]: for i, line in enumerate(f): if line.startswith("#"): continue - return i, line.strip().split("\t") + line_no, cols = i, line.strip().split("\t") + if len(set(cols)) != len(cols): + logger.critical(f"Duplicated column names: {cols}") + raise ValueError + + return line_no, cols + + +def detect_wide(cols: list[str]) -> bool: + """ + Check columns to see if multiple effect weights are present. 
Multiple effect weights must be present in the form: + effect_weight_suffix1 + effect_weight_suffix2 + """ + if any(["effect_weight_" in x for x in cols]): + logger.info("Wide scoring file detected with multiple effect weights") + return True + else: + return False From 1774e01c575cf135270ec991ea1677e12f113a71 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 2 Nov 2023 15:55:08 +0000 Subject: [PATCH 11/40] add log --- pgscatalog_utils/download/GenomeBuild.py | 17 ++++++----- .../scorefile/combine_scorefiles.py | 11 ++++++- pgscatalog_utils/scorefile/header.py | 12 ++++---- pgscatalog_utils/scorefile/scoringfile.py | 29 ++++++++++++++++++- pgscatalog_utils/scorefile/write.py | 5 ++++ 5 files changed, 60 insertions(+), 14 deletions(-) diff --git a/pgscatalog_utils/download/GenomeBuild.py b/pgscatalog_utils/download/GenomeBuild.py index 893bf97..23c8984 100644 --- a/pgscatalog_utils/download/GenomeBuild.py +++ b/pgscatalog_utils/download/GenomeBuild.py @@ -1,18 +1,21 @@ -from enum import Enum, auto +from enum import Enum class GenomeBuild(Enum): - GRCh37 = auto() - GRCh38 = auto() + GRCh37 = "GRCh37" + GRCh38 = "GRCh38" + + def __str__(self): + return str(self.value) @classmethod def from_string(cls, build): match build: - case 'GRCh37' | 'hg18': + case "GRCh37" | "hg18": return cls(GenomeBuild.GRCh37) - case 'GRCh38' | 'hg19': + case "GRCh38" | "hg19": return cls(GenomeBuild.GRCh38) - case 'NR': + case "NR": return None case _: - raise Exception \ No newline at end of file + raise Exception diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index e3ce606..8e3bea8 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -1,4 +1,5 @@ import argparse +import json import logging import sys import textwrap @@ -44,7 +45,15 @@ def combine_scorefiles(): else: logger.info(f"All builds match target build {target_build}") - write_combined(sfs, 
args.outfile) + line_counts: dict[str, int] = write_combined(sfs, args.outfile) + # provide line counts when making the scoring files + log = [] + for (k, v), sf in zip(line_counts.items(), sfs): + log.append(sf.generate_log(v)) + + with open(args.logfile, "w") as f: + logger.info(f"Writing log to {f.name}") + json.dump(log, f, indent=4) end_time = time.time() elapsed_time = end_time - start_time diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py index 0397754..e9a03e4 100644 --- a/pgscatalog_utils/scorefile/header.py +++ b/pgscatalog_utils/scorefile/header.py @@ -13,15 +13,17 @@ class ScoringFileHeader: pgs_id: str pgp_id: str - trait_efo: str - trait_reported: str - trait_mapped: str pgs_name: str genome_build: GenomeBuild - HmPOS_build: GenomeBuild variants_number: int - format_version: str + trait_reported: str + trait_efo: str + trait_mapped: str + weight_type: str citation: str + HmPOS_build: GenomeBuild + HmPOS_date: str + format_version: str def __post_init__(self): if self.variants_number: diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index 0908576..c879b85 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -77,10 +77,36 @@ def from_path(cls, path: pathlib.Path): accession=name, ) + def generate_log(self, line_count: int): + log = { + key: str(value) if value is not None else None + for key, value in self.header.__dict__.items() + } + + if log["variants_number"] is None: + # custom scoring files might not have this information + log["variants_number"] = line_count + + # multiple terms may be separated with a pipe + if log["trait_mapped"]: + log["trait_mapped"] = log["trait_mapped"].split("|") + + if log["trait_efo"]: + log["trait_efo"] = log["trait_efo"].split("|") + + log["columns"] = self.fields + log["use_liftover"] = Config.liftover + log["use_harmonised"] = self.harmonised + + return {self.accession: log} + 
@staticmethod def read_variants(path, fields, start_line, name: str, is_wide: bool): open_function = auto_open(path) - row_nr = 0 + # row_nr and cum_batch are equivalent but + row_nr = 0 # important to increment in sub-generator for each line + cum_batch = 0 # sums batches in this function + with open_function(path, mode="rt") as f: for _ in range(start_line + 1): # skip header @@ -88,6 +114,7 @@ def read_variants(path, fields, start_line, name: str, is_wide: bool): while True: batch = list(islice(f, Config.batch_size)) + cum_batch += len(batch) if not batch: break diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 8a31fb6..d0a32bb 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -42,6 +42,7 @@ def write_combined(scoring_files: list[ScoringFile], out_path: str): ) writer.writeheader() + line_counts = {} # write out in batches for compression efficiency and speed for scoring_file in scoring_files: logger.info(f"Writing {scoring_file.accession} variants") @@ -49,4 +50,8 @@ def write_combined(scoring_files: list[ScoringFile], out_path: str): batch = list(islice(scoring_file.variants, Config.batch_size)) if not batch: break + # calculate max row_nr now because it's when we finally generate variants + line_counts[scoring_file.accession] = max(x["row_nr"] for x in batch) writer.writerows(batch) + + return line_counts From 7cf956291f0f76b7ebf1339a42593e89791d374d Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 3 Nov 2023 12:25:34 +0000 Subject: [PATCH 12/40] fix tests and liftover --- conftest.py | 180 ++-- .../scorefile/combine_scorefiles.py | 4 +- pgscatalog_utils/scorefile/liftover.py | 12 +- pgscatalog_utils/scorefile/qc.py | 10 +- pgscatalog_utils/scorefile/scoringfile.py | 2 +- tests/data/combine/PGS001229_22.txt | 850 ++++++++++++++++++ tests/data/combine/scorefile.txt | 838 +++++++++++++++++ .../scorefile_dominant_and_recessive.txt | 838 +++++++++++++++++ 
tests/test_combine.py | 144 ++- tests/test_liftover.py | 42 +- 10 files changed, 2783 insertions(+), 137 deletions(-) create mode 100644 tests/data/combine/PGS001229_22.txt create mode 100644 tests/data/combine/scorefile.txt create mode 100644 tests/data/combine/scorefile_dominant_and_recessive.txt diff --git a/conftest.py b/conftest.py index a4a55c6..ba3e065 100644 --- a/conftest.py +++ b/conftest.py @@ -1,11 +1,9 @@ import glob import importlib.resources import os -import pathlib import shutil from unittest.mock import patch -import pandas as pd import polars as pl import pytest import requests as req @@ -14,20 +12,51 @@ from pgscatalog_utils.match.preprocess import complement_valid_alleles from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles +from tests.data import combine + pl.toggle_string_cache(True) @pytest.fixture(scope="session") def pgs_accessions(): - return ['PGS001229', 'PGS000922'] + return ["PGS001229", "PGS000922"] + + +@pytest.fixture(scope="session") +def mini_score_path(tmp_path_factory): + path = importlib.resources.files(combine) / "PGS001229_22.txt" + return path + + +@pytest.fixture(scope="session") +def mini_scorefile(mini_score_path, tmp_path_factory): + # The mini scorefile overlaps well with cineca synthetic subset + out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt" + args: list[str] = ( + ["combine_scorefiles", "-t", "GRCh37", "-s"] + + [mini_score_path] + + ["-o", str(out_path.resolve())] + ) + + with patch("sys.argv", args): + combine_scorefiles() + + return str(out_path.resolve()) @pytest.fixture(scope="session") def scorefiles(tmp_path_factory, pgs_accessions): fn = tmp_path_factory.mktemp("scorefiles") - args: list[str] = ['download_scorefiles', '-b', 'GRCh37', '-o', str(fn.resolve()), '-i'] + pgs_accessions - - with patch('sys.argv', args): + args: list[str] = [ + "download_scorefiles", + "-b", + "GRCh37", + "-o", + str(fn.resolve()), + "-i", + ] + pgs_accessions + + with patch("sys.argv", 
args): download_scorefile() return glob.glob(os.path.join(fn.resolve(), "*.txt.gz")) @@ -37,8 +66,9 @@ def scorefiles(tmp_path_factory, pgs_accessions): def target_path(tmp_path_factory): try: bim = req.get( - 'https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim', - timeout=5) + "https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim", + timeout=5, + ) except (req.exceptions.ConnectionError, req.Timeout): bim = [] @@ -46,129 +76,85 @@ def target_path(tmp_path_factory): pytest.skip("Couldn't get test data from network") else: fn = tmp_path_factory.mktemp("target") / "data.bim" - with open(fn, 'wb') as f: + with open(fn, "wb") as f: f.write(bim.content) return str(fn.resolve()) -@pytest.fixture(scope="session") -def mini_score_path(tmp_path_factory): - try: - score = req.get('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/PGS001229_22.txt', - timeout=5) - except (req.exceptions.ConnectionError, req.Timeout): - score = [] - - if not score: - pytest.skip("Couldn't get test data from network") - else: - fn = tmp_path_factory.mktemp("score") / "PGS001229_22.txt" - with open(fn, 'wb') as f: - f.write(score.content) - - return str(fn.resolve()) - - -@pytest.fixture(scope="session") -def mini_scorefile(mini_score_path, tmp_path_factory): - # The mini scorefile overlaps well with cineca synthetic subset - out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt" - args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())] - - with patch('sys.argv', args): - combine_scorefiles() - - return str(out_path.resolve()) - - -@pytest.fixture(scope="session") -def combined_scorefile(scorefiles, tmp_path_factory): - # The combined scorefile overlaps poorly with cineca synthetic subset - out_path = tmp_path_factory.mktemp("scores") / "combined.txt" - args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', 
'-s'] + scorefiles + ['-o', str(out_path.resolve())] - - with patch('sys.argv', args): - combine_scorefiles() - - return str(out_path.resolve()) - - @pytest.fixture(scope="session") def chain_files(tmp_path_factory): - chain_dir = tmp_path_factory.mktemp('chain_dir') + chain_dir = tmp_path_factory.mktemp("chain_dir") shutil.copy2("tests/data/hg19ToHg38.over.chain.gz", chain_dir) shutil.copy2("tests/data/hg38ToHg19.over.chain.gz", chain_dir) - - return str(chain_dir.resolve()) - - -@pytest.fixture(scope="session") -def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory): - out_path = tmp_path_factory.mktemp("scores") / "lifted.txt" - args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t', - 'GRCh38', - '-m', '0.8'] + ['-o', str(out_path.resolve())] - - with patch('sys.argv', args): - combine_scorefiles() - return str(out_path.resolve()) + return str(chain_dir.resolve()) @pytest.fixture(scope="session") def hg38_coords(): - d = {'rsid': ['rs11903757', 'rs6061231'], 'chr_name': ['2', '20'], 'chr_position': [191722478, 62381861]} - df = pd.DataFrame(d) - df['accession'] = 'dummy' - df['genome_build'] = 'GRCh38' - return df + rs11903757 = {"rsid": "rs11903757", "chr_name": "2", "chr_position": 191722478} + rs6061231 = {"rsid": "rs6061231", "chr_name": "20", "chr_position": 62381861} + return [rs11903757, rs6061231] @pytest.fixture(scope="session") -def hg19_coords(hg38_coords): +def hg19_coords(): # hg38_coords in GRCh37, from dbSNP - d = {'lifted_chr': ['2', '20'], 'lifted_pos': [192587204, 60956917], 'liftover': [True, True]} - return pd.DataFrame(d) + rs11903757 = {"rsid": "rs11903757", "chr_name": "2", "chr_position": 192587204} + rs6061231 = {"rsid": "rs6061231", "chr_name": "20", "chr_position": 60956917} + return [rs11903757, rs6061231] -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def small_flipped_scorefile(small_scorefile): # simulate a scorefile on the wrong strand - 
return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele']) - .drop(['effect_allele', 'other_allele']) - .rename({'effect_allele_FLIP': 'effect_allele', 'other_allele_FLIP': 'other_allele'}) - .pipe(complement_valid_alleles, ['effect_allele', 'other_allele'])) + return ( + complement_valid_alleles(small_scorefile, ["effect_allele", "other_allele"]) + .drop(["effect_allele", "other_allele"]) + .rename( + {"effect_allele_FLIP": "effect_allele", "other_allele_FLIP": "other_allele"} + ) + .pipe(complement_valid_alleles, ["effect_allele", "other_allele"]) + ) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def small_target(): - return pl.DataFrame({"#CHROM": [1, 2, 3], - "POS": [1, 2, 3], - "REF": ["A", "T", "T"], - "ALT": ["C", "A", "G"], - "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"], - "is_multiallelic": [False, False, False]}) + return pl.DataFrame( + { + "#CHROM": [1, 2, 3], + "POS": [1, 2, 3], + "REF": ["A", "T", "T"], + "ALT": ["C", "A", "G"], + "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"], + "is_multiallelic": [False, False, False], + } + ) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def small_scorefile(): - df = pl.DataFrame({"accession": ["test", "test", "test"], - "row_nr": [1, 2, 3], - "chr_name": [1, 2, 3], - "chr_position": [1, 2, 3], - "effect_allele": ["A", "A", "G"], - "other_allele": ["C", "T", "T"], - "effect_weight": [1, 2, 3], - "effect_type": ["additive", "additive", "additive"]}) + df = pl.DataFrame( + { + "accession": ["test", "test", "test"], + "row_nr": [1, 2, 3], + "chr_name": [1, 2, 3], + "chr_position": [1, 2, 3], + "effect_allele": ["A", "A", "G"], + "other_allele": ["C", "T", "T"], + "effect_weight": [1, 2, 3], + "effect_type": ["additive", "additive", "additive"], + } + ) return complement_valid_alleles(df, ["effect_allele", "other_allele"]) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def small_scorefile_no_oa(small_scorefile): - return 
small_scorefile.with_column(pl.lit(None).alias('other_allele')) + return small_scorefile.with_column(pl.lit(None).alias("other_allele")) def _get_timeout(url): diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 8e3bea8..fbd3082 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -1,6 +1,7 @@ import argparse import json import logging +import pathlib import sys import textwrap import time @@ -51,7 +52,8 @@ def combine_scorefiles(): for (k, v), sf in zip(line_counts.items(), sfs): log.append(sf.generate_log(v)) - with open(args.logfile, "w") as f: + log_out_path = pathlib.Path(args.outfile).parent / args.logfile + with open(log_out_path, "w") as f: logger.info(f"Writing log to {f.name}") json.dump(log, f, indent=4) diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index 7e35b23..8097b70 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -20,13 +20,15 @@ def liftover( skip_lo = False if skip_lo: + logger.info("Skipping liftover") for variant in variants: yield variant else: + logger.info("Starting liftover") if current_build == GenomeBuild.GRCh37 and target_build == GenomeBuild.GRCh38: lo: pyliftover.LiftOver = Config.lo["hg19hg38"] elif current_build == GenomeBuild.GRCh38 and target_build == GenomeBuild.GRCh37: - lo: pyliftover.LiftOver = Config.lo["hg19hg38"] + lo: pyliftover.LiftOver = Config.lo["hg38hg19"] else: raise Exception("Can't get pyliftover object") @@ -40,12 +42,16 @@ def liftover( if lifted: variant["chr_name"] = lifted[0][0][3:].split("_")[0] variant["chr_position"] = lifted[0][1] + 1 # reverse 0 indexing + variant["lifted"] = True + yield variant n_lifted += 1 - yield variant + else: + variant["lifted"] = False + yield variant n += 1 if (n_lifted / n) < Config.min_lift: - logger.error("Liftover failed") + logger.error("Liftover 
failed for variant {variant}") raise Exception else: logger.info("Liftover successful") diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 3513903..8454fe4 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -117,14 +117,12 @@ def assign_effect_type(variants): for variant in variants: if "is_recessive" not in variant and "is_dominant" not in variant: variant["effect_type"] = "additive" - - if "is_recessive" in variant or "is_dominant" in variant: - logger.info("Recessive or dominant variant detected") - if variant["is_recessive"]: + else: + if variant["is_recessive"] == "TRUE": variant["effect_type"] = "recessive" - elif variant["is_dominant"]: + elif variant["is_dominant"] == "TRUE": variant["effect_type"] = "dominant" - elif variant["is_recessive"] and variant["is_dominant"]: + elif variant["is_recessive"] == "TRUE" and variant["is_dominant"] == "TRUE": logger.critical(f"Bad effect type setting: {variant}") raise Exception diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index c879b85..1b44c6c 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -85,7 +85,7 @@ def generate_log(self, line_count: int): if log["variants_number"] is None: # custom scoring files might not have this information - log["variants_number"] = line_count + log["variants_number"] = line_count + 1 # (0 indexed) # multiple terms may be separated with a pipe if log["trait_mapped"]: diff --git a/tests/data/combine/PGS001229_22.txt b/tests/data/combine/PGS001229_22.txt new file mode 100644 index 0000000..5f791f4 --- /dev/null +++ b/tests/data/combine/PGS001229_22.txt @@ -0,0 +1,850 @@ +###PGS CATALOG SCORING FILE - see https://https://www.pgscatalog.org/downloads/#dl_ftp_scoring_scoring for additional information +#format_version=2.0 +##POLYGENIC SCORE (PGS) INFORMATION +#pgs_id=PGS001229 +#pgs_name=GBE_INI50 
+#trait_reported=Standing height +#trait_mapped=body height +#trait_efo=EFO_0004339 +#weight_type=NR +#genome_build=GRCh37 +#variants_number=51209 +##SOURCE INFORMATION +#pgp_id=PGP000244 +#citation=Tanigawa Y et al. medRxiv (2021). doi:10.1101/2021.09.02.21262942 +rsID chr_name chr_position effect_allele other_allele effect_weight is_haplotype imputation_method locus_name variant_description +rs5746679 22 17080378 G A 1.045457e-02 False +rs2192430 22 17300230 A G 1.411475e-04 False +rs165636 22 17318864 A C 8.166266e-03 False +rs165808 22 17327595 T C 7.791641e-03 False +rs5748743 22 17409813 A G 3.108784e-04 False +rs11703655 22 17450952 G A -3.033983e-02 False +rs2192155 22 17492533 G A 3.889990e-03 False +rs2845402 22 17542810 C T 8.036290e-03 False +rs4819958 22 17565013 G A 2.135621e-02 False +rs879577 22 17589209 T C 3.026491e-03 False +rs5994165 22 17600977 A G 1.581277e-02 False +rs35665085 22 17625915 A G -1.172964e-01 False +rs1034859 22 17630486 A C 1.012909e-02 False +rs738032 22 17633785 C T 2.325500e-03 False +rs5994185 22 17643689 A G 3.361814e-03 False +rs2231495 22 17669306 C T 2.145060e-02 False +rs5747018 22 17677699 T C -7.031384e-04 False +rs17807317 22 17680519 C A 1.079236e-03 False +rs9606655 22 17701234 G A 4.477145e-03 False +rs78718739 22 17703119 A T 7.771872e-04 False +rs73153427 22 17718699 C A -1.320632e-02 False +rs4266110 22 17721595 C T 9.480363e-03 False +rs4819982 22 17727648 T C 7.811685e-03 False +rs5749000 22 17738177 G A -4.719812e-03 False +rs5749002 22 17749096 A G -5.244795e-03 False +rs11704699 22 17770181 G T -3.101703e-02 False +rs5749032 22 17793969 G A 1.774444e-02 False +rs5994272 22 17815696 G C -5.516090e-03 False +rs4820001 22 17827684 G A -5.944752e-03 False +rs2040692 22 17831813 T C 1.061587e-02 False +rs9606701 22 17844929 T G 1.717643e-03 False +rs73391753 22 17850661 T C -2.805489e-02 False +rs4819581 22 17887534 A G 7.723542e-04 False +rs2522310 22 17887725 A G 7.472703e-03 False +rs5747199 22 17958221 C A 
-2.098647e-02 False +rs174346 22 18036253 G A -1.772981e-02 False +rs174351 22 18038786 A G -2.119071e-03 False +rs9605406 22 18262301 A T -5.065485e-03 False +rs389496 22 18289204 A G 5.306345e-03 False +rs399757 22 18295575 C T 2.300129e-02 False +rs1550663 22 18296238 G A -5.665446e-03 False +rs439231 22 18319179 T C 3.440642e-02 False +rs2401424 22 18393534 A C 1.764269e-02 False +rs1076540 22 18439958 T C -2.261707e-03 False +rs4819654 22 18483388 G A 3.318724e-02 False +rs9617650 22 18488883 C G -1.919180e-02 False +rs397709 22 18489048 C A 1.233198e-02 False +rs452579 22 18495470 A G -5.804926e-03 False +rs1992576 22 18537145 G A -4.930116e-03 False +rs464385 22 18571008 A G -8.844726e-05 False +rs3827281 22 18584433 C T -1.169893e-03 False +rs9618216 22 18631365 T C -1.551714e-03 False +rs3180408 22 18650682 T C -1.313784e-02 False +rs2010694 22 18890037 A G 5.968921e-02 False +rs454534 22 18891398 G A 6.891943e-03 False +rs2080346 22 18892575 A G -2.244470e-03 False +rs2016108 22 18915963 A G 3.719756e-03 False +rs2518810 22 18959581 T C 6.464581e-03 False +rs2019061 22 18963340 A G -1.397565e-02 False +rs11089247 22 18970915 T C -1.507131e-03 False +rs2073776 22 19024651 T C -3.505750e-03 False +rs712965 22 19121872 A G 1.644046e-02 False +rs2275901 22 19135603 A G -2.970077e-02 False +rs11089259 22 19190143 T C 3.268027e-03 False +rs361787 22 19263698 T C 2.057255e-02 False +rs8135222 22 19292446 G T 1.153989e-02 False +rs34292276 22 19371052 T C 1.055134e-02 False +rs1128399 22 19420109 C T -8.628228e-03 False +rs5748218 22 19451186 A C 2.141029e-02 False +rs5748260 22 19518079 C T 5.372247e-03 False +rs5993713 22 19581331 T C 1.686942e-02 False +rs9606090 22 19593854 C A 6.544249e-04 False +rs8135254 22 19606703 G A 2.070121e-02 False +rs9617823 22 19649005 A G 2.868601e-03 False +rs6518580 22 19735854 C T 6.262962e-03 False +rs1005133 22 19738355 T C 4.973840e-05 False +rs9680615 22 19770886 A G -1.013929e-02 False +rs2871043 22 19781823 T C 
2.481609e-02 False +rs2073750 22 19873357 T C 1.163020e-02 False +rs5748469 22 19907099 A C -2.676450e-02 False +rs9618723 22 19968597 T C -2.203945e-02 False +rs5748515 22 20046344 G A -9.801428e-03 False +rs59528277 22 20084821 C T -2.232886e-02 False +rs625704 22 20185457 A G 6.892171e-03 False +rs672570 22 20189077 T C 1.738215e-02 False +rs7293032 22 20219648 A G 9.307625e-03 False +rs855050 22 20248391 A G -5.405845e-03 False +rs855061 22 20267213 A G 6.713242e-03 False +rs741413 22 20286099 G T 1.574758e-02 False +rs35012563 22 20749042 G A 6.603339e-03 False +rs361860 22 20754039 A G -1.181141e-02 False +rs1771145 22 20775167 T C 1.160113e-02 False +rs9680797 22 20780296 A G 6.735311e-02 False +rs1005640 22 20789074 C T 2.844307e-02 False +rs12628193 22 20791438 A C 4.734740e-02 False +rs1035239 22 20793914 C T 7.009781e-03 False +rs75179603 22 20839810 T G 3.947346e-03 False +rs738092 22 20860931 T C 5.613511e-04 False +rs10427922 22 20979980 G A 3.231665e-03 False +rs2080195 22 20991771 G A 4.226765e-03 False +rs5751800 22 21075537 C A -2.096453e-03 False +rs361979 22 21154393 G T -4.297086e-03 False +rs756878 22 21323357 C T -6.041745e-03 False +rs178275 22 21331918 G C -2.280912e-03 False +rs105034 22 21334924 C G -2.031369e-02 False +rs28372939 22 21356824 A G 1.476577e-02 False +rs2072550 22 21386019 A G 1.435557e-02 False +rs431319 22 21449028 G A -1.537701e-02 False +rs2845419 22 21463515 A G -1.335614e-02 False +rs2298428 22 21982892 T C -6.373335e-02 False +rs62235077 22 22001704 T G 2.809584e-02 False +rs76940365 22 22062480 T C 5.291130e-02 False +rs10427813 22 22080735 G A -1.394260e-02 False +rs78907487 22 22151939 C A -8.287849e-03 False +rs9607287 22 22163425 G A 5.518983e-02 False +rs412050 22 22307519 C G -3.486191e-03 False +rs79165737 22 22351283 G A -7.483763e-04 False +rs5844480 22 22394291 AG A 4.320583e-03 False +rs2213141 22 22395754 T C 2.587971e-03 False +rs6519111 22 22424302 A C 1.140800e-03 False +rs77010661 22 22473905 C A 
1.226009e-02 False +rs2073447 22 22550450 G C 1.773244e-02 False +rs5757417 22 22561610 C T -6.207024e-03 False +rs6001482 22 22581369 G A -6.272413e-03 False +rs5757569 22 22584678 A G -2.176470e-03 False +rs736898 22 22711786 T C 7.779875e-03 False +rs738881 22 22726372 T C 3.496320e-03 False +rs2051490 22 22762771 C T 1.252501e-02 False +rs433766 22 22769923 G A -1.103632e-02 False +rs361959 22 22869742 A C -2.412657e-03 False +rs362168 22 22871922 A G -2.769974e-03 False +rs4462880 22 22929268 T C -7.035723e-03 False +rs456455 22 23001481 A G 7.524178e-03 False +rs11703025 22 23022520 T C 2.175257e-03 False +rs10854762 22 23064982 A C -1.255076e-02 False +rs2856876 22 23249440 A C 2.085816e-02 False +rs58555503 22 23268677 A G 1.337349e-02 False +rs17514179 22 23279456 C G -1.371401e-02 False +rs468884 22 23282286 C T 4.994329e-03 False +rs9623992 22 23325722 C T 8.506657e-04 False +rs3788338 22 23412058 A G -9.545553e-03 False +rs140504 22 23627369 G A -1.900175e-02 False +rs12168342 22 23644425 G A -9.106953e-04 False +rs131693 22 23649242 G T 1.061643e-03 False +rs3827368 22 23794844 G A -1.198736e-02 False +rs11090252 22 23804670 G T -1.119846e-03 False +rs2330498 22 23819697 T G -1.028722e-02 False +rs5759884 22 23873076 T C 9.509027e-03 False +rs179303 22 23892145 T C 1.351280e-02 False +rs131429 22 23925779 C T -4.127647e-03 False +rs6003815 22 23960187 T C -8.475905e-03 False +rs2070446 22 24035970 T C -1.334318e-03 False +rs5759985 22 24086107 G A -1.652957e-02 False +rs73396542 22 24105789 A G 1.813091e-02 False +rs2298375 22 24106448 A G 1.834095e-03 False +rs6003915 22 24186809 C T -1.426541e-02 False +rs4822446 22 24235360 G A 3.168635e-04 False +rs4822455 22 24255296 T C 1.624252e-02 False +rs144128236 22 24300540 T C -3.225760e-03 False +rs144686326 22 24376584 A G -6.223068e-03 False +rs422674 22 24406778 A C 3.046540e-03 False +rs5996675 22 24618331 G A -6.506681e-04 False +rs5751862 22 24802564 A G -6.695797e-03 False +rs6004171 22 24912232 T 
C -1.536303e-02 False +rs762283 22 24943582 A G -1.687764e-03 False +rs2006092 22 24995668 G A -3.537331e-02 False +rs5760609 22 25123505 C T -1.600990e-02 False +rs5760620 22 25145094 T C -5.584047e-03 False +rs1892723 22 25145453 T C -1.388536e-03 False +rs5760661 22 25185823 A G -9.228375e-03 False +rs11703103 22 25265972 A G 1.088906e-02 False +rs139766 22 25309448 A G -2.238693e-03 False +rs5752027 22 25363411 A G 4.035775e-03 False +rs34259162 22 25410895 G A 9.720734e-04 False +rs16979472 22 25442369 C T 1.660527e-02 False +rs9612844 22 25454658 C A 1.200285e-02 False +rs6004418 22 25465065 C T 1.320801e-02 False +rs4627697 22 25524916 C T 1.147501e-02 False +rs13055430 22 25603008 T C -1.262741e-02 False +rs7286982 22 25619025 G T -1.212511e-02 False +rs5752084 22 25621591 T C 1.051851e-02 False +rs11703955 22 25643483 T G 1.373474e-02 False +rs9612921 22 25661725 A G -5.936431e-03 False +rs6004519 22 25667883 G A 1.547775e-02 False +rs5996879 22 25668730 A C 2.616493e-02 False +rs67839603 22 25678577 T C 3.040180e-02 False +rs79854676 22 25761309 T C -1.760112e-03 False +rs713847 22 25761936 T C -5.171998e-03 False +rs571663 22 25938977 T C 1.966116e-02 False +rs1008673 22 25994013 A G 6.268228e-04 False +rs718163 22 26081873 T C 5.232603e-02 False +rs10212011 22 26132612 A G -6.457239e-03 False +rs133847 22 26133775 T C -1.181527e-03 False +rs133885 22 26159289 A G -8.399401e-03 False +rs3859870 22 26181767 C T 1.044769e-02 False +rs5761201 22 26190915 G A 4.287533e-03 False +rs5761256 22 26218164 G A -2.803502e-03 False +rs17704912 22 26231312 C G 6.105629e-03 False +rs2269632 22 26237826 C T 4.981479e-03 False +rs5761268 22 26239850 A C 4.144037e-03 False +rs4822668 22 26273893 C G 5.616213e-03 False +rs695809 22 26278128 G T -3.965338e-03 False +rs2157538 22 26280462 T C -8.324497e-04 False +rs6004814 22 26290588 T C -1.307320e-02 False +rs973523 22 26292659 G A 4.294309e-05 False +rs2072006 22 26343593 G A 7.813758e-03 False +rs9306419 22 26369358 T C 
-4.836650e-03 False +rs2331198 22 26390964 A G -7.849451e-03 False +rs5752254 22 26415475 T C -1.219281e-03 False +rs5752262 22 26456367 G A -1.285326e-02 False +rs56116806 22 26460519 T C -8.695338e-03 False +rs78711257 22 26528054 A G 1.973023e-02 False +rs5752282 22 26617260 T A -1.384025e-02 False +rs4438594 22 26638906 G T 1.229772e-02 False +rs5761484 22 26735648 A G 7.879673e-04 False +rs5752316 22 26782251 G A 5.096459e-04 False +rs7289238 22 26812632 C T -1.850814e-02 False +rs732933 22 26939781 C T -9.222796e-04 False +rs2267091 22 26960648 A C -5.679255e-03 False +rs5752371 22 27038865 T G -1.487706e-04 False +rs5752372 22 27042828 A G 2.957737e-02 False +rs1476035 22 27161060 A G 2.844558e-03 False +rs56278657 22 27191643 T C 8.953731e-03 False +rs739226 22 27216426 G A 9.120990e-03 False +rs4822804 22 27217018 A G 1.510616e-02 False +rs136511 22 27240025 T G -2.971740e-02 False +rs136516 22 27242642 G A -9.822927e-03 False +rs136535 22 27246070 C T -1.554199e-03 False +rs9306427 22 27252454 C T -6.560251e-03 False +rs5761797 22 27264880 G T -1.323094e-02 False +rs4822824 22 27337886 A G -9.600014e-03 False +rs739257 22 27339284 T C -9.944488e-03 False +rs5761864 22 27353810 T C -2.171555e-03 False +rs5761885 22 27370273 T C -9.798478e-03 False +rs7288253 22 27378884 A G 5.145072e-02 False +rs7287426 22 27398749 C T 1.012263e-03 False +rs9613339 22 27403571 C T -1.745865e-02 False +rs2516086 22 27405012 T C -5.425419e-03 False +rs17343637 22 27415255 C T -1.499362e-02 False +rs60259956 22 27426628 G C 2.289460e-02 False +rs9620654 22 27430724 A G -7.068064e-03 False +rs760526 22 27435577 C T -8.632412e-03 False +rs4822847 22 27487580 G A 3.691502e-03 False +rs5761976 22 27498426 A G -6.801544e-03 False +rs11704703 22 27526095 G A -8.086267e-04 False +rs9625170 22 27563274 C A 1.369650e-02 False +rs9306437 22 27584680 A G -2.139188e-03 False +rs546339 22 27628151 C G 2.130389e-02 False +rs134786 22 27652290 T G 4.815735e-03 False +rs760593 22 27660675 A 
G 4.899654e-03 False +rs134810 22 27674832 G T 1.248065e-04 False +rs736950 22 27718775 A G 2.292384e-02 False +rs568561 22 27729742 G A 4.951261e-03 False +rs6519705 22 27762155 C T 4.856660e-03 False +rs6005412 22 27781736 A C -8.336242e-03 False +rs5752545 22 27829565 G A 2.854090e-03 False +rs16984654 22 27832985 G C -1.668955e-02 False +rs4822878 22 27836311 G A -7.756250e-03 False +rs7288006 22 27839704 T C -2.492106e-02 False +rs5762173 22 27864471 A C 2.189950e-03 False +rs10439912 22 27873024 G A 2.721729e-03 False +rs5762194 22 27883265 G A 2.961735e-02 False +rs6005471 22 27890684 A G -8.057355e-03 False +rs761596 22 27927298 T C 2.054268e-02 False +rs5997265 22 27934290 G A 4.751755e-03 False +rs5762249 22 27951176 A G -4.329547e-04 False +rs762064 22 27974819 C A 1.439093e-02 False +rs4822917 22 27975451 G A -3.648208e-02 False +rs6005524 22 28007741 C T -1.635917e-02 False +rs1885362 22 28016883 C A 8.564085e-03 False +rs8135014 22 28046561 T C 1.535905e-02 False +rs9608638 22 28060034 A G 3.097228e-02 False +rs134110 22 28076058 C T 2.848654e-02 False +rs1885364 22 28094845 G A -2.659077e-02 False +rs7291248 22 28130130 C T -1.640387e-02 False +rs2079095 22 28136977 A C -3.962775e-03 False +rs4822935 22 28150109 G A 6.071392e-04 False +rs2283844 22 28150815 A G 1.604724e-02 False +rs2267106 22 28151825 A G -5.390282e-03 False +rs2267113 22 28155404 T C 5.030388e-03 False +rs4822939 22 28172577 G T 5.704168e-03 False +rs12166473 22 28185452 G T -6.896853e-03 False +rs5752639 22 28200176 G A -6.474674e-03 False +rs11705555 22 28206912 C A -6.175542e-03 False +rs5997320 22 28270372 G T -6.768204e-04 False +rs742547 22 28412908 G T 1.763639e-02 False +rs77885044 22 28501414 T C -2.304747e-01 False +rs1884816 22 29106733 C T -1.074749e-02 False +rs132549 22 29318724 T C 1.743333e-03 False +rs17518058 22 29378610 C T 6.690876e-04 False +rs134620 22 29478760 C T -3.029428e-02 False +rs34920087 22 29533572 G C -1.269604e-02 False +rs111625211 22 29626515 A G 
-1.171130e-02 False +rs3950176 22 29630337 A G 2.658049e-02 False +rs4820803 22 29669648 C G -8.550535e-03 False +rs131190 22 29692497 T G 1.234896e-03 False +rs3804076 22 29837537 C T 1.321112e-02 False +rs467768 22 29961986 T G 1.878853e-03 False +rs140130 22 30151687 C T 3.418302e-03 False +rs76013375 22 30163526 G A 1.576261e-02 False +rs2412971 22 30494371 A G 7.959801e-03 False +rs713875 22 30592487 G C -1.047403e-01 False +rs76168543 22 30621613 A C -1.382104e-02 False +rs55816744 22 30658082 C T -3.794014e-02 False +rs4823086 22 30688659 T C 2.257140e-02 False +rs740223 22 30762140 A G 2.079806e-02 False +rs757660 22 30793137 A G -4.609306e-03 False +rs9608956 22 30901592 C T -8.334040e-03 False +rs5749118 22 30927975 T C 3.226189e-03 False +rs2267161 22 30953295 T C -7.685790e-03 False +rs4820875 22 30992651 G A -2.565800e-02 False +rs1131603 22 31018975 C T 4.241226e-02 False +rs5997714 22 31032920 G A -2.311985e-02 False +rs34597012 22 31063804 G GT -2.081808e-04 False +rs136382 22 31114086 G T 2.825476e-02 False +rs5753303 22 31139653 A G 2.640129e-06 False +rs136230 22 31214382 G A 1.137657e-02 False +rs57527354 22 31216506 C T 5.531311e-03 False +rs67441859 22 31272930 T C -1.056118e-03 False +rs3747151 22 31333631 C T -1.235089e-02 False +rs4820921 22 31378447 A G 1.020507e-02 False +rs715297 22 31442308 A G -2.479126e-03 False +rs11089487 22 31477361 C G -1.263667e-02 False +rs5753465 22 31514348 G A 5.803240e-03 False +rs2240432 22 31521404 A G 1.097391e-02 False +rs5749244 22 31659495 C T 2.663412e-02 False +rs7289941 22 31884405 C T -3.950834e-04 False +rs41311139 22 32200849 T C 1.585735e-02 False +rs7290696 22 32341684 T C -2.960328e-02 False +rs8139657 22 32559835 G A -2.170436e-02 False +rs7291990 22 32569263 C T -1.296006e-03 False +rs5998321 22 32624139 C T 5.619574e-03 False +rs5753956 22 32702816 A G -1.534023e-02 False +rs201161881 22 32756652 G A 2.512177e-02 False +rs2076050 22 32831540 T C 1.868495e-03 False +rs2076054 22 32832874 T C 
6.028815e-05 False +rs9609559 22 32853660 G A 1.382210e-02 False +rs62241183 22 32854391 C A 1.960825e-04 False +rs11107 22 32875190 A G -6.426637e-03 False +rs11341975 22 32934713 C CT -9.057754e-03 False +rs2157189 22 32952012 A C -3.802480e-03 False +rs2710386 22 32954443 G A 2.210369e-03 False +rs62232741 22 32993032 C T -2.429979e-03 False +rs966964 22 32997766 T C -8.424246e-03 False +rs62234573 22 33045573 T C -3.107145e-02 False +rs762899 22 33046110 G C -6.954732e-02 False +rs80186738 22 33048039 T C 1.138346e-02 False +rs4821083 22 33056341 C T -6.477198e-02 False +rs997120 22 33108536 T C -3.426392e-02 False +rs7286819 22 33108981 T C -7.404035e-02 False +rs743743 22 33116435 T C 6.542471e-02 False +rs2157133 22 33143528 G A 2.195059e-02 False +rs58039541 22 33146363 A G 8.105390e-04 False +rs5749529 22 33259625 C T 2.309793e-02 False +rs137560 22 33336039 T G -2.554387e-02 False +rs9609680 22 33408519 T C -7.556300e-03 False +rs4821137 22 33660345 C G 2.190743e-03 False +rs117531661 22 33804893 C T 6.680774e-03 False +rs5754555 22 33844303 C T 8.923314e-03 False +rs9609802 22 33846914 T C 6.295378e-03 False +rs62225321 22 33898906 A C 1.958759e-05 False +rs86487 22 34022284 A G -2.579330e-03 False +rs239333 22 34137784 G A 4.460828e-03 False +rs5999111 22 34208570 T C -3.365869e-03 False +rs9941961 22 34217757 T C 9.289431e-03 False +rs10854640 22 34256923 A C 1.439384e-02 False +rs79442817 22 34265402 G A -1.636610e-02 False +rs5754747 22 34284173 G A -2.315559e-02 False +rs2157153 22 34296093 C A -4.688326e-03 False +rs12169215 22 34378012 A G 2.276664e-03 False +rs242898 22 34436795 C T 1.337033e-04 False +rs2413215 22 34488452 A G -4.288310e-04 False +rs135198 22 34501541 A G 2.763614e-03 False +rs243001 22 34514810 C A 3.976601e-03 False +rs130668 22 34526428 C T 1.088864e-02 False +rs5999246 22 34583078 A G 1.802495e-03 False +rs753600 22 34620754 T C 1.466546e-02 False +rs2097307 22 34691035 A G -2.082615e-04 False +rs411451 22 34758540 T C 
5.165532e-03 False +rs2609850 22 34851377 A C 1.371180e-02 False +rs737821 22 35371707 T C -4.985554e-04 False +rs35433006 22 35382268 A C -4.931336e-03 False +rs7292124 22 35419122 C T -1.077953e-02 False +rs8140287 22 35478529 A G 1.760523e-04 False +rs61735502 22 35481493 T C 1.056439e-02 False +rs80730 22 35526281 G A -2.766891e-03 False +rs61134707 22 35603836 A G -1.783939e-04 False +rs1053593 22 35660875 T G 3.988231e-02 False +rs6518950 22 35745196 G T 1.750545e-04 False +rs17793276 22 35750980 A G -7.651136e-03 False +rs2071749 22 35783413 G A 1.649791e-03 False +rs35806646 22 35918270 C T 6.918713e-03 False +rs5750115 22 35959242 A G 1.697538e-02 False +rs738368 22 35962060 G A 5.181476e-03 False +rs926338 22 35964158 G C 2.769931e-03 False +rs5995124 22 35984385 A G -1.280623e-02 False +rs4820205 22 36001258 C T 1.342405e-02 False +rs4327313 22 36072262 T C 4.895490e-03 False +rs6000004 22 36180535 G A -3.250252e-02 False +rs78188544 22 36517307 C T 1.366076e-02 False +rs6000142 22 36519596 A C -3.499560e-03 False +rs9610403 22 36532058 A G -1.214487e-02 False +rs3788518 22 36543489 C G 7.838149e-03 False +rs6000190 22 36600841 G A 2.644389e-02 False +rs2010659 22 36629633 C A -6.871468e-03 False +rs136145 22 36635967 G A -2.634742e-02 False +rs78188930 22 36655735 A G -5.385142e-03 False +rs136176 22 36661646 A G -1.560741e-02 False +rs2269529 22 36684354 C T -5.170111e-03 False +rs75138027 22 36705622 A G 1.713234e-02 False +rs3842715 22 36708049 C CTCCTGTGA -5.187051e-02 False +rs11089788 22 36751101 A C -2.440650e-02 False +rs16996704 22 36764788 G A 2.784116e-02 False +rs5756223 22 36897427 C T 2.603792e-02 False +rs760718 22 36900806 G A 7.366207e-03 False +rs6000293 22 36923144 T C -1.875563e-03 False +rs5995298 22 36924714 G A -3.632594e-03 False +rs140020 22 36946643 T G 1.333137e-02 False +rs4821501 22 36954939 T C 1.105894e-02 False +rs5756255 22 36998907 T C -6.084687e-04 False +rs9622429 22 37001495 G T -1.224147e-02 False +rs2267348 22 
37013167 G C 1.866849e-02 False +rs6000386 22 37077364 C T 7.294257e-03 False +rs738514 22 37080738 C G -4.873355e-03 False +rs2746971 22 37101890 C T 3.991764e-02 False +rs933229 22 37118535 A G -1.713909e-03 False +rs62230508 22 37184521 G A 6.515894e-03 False +rs4820254 22 37206341 G T 2.566936e-04 False +rs11089806 22 37256262 A G 1.152626e-03 False +rs4821544 22 37258503 C T -9.761102e-03 False +rs909486 22 37323988 T C -7.318200e-03 False +rs1534882 22 37329545 G A 5.775806e-03 False +rs131843 22 37337409 T C -2.534399e-02 False +rs2093380 22 37343000 A C -4.011777e-04 False +rs743749 22 37398195 T C -1.001198e-02 False +rs2413447 22 37401532 A G -3.244795e-03 False +rs11554714 22 37407109 C G 4.335972e-02 False +rs2543523 22 37477732 T C 3.669548e-04 False +rs1861947 22 37507019 A G -9.259451e-04 False +rs28450477 22 37513316 A G 1.153887e-03 False +rs3218297 22 37532441 A G 1.802306e-02 False +rs2543529 22 37571497 G A -5.785311e-03 False +rs7290488 22 37581383 T C 3.172492e-02 False +rs9798725 22 37621269 C A 4.460405e-03 False +rs10212068 22 37644621 T C -8.386907e-03 False +rs730422 22 37671896 A G 2.303688e-02 False +rs1041895 22 37679763 G A -2.658396e-03 False +rs1008184 22 37720268 G A 2.120184e-02 False +rs2069221 22 37753256 C T 8.984539e-03 False +rs4821645 22 37757099 G A -1.560347e-02 False +rs9610727 22 37780522 C G -1.496708e-02 False +rs9607459 22 37800175 T C -5.510833e-03 False +rs6000739 22 37846448 G A 1.152963e-02 False +rs742152 22 37896749 C T 5.447068e-03 False +rs6000756 22 37908435 C T 1.909131e-03 False +rs12167061 22 37977481 T C 1.465308e-02 False +rs75937893 22 37992699 G A 8.339179e-04 False +rs36120988 22 38032762 G GA 1.693041e-02 False +rs9622677 22 38054262 C A 4.354146e-02 False +rs4820295 22 38083101 C T -2.092117e-02 False +rs12628603 22 38119213 A G 3.948165e-02 False +rs5756795 22 38122122 C T 4.377277e-02 False +rs79849571 22 38204089 T C 2.977743e-02 False +rs117267625 22 38435786 T G -7.684278e-03 False +rs2284063 
22 38544298 G A 5.090446e-02 False +rs4608623 22 38597378 T G -1.997927e-02 False +rs4444637 22 38606780 G A -9.182016e-03 False +rs5995554 22 38630272 C T 7.393137e-03 False +rs135720 22 38663819 G A -6.392021e-03 False +rs135730 22 38673234 A G -1.106705e-02 False +rs35336050 22 38685131 C T -4.493352e-03 False +rs5750581 22 38695406 T C -1.155972e-02 False +rs56182369 22 38708506 A G 1.701713e-02 False +rs5757057 22 38744184 C T -2.112956e-02 False +rs743942 22 38819613 A G -5.625806e-03 False +rs12004 22 38877461 G T 1.108728e-03 False +rs5750616 22 38918894 G T -8.094286e-03 False +rs112010490 22 38928269 G T -2.114917e-02 False +rs35069730 22 39027286 C CAG 3.840735e-03 False +rs3747172 22 39067524 G A 1.200232e-02 False +rs5757275 22 39159201 C T 3.096214e-03 False +rs760482 22 39178701 G A 2.148449e-03 False +rs735306 22 39260032 T C 3.574634e-02 False +rs760481 22 39268785 T G 9.377414e-03 False +rs5750691 22 39281774 G T 3.816951e-02 False +rs5757355 22 39300265 C T 3.540156e-02 False +rs1014971 22 39332623 T C -4.449842e-03 False +rs5757424 22 39415780 G A 1.479946e-02 False +rs35860424 22 39448465 A G 3.065974e-03 False +rs2011869 22 39480697 G A -4.005617e-02 False +rs139272 22 39487665 G A -1.218988e-04 False +rs55989856 22 39493294 C T -3.115929e-02 False +rs738469 22 39510995 G A -2.069106e-02 False +rs877529 22 39542292 A G 9.653575e-03 False +rs73884827 22 39543000 T C -4.069841e-03 False +rs7287160 22 39573724 A C 2.683694e-02 False +rs5750761 22 39575692 A C 1.451305e-02 False +rs738470 22 39581277 A C 1.766406e-02 False +rs13053714 22 39626572 A G -2.901981e-02 False +rs5757580 22 39658626 C T 4.177065e-03 False +rs1569497 22 39665395 G A 1.264611e-02 False +rs54211 22 39687484 G A 5.418141e-03 False +rs6519183 22 39708279 A G -4.281532e-02 False +rs5757611 22 39708357 T C 8.605574e-03 False +rs5750811 22 39793066 G T 3.658209e-02 False +rs34026806 22 39798127 G A 2.302129e-03 False +rs5757678 22 39843409 T C 1.065699e-02 False +rs6001601 22 
39865475 G A 1.588501e-03 False +rs5757703 22 39932516 A G -1.179841e-02 False +rs62228477 22 39963426 G A -1.503908e-02 False +rs11704409 22 40023636 C T 6.443146e-03 False +rs136829 22 40046176 C T -7.416552e-04 False +rs5757764 22 40067818 T C 4.559360e-03 False +rs5757777 22 40092864 G A 2.400297e-02 False +rs5757783 22 40127293 T C -8.870038e-04 False +rs7285609 22 40358148 T C -1.079902e-02 False +rs8139715 22 40420786 G C -8.092115e-03 False +rs7291691 22 40454069 G T 7.898880e-03 False +rs732384 22 40541981 G A 1.742640e-02 False +rs12484776 22 40652873 G A 5.853057e-03 False +rs28360630 22 40676672 G T -1.894274e-03 False +rs470113 22 40729614 G A 1.959940e-02 False +rs5757949 22 40820151 C T -1.628066e-02 False +rs35898643 22 40986372 G C -1.983507e-02 False +rs12165625 22 41494925 A G -2.918069e-02 False +rs11703267 22 41646738 G A 3.521847e-04 False +rs8139705 22 41680898 T C 1.402732e-02 False +rs34011394 22 41704872 T C 6.681484e-05 False +rs2073167 22 41791536 C T -5.572333e-05 False +rs2076196 22 41895409 A G -4.407217e-02 False +rs2076198 22 41929175 G T -3.186844e-02 False +rs739134 22 42089623 C T 5.322340e-03 False +rs147348682 22 42095658 G T 3.846131e-02 False +rs139568 22 42210985 C T -3.139710e-03 False +rs13055841 22 42279653 G A -6.596336e-03 False +rs7293091 22 42341308 G A -6.862491e-04 False +rs35742686 22 42524243 C CT -1.181191e-02 False +rs762995 22 42672124 G A -5.278171e-03 False +rs1548304 22 42691238 T C -1.642396e-02 False +rs8139063 22 42813753 C T -3.867750e-03 False +rs5758742 22 42867898 G A -1.352327e-03 False +rs11553441 22 42912097 T C -7.295657e-04 False +rs4822160 22 42932317 A G -5.768556e-02 False +rs28627172 22 43010817 A G 1.722077e-02 False +rs130370 22 43080028 T C -5.527551e-04 False +rs6002910 22 43096507 T C -5.556102e-03 False +rs738526 22 43112475 T C -1.350273e-02 False +rs8138149 22 43114824 G A -1.963192e-02 False +rs5758896 22 43115576 C T -1.880097e-02 False +rs9623692 22 43154299 G A -1.621113e-03 False 
+rs9611885 22 43159948 T C -7.980584e-03 False +rs1018448 22 43206950 C A -5.783037e-03 False +rs9607957 22 43218397 C T -3.976636e-03 False +rs2267463 22 43283255 C A -1.426668e-02 False +rs4822220 22 43290583 C T -3.955775e-02 False +rs8140884 22 43333156 A G -3.127845e-02 False +rs6003002 22 43426262 G A -3.668040e-03 False +rs8141749 22 43483242 T C -2.540203e-02 False +rs4988388 22 43515108 C T -1.570749e-02 False +rs13815 22 43529314 C G 1.738127e-02 False +rs5759199 22 43551513 G A 2.565386e-02 False +rs6972 22 43558972 A G -1.962819e-02 False +rs4822262 22 43577214 T C -2.270478e-02 False +rs13058467 22 43579049 C T -1.193909e-03 False +rs138993 22 43610207 G A -7.621661e-03 False +rs129415 22 43623395 G C -4.852519e-02 False +rs11703272 22 43640512 C T -5.533207e-03 False +rs139027 22 43649701 C T 7.724845e-02 False +rs5751462 22 43661080 T C -4.251741e-02 False +rs739306 22 43683088 A G -3.582388e-03 False +rs4820518 22 43707996 A G -2.547044e-02 False +rs6519367 22 43711080 C G -5.784446e-03 False +rs6003156 22 43721519 C A 3.658850e-04 False +rs1894717 22 43729401 C T 8.557013e-03 False +rs4820525 22 43763757 T G -1.789810e-02 False +rs28673361 22 43836198 G T 2.427697e-03 False +rs9614382 22 43976396 A G -1.277457e-02 False +rs137731 22 44031042 C T 3.593107e-03 False +rs9614187 22 44193626 C A -6.865434e-03 False +rs138057 22 44221247 G A 1.833991e-02 False +rs4823156 22 44296372 T C 6.169212e-03 False +rs6006453 22 44298838 A G 7.441756e-03 False +rs2294918 22 44342116 G A 2.810328e-02 False +rs3761472 22 44368122 G A 1.299680e-02 False +rs8418 22 44379838 G A 1.648422e-03 False +rs6006598 22 44380033 C T -2.136788e-03 False +rs1007863 22 44395451 C T -6.698507e-03 False +rs7285340 22 44419871 C T 1.816130e-02 False +rs6006622 22 44424108 T C 1.036733e-02 False +rs130313 22 44467899 C T -2.592364e-03 False +rs9614325 22 44498134 T C 7.281423e-03 False +rs1535009 22 44522312 C T -2.636447e-04 False +rs4823194 22 44526130 G A -3.882980e-03 False 
+rs2267613 22 44530286 A G 2.528159e-02 False +rs2267614 22 44530420 C T -1.233654e-02 False +rs10483222 22 44548944 G A -3.947209e-03 False +rs77120395 22 44551755 G A 1.262458e-02 False +rs9614359 22 44566434 A G -4.290306e-03 False +rs139131 22 44581046 T C -1.479950e-02 False +rs9626137 22 44643161 C T 1.439493e-02 False +rs135400 22 44677081 C T -1.030513e-02 False +rs135388 22 44681612 G A -1.269762e-03 False +rs3935378 22 44695088 T C 6.324859e-03 False +rs6519840 22 44707716 G T 2.288939e-03 False +rs62228577 22 44725343 G A 3.534678e-03 False +rs6519897 22 44738406 G A 2.320049e-02 False +rs7289501 22 44746729 A G -1.754216e-02 False +rs5764718 22 44751158 G A -6.539695e-03 False +rs9614538 22 44757439 A G 2.480295e-02 False +rs9614823 22 44759519 G A 2.111274e-03 False +rs5765809 22 44761797 A T -5.311720e-03 False +rs5764921 22 44763352 C G 1.452737e-02 False +rs19985 22 44783779 G A 9.142699e-03 False +rs2071820 22 44791807 C T -2.371876e-02 False +rs2746583 22 44818986 C T -6.740622e-03 False +rs5765690 22 44894913 G A -5.179871e-05 False +rs4508 22 45058431 C T 1.098259e-02 False +rs6006845 22 45066035 A G -1.484374e-02 False +rs9614870 22 45069410 T C 1.530441e-02 False +rs41515447 22 45081330 G A 1.350120e-03 False +rs28460735 22 45082168 C A 3.663354e-03 False +rs4823364 22 45090008 G A 2.811861e-03 False +rs6006857 22 45116664 C T 1.247728e-02 False +rs2269543 22 45244930 T C -1.450041e-02 False +rs8881 22 45258457 G A -3.500519e-03 False +rs9614987 22 45323989 T C 1.111338e-03 False +rs140556 22 45415987 A G -1.398184e-02 False +rs132067 22 45451355 G A -5.566982e-03 False +rs5765155 22 45471607 C T 1.148978e-02 False +rs5765167 22 45497738 C T -5.029327e-03 False +rs7292035 22 45502829 C T -3.893521e-02 False +rs2018928 22 45519040 T G 2.377071e-03 False +rs6006941 22 45523391 A G 1.318997e-02 False +rs17548742 22 45573450 C A 4.385600e-03 False +rs1125398 22 45589490 G A -8.350439e-03 False +rs58667 22 45668012 T C 1.286879e-02 False +rs5765242 
22 45671343 G A -2.940682e-06 False +rs2742648 22 45672574 T C 5.743608e-03 False +rs5765250 22 45693923 A G -2.675069e-03 False +rs7290139 22 45718743 G A -2.092804e-02 False +rs11556482 22 45723807 C G 1.670159e-03 False +rs6007594 22 45728370 A G 1.879231e-04 False +rs56343022 22 45741537 G T 1.420045e-02 False +rs5764698 22 45749983 T G -4.591012e-02 False +rs2272804 22 45809624 A C 2.185772e-03 False +rs2142662 22 45821935 A G 2.250782e-02 False +rs6007041 22 45837410 G A -2.756449e-03 False +rs11090631 22 45846371 T C 7.910102e-02 False +rs713975 22 45864934 T C 8.535181e-03 False +rs10483228 22 45871507 G C -7.764056e-03 False +rs5765426 22 45892656 G T -3.885653e-03 False +rs3810631 22 45897997 C T 3.935204e-04 False +rs105199 22 45929577 C T -2.532217e-02 False +rs136755 22 45936350 A G -8.001698e-03 False +rs5765463 22 45942726 T G -1.415551e-02 False +rs13268 22 45996298 G A 5.643525e-02 False +rs17564843 22 46009063 G A 6.464843e-03 False +rs5765546 22 46022070 G A 2.246740e-02 False +rs2239398 22 46155548 G C -3.247470e-02 False +rs136018 22 46207955 C T -1.354554e-03 False +rs136029 22 46236425 A G 8.398423e-02 False +rs57514815 22 46275529 T C 2.264300e-03 False +rs75427302 22 46287720 A G -2.237482e-02 False +rs28473346 22 46289699 T C 1.872124e-02 False +rs9697736 22 46303347 T C -1.283734e-02 False +rs28663466 22 46316057 A G 2.312579e-02 False +rs9286453 22 46337043 G C 1.701173e-02 False +rs75862558 22 46347519 C T 1.574289e-02 False +rs9330813 22 46364161 A G -4.466341e-02 False +rs62228062 22 46381234 G A 4.730559e-02 False +rs28628653 22 46396925 G A 1.783944e-03 False +rs28698504 22 46403715 A G -2.132589e-02 False +rs78358349 22 46406782 A C 8.439466e-02 False +rs9627368 22 46445002 G C -7.613496e-02 False +rs7292297 22 46458123 G T 3.328073e-02 False +rs9626891 22 46482948 C T 4.241879e-02 False +rs12160757 22 46486508 C T -9.684390e-03 False +rs3747243 22 46493852 T C -6.758580e-03 False +rs9616125 22 46499120 C G -9.873118e-03 False 
+rs12170325 22 46502870 T C -1.792140e-02 False +rs76755807 22 46561713 G A 2.604703e-02 False +rs4253701 22 46586110 A G -1.256735e-03 False +rs59842914 22 46592168 C T 1.417055e-02 False +rs1800206 22 46614274 G C -5.854014e-02 False +rs4253772 22 46627603 T C 8.004024e-02 False +rs35364389 22 46760086 T C 3.229515e-03 False +rs34267201 22 46782382 T C -2.470821e-02 False +rs9627450 22 46807234 C T 2.324176e-03 False +rs9306514 22 46837114 G A 9.440730e-04 False +rs5768830 22 46888399 T C 9.911095e-03 False +rs9615374 22 46907779 G A 6.531440e-03 False +rs4823838 22 46909355 T G -4.780494e-03 False +rs12484501 22 46914277 A C 9.689535e-03 False +rs3810636 22 46943687 G A -1.303660e-02 False +rs9627514 22 46985917 A G 1.893397e-02 False +rs9615396 22 47021226 G A -1.322949e-02 False +rs13057352 22 47095235 A C -1.156013e-01 False +rs13054785 22 47109621 C T 4.322858e-04 False +rs34301321 22 47125474 G A -1.746025e-02 False +rs17221476 22 47147117 T C -2.418349e-02 False +rs5769136 22 47156703 C T 2.628970e-02 False +rs6008990 22 47245836 A G 1.880575e-03 False +rs140535 22 47271747 C T 1.055264e-03 False +rs5767397 22 47301822 C T 3.032158e-03 False +rs9616173 22 47345487 T C -2.945945e-03 False +rs470059 22 47372368 T C 2.067644e-02 False +rs136120 22 47380606 C T 4.041426e-02 False +rs5769300 22 47437808 C T 1.683027e-03 False +rs131924 22 47450911 A G 1.624479e-02 False +rs910541 22 47511864 A C -4.226735e-03 False +rs2295246 22 47519476 T C -3.954111e-03 False +rs13055207 22 47529458 A G -3.602848e-04 False +rs4823597 22 47531320 T C -6.899703e-03 False +rs738669 22 47548321 T C 4.925401e-03 False +rs2337244 22 47568291 C T 7.726693e-03 False +rs15646 22 47571203 A G -9.744751e-03 False +rs135368 22 47574009 C T -5.327010e-03 False +rs136618 22 47642100 T C 6.976251e-03 False +rs136636 22 47657635 T C 1.798943e-03 False +rs6008118 22 47683805 C T -3.475544e-02 False +rs36008375 22 47720973 T C -7.868172e-03 False +rs17763944 22 47821952 G A -8.854280e-04 False 
+rs2301382 22 47893053 A G -2.449056e-02 False +rs5767784 22 47935365 C T -1.599879e-03 False +rs2285093 22 47961708 G T -3.593525e-03 False +rs131114 22 47986332 T C -3.976592e-03 False +rs9615626 22 48154645 C T 7.608639e-03 False +rs5845816 22 48165452 C CT 2.039503e-03 False +rs16994709 22 48207318 T C -9.725168e-03 False +rs4823698 22 48213904 G C -1.220367e-02 False +rs9615649 22 48215904 A G -2.488244e-05 False +rs738739 22 48220460 T C -2.702163e-03 False +rs738743 22 48230941 C A -1.129522e-03 False +rs4823717 22 48271961 A G -5.053446e-03 False +rs2338258 22 48284025 T C -3.344182e-03 False +rs5768135 22 48297953 C T -1.046958e-02 False +rs1028528 22 48362290 G A -2.367254e-02 False +rs28537386 22 48362914 C A -3.167719e-03 False +rs5768244 22 48387670 A G -8.243989e-03 False +rs7289071 22 48415446 C T 2.130715e-03 False +rs135271 22 48460730 T C 2.682476e-03 False +rs5768344 22 48491160 T C 1.257794e-03 False +rs4823512 22 48519794 C T 3.680757e-03 False +rs6007807 22 48537775 G A 2.134692e-03 False +rs106953 22 48543566 T C 7.314089e-03 False +rs133534 22 48593037 C T 9.084708e-03 False +rs34776844 22 48687509 C T -2.771960e-02 False +rs5768510 22 48692033 T C -2.126264e-02 False +rs62223851 22 48699617 T C 5.093107e-04 False +rs34080684 22 48717568 T C -8.190281e-04 False +rs1475987 22 48811946 C T 7.916515e-03 False +rs7293013 22 48823357 G A 1.464317e-02 False +rs2071750 22 48840428 A C 3.711229e-03 False +rs9615896 22 48851612 T C -5.887765e-03 False +rs13056230 22 48874310 T C -1.106607e-02 False +rs761793 22 48968070 C T 1.280691e-02 False +rs28658383 22 48991385 T C -1.234119e-02 False +rs34694572 22 49004050 G A 2.290755e-02 False +rs28406241 22 49014565 A G 1.555565e-03 False +rs7288241 22 49086481 T C -6.196369e-03 False +rs4989008 22 49107173 T C 1.277272e-02 False +rs131032 22 49180915 A G 6.346977e-03 False +rs4076042 22 49262579 A G 2.657134e-02 False +rs28726380 22 49270317 C T 1.447665e-03 False +rs2024695 22 49313196 A G -7.055532e-03 
False +rs1467436 22 49335230 T C -6.548281e-03 False +rs4824067 22 49366123 T C 1.136486e-02 False +rs738596 22 49372356 G C -2.420841e-02 False +rs17178683 22 49443666 T C 1.581736e-02 False +rs55898343 22 49496835 G A -1.355414e-02 False +rs1981477 22 49524428 A G -4.228482e-03 False +rs135257 22 49530553 G C 8.197389e-03 False +rs9627875 22 49537845 T C 1.112550e-02 False +rs5769975 22 49557457 G A 9.401926e-03 False +rs5769981 22 49562666 C A 1.271701e-02 False +rs2318943 22 49574509 C T 4.703177e-04 False +rs5769446 22 49579141 A G 2.448619e-02 False +rs7288983 22 49650863 T C 6.739571e-03 False +rs5770154 22 49662549 T G -5.769464e-03 False +rs1880009 22 49665841 T C -7.037069e-04 False +rs62220604 22 49677464 A G -2.177735e-02 False +rs6009594 22 49696067 C T -3.309682e-03 False +rs5770223 22 49700272 T G -2.541948e-03 False +rs1124544 22 49706433 T C -1.719402e-02 False +rs73173197 22 49713835 G A -1.370754e-02 False +rs848761 22 49719264 A C -1.067852e-02 False +rs848721 22 49743627 G A -5.970581e-04 False +rs9628005 22 49800265 C T 3.098582e-02 False +rs6009666 22 49806863 A G 3.940447e-03 False +rs136795 22 49830851 C T -2.742706e-03 False +rs11705513 22 49834624 G A -2.820163e-03 False +rs6009703 22 49843235 G C -4.458281e-04 False +rs9616311 22 49847501 T G 2.235016e-03 False +rs4823938 22 49861033 C T 1.721243e-02 False +rs5770489 22 49881321 A G -5.127800e-04 False +rs9628037 22 49908804 G A -9.455892e-03 False +rs134474 22 49911222 G T -1.389666e-02 False +rs17779492 22 49925268 A G 1.679984e-02 False +rs134447 22 49927332 T C 3.929800e-04 False +rs111392589 22 50109212 T C 1.610819e-02 False +rs6009846 22 50118149 G C 7.024666e-03 False +rs138844 22 50184484 G T 1.222581e-02 False +rs117613664 22 50219447 T C 5.091891e-02 False +rs910799 22 50278568 G A -2.340672e-02 False +rs78676969 22 50319170 G A 1.669806e-02 False +rs28372448 22 50350971 A G 2.640160e-02 False +rs4077129 22 50356693 C T 3.851499e-03 False +rs5771069 22 50435480 G A 
1.663630e-02 False +rs9617098 22 50439626 A G -2.722154e-03 False +rs137890 22 50466542 C T -2.560094e-03 False +rs11101958 22 50470516 T C -1.621986e-02 False +rs5771133 22 50491150 G A 1.828674e-02 False +rs6010164 22 50515270 C T 1.439904e-02 False +rs56144269 22 50529850 C T 2.054628e-02 False +rs75570992 22 50570755 C G 7.077514e-03 False +rs2272837 22 50582626 G A -3.588854e-03 False +rs17836662 22 50672154 A G 7.660848e-03 False +rs11547731 22 50722134 C T -1.747164e-02 False +rs79966207 22 50722408 C T -1.063465e-03 False +rs28379706 22 50728062 C T 2.159223e-02 False +rs11553142 22 50750481 T C 1.877272e-02 False +rs62241237 22 50758873 T C 4.001731e-03 False +rs9628184 22 50835040 A G -6.374259e-03 False +rs9616997 22 50859049 C T 3.480749e-04 False +rs1053744 22 50885775 G A -1.358311e-02 False +rs2232883 22 50926768 T C 1.798498e-03 False +rs2232885 22 50928026 A G 4.775504e-03 False +rs140522 22 50971266 C T 2.160893e-02 False +rs41281529 22 50989197 T C -1.328884e-02 False +rs131778 22 50989326 G A 1.037054e-02 False +rs5770892 22 50999681 G A -1.226224e-02 False +rs35826039 22 51046163 T C -2.754002e-02 False +rs9616915 22 51117580 C T 3.573542e-02 False +rs2301584 22 51171497 A G -1.951606e-02 False +rs73174435 22 51174939 T C -6.178519e-03 False diff --git a/tests/data/combine/scorefile.txt b/tests/data/combine/scorefile.txt new file mode 100644 index 0000000..1043a68 --- /dev/null +++ b/tests/data/combine/scorefile.txt @@ -0,0 +1,838 @@ +#pgs_name=PGS001229_22 +#genome_build=GRCh37 +chr_name chr_position effect_allele other_allele effect_weight +22 17080378 G A 0.01045457 +22 17300230 A G 0.0001411475 +22 17318864 A C 0.008166266 +22 17327595 T C 0.007791641 +22 17409813 A G 0.0003108784 +22 17450952 G A -0.03033983 +22 17492533 G A 0.00388999 +22 17542810 C T 0.00803629 +22 17565013 G A 0.02135621 +22 17589209 T C 0.003026491 +22 17600977 A G 0.01581277 +22 17625915 A G -0.1172964 +22 17630486 A C 0.01012909 +22 17633785 C T 0.0023255 +22 
17643689 A G 0.003361814 +22 17669306 C T 0.0214506 +22 17677699 T C -0.0007031384 +22 17680519 C A 0.001079236 +22 17701234 G A 0.004477145 +22 17703119 A T 0.0007771872 +22 17718699 C A -0.01320632 +22 17721595 C T 0.009480363 +22 17727648 T C 0.007811685 +22 17738177 G A -0.004719812 +22 17749096 A G -0.005244795 +22 17770181 G T -0.03101703 +22 17793969 G A 0.01774444 +22 17815696 G C -0.00551609 +22 17827684 G A -0.005944752 +22 17831813 T C 0.01061587 +22 17844929 T G 0.001717643 +22 17850661 T C -0.02805489 +22 17887534 A G 0.0007723542 +22 17887725 A G 0.007472703 +22 17958221 C A -0.02098647 +22 18036253 G A -0.01772981 +22 18038786 A G -0.002119071 +22 18262301 A T -0.005065485 +22 18289204 A G 0.005306345 +22 18295575 C T 0.02300129 +22 18296238 G A -0.005665446 +22 18319179 T C 0.03440642 +22 18393534 A C 0.01764269 +22 18439958 T C -0.002261707 +22 18483388 G A 0.03318724 +22 18488883 C G -0.0191918 +22 18489048 C A 0.01233198 +22 18495470 A G -0.005804926 +22 18537145 G A -0.004930116 +22 18571008 A G -8.844726E-05 +22 18584433 C T -0.001169893 +22 18631365 T C -0.001551714 +22 18650682 T C -0.01313784 +22 18890037 A G 0.05968921 +22 18891398 G A 0.006891943 +22 18892575 A G -0.00224447 +22 18915963 A G 0.003719756 +22 18959581 T C 0.006464581 +22 18963340 A G -0.01397565 +22 18970915 T C -0.001507131 +22 19024651 T C -0.00350575 +22 19121872 A G 0.01644046 +22 19135603 A G -0.02970077 +22 19190143 T C 0.003268027 +22 19263698 T C 0.02057255 +22 19292446 G T 0.01153989 +22 19371052 T C 0.01055134 +22 19420109 C T -0.008628228 +22 19451186 A C 0.02141029 +22 19518079 C T 0.005372247 +22 19581331 T C 0.01686942 +22 19593854 C A 0.0006544249 +22 19606703 G A 0.02070121 +22 19649005 A G 0.002868601 +22 19735854 C T 0.006262962 +22 19738355 T C 4.97384E-05 +22 19770886 A G -0.01013929 +22 19781823 T C 0.02481609 +22 19873357 T C 0.0116302 +22 19907099 A C -0.0267645 +22 19968597 T C -0.02203945 +22 20046344 G A -0.009801428 +22 20084821 C T -0.02232886 +22 
20185457 A G 0.006892171 +22 20189077 T C 0.01738215 +22 20219648 A G 0.009307625 +22 20248391 A G -0.005405845 +22 20267213 A G 0.006713242 +22 20286099 G T 0.01574758 +22 20749042 G A 0.006603339 +22 20754039 A G -0.01181141 +22 20775167 T C 0.01160113 +22 20780296 A G 0.06735311 +22 20789074 C T 0.02844307 +22 20791438 A C 0.0473474 +22 20793914 C T 0.007009781 +22 20839810 T G 0.003947346 +22 20860931 T C 0.0005613511 +22 20979980 G A 0.003231665 +22 20991771 G A 0.004226765 +22 21075537 C A -0.002096453 +22 21154393 G T -0.004297086 +22 21323357 C T -0.006041745 +22 21331918 G C -0.002280912 +22 21334924 C G -0.02031369 +22 21356824 A G 0.01476577 +22 21386019 A G 0.01435557 +22 21449028 G A -0.01537701 +22 21463515 A G -0.01335614 +22 21982892 T C -0.06373335 +22 22001704 T G 0.02809584 +22 22062480 T C 0.0529113 +22 22080735 G A -0.0139426 +22 22151939 C A -0.008287849 +22 22163425 G A 0.05518983 +22 22307519 C G -0.003486191 +22 22351283 G A -0.0007483763 +22 22394291 AG A 0.004320583 +22 22395754 T C 0.002587971 +22 22424302 A C 0.0011408 +22 22473905 C A 0.01226009 +22 22550450 G C 0.01773244 +22 22561610 C T -0.006207024 +22 22581369 G A -0.006272413 +22 22584678 A G -0.00217647 +22 22711786 T C 0.007779875 +22 22726372 T C 0.00349632 +22 22762771 C T 0.01252501 +22 22769923 G A -0.01103632 +22 22869742 A C -0.002412657 +22 22871922 A G -0.002769974 +22 22929268 T C -0.007035723 +22 23001481 A G 0.007524178 +22 23022520 T C 0.002175257 +22 23064982 A C -0.01255076 +22 23249440 A C 0.02085816 +22 23268677 A G 0.01337349 +22 23279456 C G -0.01371401 +22 23282286 C T 0.004994329 +22 23325722 C T 0.0008506657 +22 23412058 A G -0.009545553 +22 23627369 G A -0.01900175 +22 23644425 G A -0.0009106953 +22 23649242 G T 0.001061643 +22 23794844 G A -0.01198736 +22 23804670 G T -0.001119846 +22 23819697 T G -0.01028722 +22 23873076 T C 0.009509027 +22 23892145 T C 0.0135128 +22 23925779 C T -0.004127647 +22 23960187 T C -0.008475905 +22 24035970 T C -0.001334318 
+22 24086107 G A -0.01652957 +22 24105789 A G 0.01813091 +22 24106448 A G 0.001834095 +22 24186809 C T -0.01426541 +22 24235360 G A 0.0003168635 +22 24255296 T C 0.01624252 +22 24300540 T C -0.00322576 +22 24376584 A G -0.006223068 +22 24406778 A C 0.00304654 +22 24618331 G A -0.0006506681 +22 24802564 A G -0.006695797 +22 24912232 T C -0.01536303 +22 24943582 A G -0.001687764 +22 24995668 G A -0.03537331 +22 25123505 C T -0.0160099 +22 25145094 T C -0.005584047 +22 25145453 T C -0.001388536 +22 25185823 A G -0.009228375 +22 25265972 A G 0.01088906 +22 25309448 A G -0.002238693 +22 25363411 A G 0.004035775 +22 25410895 G A 0.0009720734 +22 25442369 C T 0.01660527 +22 25454658 C A 0.01200285 +22 25465065 C T 0.01320801 +22 25524916 C T 0.01147501 +22 25603008 T C -0.01262741 +22 25619025 G T -0.01212511 +22 25621591 T C 0.01051851 +22 25643483 T G 0.01373474 +22 25661725 A G -0.005936431 +22 25667883 G A 0.01547775 +22 25668730 A C 0.02616493 +22 25678577 T C 0.0304018 +22 25761309 T C -0.001760112 +22 25761936 T C -0.005171998 +22 25938977 T C 0.01966116 +22 25994013 A G 0.0006268228 +22 26081873 T C 0.05232603 +22 26132612 A G -0.006457239 +22 26133775 T C -0.001181527 +22 26159289 A G -0.008399401 +22 26181767 C T 0.01044769 +22 26190915 G A 0.004287533 +22 26218164 G A -0.002803502 +22 26231312 C G 0.006105629 +22 26237826 C T 0.004981479 +22 26239850 A C 0.004144037 +22 26273893 C G 0.005616213 +22 26278128 G T -0.003965338 +22 26280462 T C -0.0008324497 +22 26290588 T C -0.0130732 +22 26292659 G A 4.294309E-05 +22 26343593 G A 0.007813758 +22 26369358 T C -0.00483665 +22 26390964 A G -0.007849451 +22 26415475 T C -0.001219281 +22 26456367 G A -0.01285326 +22 26460519 T C -0.008695338 +22 26528054 A G 0.01973023 +22 26617260 T A -0.01384025 +22 26638906 G T 0.01229772 +22 26735648 A G 0.0007879673 +22 26782251 G A 0.0005096459 +22 26812632 C T -0.01850814 +22 26939781 C T -0.0009222796 +22 26960648 A C -0.005679255 +22 27038865 T G -0.0001487706 +22 27042828 A 
G 0.02957737 +22 27161060 A G 0.002844558 +22 27191643 T C 0.008953731 +22 27216426 G A 0.00912099 +22 27217018 A G 0.01510616 +22 27240025 T G -0.0297174 +22 27242642 G A -0.009822927 +22 27246070 C T -0.001554199 +22 27252454 C T -0.006560251 +22 27264880 G T -0.01323094 +22 27337886 A G -0.009600014 +22 27339284 T C -0.009944488 +22 27353810 T C -0.002171555 +22 27370273 T C -0.009798478 +22 27378884 A G 0.05145072 +22 27398749 C T 0.001012263 +22 27403571 C T -0.01745865 +22 27405012 T C -0.005425419 +22 27415255 C T -0.01499362 +22 27426628 G C 0.0228946 +22 27430724 A G -0.007068064 +22 27435577 C T -0.008632412 +22 27487580 G A 0.003691502 +22 27498426 A G -0.006801544 +22 27526095 G A -0.0008086267 +22 27563274 C A 0.0136965 +22 27584680 A G -0.002139188 +22 27628151 C G 0.02130389 +22 27652290 T G 0.004815735 +22 27660675 A G 0.004899654 +22 27674832 G T 0.0001248065 +22 27718775 A G 0.02292384 +22 27729742 G A 0.004951261 +22 27762155 C T 0.00485666 +22 27781736 A C -0.008336242 +22 27829565 G A 0.00285409 +22 27832985 G C -0.01668955 +22 27836311 G A -0.00775625 +22 27839704 T C -0.02492106 +22 27864471 A C 0.00218995 +22 27873024 G A 0.002721729 +22 27883265 G A 0.02961735 +22 27890684 A G -0.008057355 +22 27927298 T C 0.02054268 +22 27934290 G A 0.004751755 +22 27951176 A G -0.0004329547 +22 27974819 C A 0.01439093 +22 27975451 G A -0.03648208 +22 28007741 C T -0.01635917 +22 28016883 C A 0.008564085 +22 28046561 T C 0.01535905 +22 28060034 A G 0.03097228 +22 28076058 C T 0.02848654 +22 28094845 G A -0.02659077 +22 28130130 C T -0.01640387 +22 28136977 A C -0.003962775 +22 28150109 G A 0.0006071392 +22 28150815 A G 0.01604724 +22 28151825 A G -0.005390282 +22 28155404 T C 0.005030388 +22 28172577 G T 0.005704168 +22 28185452 G T -0.006896853 +22 28200176 G A -0.006474674 +22 28206912 C A -0.006175542 +22 28270372 G T -0.0006768204 +22 28412908 G T 0.01763639 +22 28501414 T C -0.2304747 +22 29106733 C T -0.01074749 +22 29318724 T C 0.001743333 +22 
29378610 C T 0.0006690876 +22 29478760 C T -0.03029428 +22 29533572 G C -0.01269604 +22 29626515 A G -0.0117113 +22 29630337 A G 0.02658049 +22 29669648 C G -0.008550535 +22 29692497 T G 0.001234896 +22 29837537 C T 0.01321112 +22 29961986 T G 0.001878853 +22 30151687 C T 0.003418302 +22 30163526 G A 0.01576261 +22 30494371 A G 0.007959801 +22 30592487 G C -0.1047403 +22 30621613 A C -0.01382104 +22 30658082 C T -0.03794014 +22 30688659 T C 0.0225714 +22 30762140 A G 0.02079806 +22 30793137 A G -0.004609306 +22 30901592 C T -0.00833404 +22 30927975 T C 0.003226189 +22 30953295 T C -0.00768579 +22 30992651 G A -0.025658 +22 31018975 C T 0.04241226 +22 31032920 G A -0.02311985 +22 31063804 G GT -0.0002081808 +22 31114086 G T 0.02825476 +22 31139653 A G 2.640129E-06 +22 31214382 G A 0.01137657 +22 31216506 C T 0.005531311 +22 31272930 T C -0.001056118 +22 31333631 C T -0.01235089 +22 31378447 A G 0.01020507 +22 31442308 A G -0.002479126 +22 31477361 C G -0.01263667 +22 31514348 G A 0.00580324 +22 31521404 A G 0.01097391 +22 31659495 C T 0.02663412 +22 31884405 C T -0.0003950834 +22 32200849 T C 0.01585735 +22 32341684 T C -0.02960328 +22 32559835 G A -0.02170436 +22 32569263 C T -0.001296006 +22 32624139 C T 0.005619574 +22 32702816 A G -0.01534023 +22 32756652 G A 0.02512177 +22 32831540 T C 0.001868495 +22 32832874 T C 6.028815E-05 +22 32853660 G A 0.0138221 +22 32854391 C A 0.0001960825 +22 32875190 A G -0.006426637 +22 32934713 C CT -0.009057754 +22 32952012 A C -0.00380248 +22 32954443 G A 0.002210369 +22 32993032 C T -0.002429979 +22 32997766 T C -0.008424246 +22 33045573 T C -0.03107145 +22 33046110 G C -0.06954732 +22 33048039 T C 0.01138346 +22 33056341 C T -0.06477198 +22 33108536 T C -0.03426392 +22 33108981 T C -0.07404035 +22 33116435 T C 0.06542471 +22 33143528 G A 0.02195059 +22 33146363 A G 0.000810539 +22 33259625 C T 0.02309793 +22 33336039 T G -0.02554387 +22 33408519 T C -0.0075563 +22 33660345 C G 0.002190743 +22 33804893 C T 0.006680774 +22 
33844303 C T 0.008923314 +22 33846914 T C 0.006295378 +22 33898906 A C 1.958759E-05 +22 34022284 A G -0.00257933 +22 34137784 G A 0.004460828 +22 34208570 T C -0.003365869 +22 34217757 T C 0.009289431 +22 34256923 A C 0.01439384 +22 34265402 G A -0.0163661 +22 34284173 G A -0.02315559 +22 34296093 C A -0.004688326 +22 34378012 A G 0.002276664 +22 34436795 C T 0.0001337033 +22 34488452 A G -0.000428831 +22 34501541 A G 0.002763614 +22 34514810 C A 0.003976601 +22 34526428 C T 0.01088864 +22 34583078 A G 0.001802495 +22 34620754 T C 0.01466546 +22 34691035 A G -0.0002082615 +22 34758540 T C 0.005165532 +22 34851377 A C 0.0137118 +22 35371707 T C -0.0004985554 +22 35382268 A C -0.004931336 +22 35419122 C T -0.01077953 +22 35478529 A G 0.0001760523 +22 35481493 T C 0.01056439 +22 35526281 G A -0.002766891 +22 35603836 A G -0.0001783939 +22 35660875 T G 0.03988231 +22 35745196 G T 0.0001750545 +22 35750980 A G -0.007651136 +22 35783413 G A 0.001649791 +22 35918270 C T 0.006918713 +22 35959242 A G 0.01697538 +22 35962060 G A 0.005181476 +22 35964158 G C 0.002769931 +22 35984385 A G -0.01280623 +22 36001258 C T 0.01342405 +22 36072262 T C 0.00489549 +22 36180535 G A -0.03250252 +22 36517307 C T 0.01366076 +22 36519596 A C -0.00349956 +22 36532058 A G -0.01214487 +22 36543489 C G 0.007838149 +22 36600841 G A 0.02644389 +22 36629633 C A -0.006871468 +22 36635967 G A -0.02634742 +22 36655735 A G -0.005385142 +22 36661646 A G -0.01560741 +22 36684354 C T -0.005170111 +22 36705622 A G 0.01713234 +22 36708049 C CTCCTGTGA -0.05187051 +22 36751101 A C -0.0244065 +22 36764788 G A 0.02784116 +22 36897427 C T 0.02603792 +22 36900806 G A 0.007366207 +22 36923144 T C -0.001875563 +22 36924714 G A -0.003632594 +22 36946643 T G 0.01333137 +22 36954939 T C 0.01105894 +22 36998907 T C -0.0006084687 +22 37001495 G T -0.01224147 +22 37013167 G C 0.01866849 +22 37077364 C T 0.007294257 +22 37080738 C G -0.004873355 +22 37101890 C T 0.03991764 +22 37118535 A G -0.001713909 +22 37184521 G A 
0.006515894 +22 37206341 G T 0.0002566936 +22 37256262 A G 0.001152626 +22 37258503 C T -0.009761102 +22 37323988 T C -0.0073182 +22 37329545 G A 0.005775806 +22 37337409 T C -0.02534399 +22 37343000 A C -0.0004011777 +22 37398195 T C -0.01001198 +22 37401532 A G -0.003244795 +22 37407109 C G 0.04335972 +22 37477732 T C 0.0003669548 +22 37507019 A G -0.0009259451 +22 37513316 A G 0.001153887 +22 37532441 A G 0.01802306 +22 37571497 G A -0.005785311 +22 37581383 T C 0.03172492 +22 37621269 C A 0.004460405 +22 37644621 T C -0.008386907 +22 37671896 A G 0.02303688 +22 37679763 G A -0.002658396 +22 37720268 G A 0.02120184 +22 37753256 C T 0.008984539 +22 37757099 G A -0.01560347 +22 37780522 C G -0.01496708 +22 37800175 T C -0.005510833 +22 37846448 G A 0.01152963 +22 37896749 C T 0.005447068 +22 37908435 C T 0.001909131 +22 37977481 T C 0.01465308 +22 37992699 G A 0.0008339179 +22 38032762 G GA 0.01693041 +22 38054262 C A 0.04354146 +22 38083101 C T -0.02092117 +22 38119213 A G 0.03948165 +22 38122122 C T 0.04377277 +22 38204089 T C 0.02977743 +22 38435786 T G -0.007684278 +22 38544298 G A 0.05090446 +22 38597378 T G -0.01997927 +22 38606780 G A -0.009182016 +22 38630272 C T 0.007393137 +22 38663819 G A -0.006392021 +22 38673234 A G -0.01106705 +22 38685131 C T -0.004493352 +22 38695406 T C -0.01155972 +22 38708506 A G 0.01701713 +22 38744184 C T -0.02112956 +22 38819613 A G -0.005625806 +22 38877461 G T 0.001108728 +22 38918894 G T -0.008094286 +22 38928269 G T -0.02114917 +22 39027286 C CAG 0.003840735 +22 39067524 G A 0.01200232 +22 39159201 C T 0.003096214 +22 39178701 G A 0.002148449 +22 39260032 T C 0.03574634 +22 39268785 T G 0.009377414 +22 39281774 G T 0.03816951 +22 39300265 C T 0.03540156 +22 39332623 T C -0.004449842 +22 39415780 G A 0.01479946 +22 39448465 A G 0.003065974 +22 39480697 G A -0.04005617 +22 39487665 G A -0.0001218988 +22 39493294 C T -0.03115929 +22 39510995 G A -0.02069106 +22 39542292 A G 0.009653575 +22 39543000 T C -0.004069841 +22 
39573724 A C 0.02683694 +22 39575692 A C 0.01451305 +22 39581277 A C 0.01766406 +22 39626572 A G -0.02901981 +22 39658626 C T 0.004177065 +22 39665395 G A 0.01264611 +22 39687484 G A 0.005418141 +22 39708279 A G -0.04281532 +22 39708357 T C 0.008605574 +22 39793066 G T 0.03658209 +22 39798127 G A 0.002302129 +22 39843409 T C 0.01065699 +22 39865475 G A 0.001588501 +22 39932516 A G -0.01179841 +22 39963426 G A -0.01503908 +22 40023636 C T 0.006443146 +22 40046176 C T -0.0007416552 +22 40067818 T C 0.00455936 +22 40092864 G A 0.02400297 +22 40127293 T C -0.0008870038 +22 40358148 T C -0.01079902 +22 40420786 G C -0.008092115 +22 40454069 G T 0.00789888 +22 40541981 G A 0.0174264 +22 40652873 G A 0.005853057 +22 40676672 G T -0.001894274 +22 40729614 G A 0.0195994 +22 40820151 C T -0.01628066 +22 40986372 G C -0.01983507 +22 41494925 A G -0.02918069 +22 41646738 G A 0.0003521847 +22 41680898 T C 0.01402732 +22 41704872 T C 6.681484E-05 +22 41791536 C T -5.572333E-05 +22 41895409 A G -0.04407217 +22 41929175 G T -0.03186844 +22 42089623 C T 0.00532234 +22 42095658 G T 0.03846131 +22 42210985 C T -0.00313971 +22 42279653 G A -0.006596336 +22 42341308 G A -0.0006862491 +22 42524243 C CT -0.01181191 +22 42672124 G A -0.005278171 +22 42691238 T C -0.01642396 +22 42813753 C T -0.00386775 +22 42867898 G A -0.001352327 +22 42912097 T C -0.0007295657 +22 42932317 A G -0.05768556 +22 43010817 A G 0.01722077 +22 43080028 T C -0.0005527551 +22 43096507 T C -0.005556102 +22 43112475 T C -0.01350273 +22 43114824 G A -0.01963192 +22 43115576 C T -0.01880097 +22 43154299 G A -0.001621113 +22 43159948 T C -0.007980584 +22 43206950 C A -0.005783037 +22 43218397 C T -0.003976636 +22 43283255 C A -0.01426668 +22 43290583 C T -0.03955775 +22 43333156 A G -0.03127845 +22 43426262 G A -0.00366804 +22 43483242 T C -0.02540203 +22 43515108 C T -0.01570749 +22 43529314 C G 0.01738127 +22 43551513 G A 0.02565386 +22 43558972 A G -0.01962819 +22 43577214 T C -0.02270478 +22 43579049 C T 
-0.001193909 +22 43610207 G A -0.007621661 +22 43623395 G C -0.04852519 +22 43640512 C T -0.005533207 +22 43649701 C T 0.07724845 +22 43661080 T C -0.04251741 +22 43683088 A G -0.003582388 +22 43707996 A G -0.02547044 +22 43711080 C G -0.005784446 +22 43721519 C A 0.000365885 +22 43729401 C T 0.008557013 +22 43763757 T G -0.0178981 +22 43836198 G T 0.002427697 +22 43976396 A G -0.01277457 +22 44031042 C T 0.003593107 +22 44193626 C A -0.006865434 +22 44221247 G A 0.01833991 +22 44296372 T C 0.006169212 +22 44298838 A G 0.007441756 +22 44342116 G A 0.02810328 +22 44368122 G A 0.0129968 +22 44379838 G A 0.001648422 +22 44380033 C T -0.002136788 +22 44395451 C T -0.006698507 +22 44419871 C T 0.0181613 +22 44424108 T C 0.01036733 +22 44467899 C T -0.002592364 +22 44498134 T C 0.007281423 +22 44522312 C T -0.0002636447 +22 44526130 G A -0.00388298 +22 44530286 A G 0.02528159 +22 44530420 C T -0.01233654 +22 44548944 G A -0.003947209 +22 44551755 G A 0.01262458 +22 44566434 A G -0.004290306 +22 44581046 T C -0.0147995 +22 44643161 C T 0.01439493 +22 44677081 C T -0.01030513 +22 44681612 G A -0.001269762 +22 44695088 T C 0.006324859 +22 44707716 G T 0.002288939 +22 44725343 G A 0.003534678 +22 44738406 G A 0.02320049 +22 44746729 A G -0.01754216 +22 44751158 G A -0.006539695 +22 44757439 A G 0.02480295 +22 44759519 G A 0.002111274 +22 44761797 A T -0.00531172 +22 44763352 C G 0.01452737 +22 44783779 G A 0.009142699 +22 44791807 C T -0.02371876 +22 44818986 C T -0.006740622 +22 44894913 G A -5.179871E-05 +22 45058431 C T 0.01098259 +22 45066035 A G -0.01484374 +22 45069410 T C 0.01530441 +22 45081330 G A 0.00135012 +22 45082168 C A 0.003663354 +22 45090008 G A 0.002811861 +22 45116664 C T 0.01247728 +22 45244930 T C -0.01450041 +22 45258457 G A -0.003500519 +22 45323989 T C 0.001111338 +22 45415987 A G -0.01398184 +22 45451355 G A -0.005566982 +22 45471607 C T 0.01148978 +22 45497738 C T -0.005029327 +22 45502829 C T -0.03893521 +22 45519040 T G 0.002377071 +22 45523391 A 
G 0.01318997 +22 45573450 C A 0.0043856 +22 45589490 G A -0.008350439 +22 45668012 T C 0.01286879 +22 45671343 G A -2.940682E-06 +22 45672574 T C 0.005743608 +22 45693923 A G -0.002675069 +22 45718743 G A -0.02092804 +22 45723807 C G 0.001670159 +22 45728370 A G 0.0001879231 +22 45741537 G T 0.01420045 +22 45749983 T G -0.04591012 +22 45809624 A C 0.002185772 +22 45821935 A G 0.02250782 +22 45837410 G A -0.002756449 +22 45846371 T C 0.07910102 +22 45864934 T C 0.008535181 +22 45871507 G C -0.007764056 +22 45892656 G T -0.003885653 +22 45897997 C T 0.0003935204 +22 45929577 C T -0.02532217 +22 45936350 A G -0.008001698 +22 45942726 T G -0.01415551 +22 45996298 G A 0.05643525 +22 46009063 G A 0.006464843 +22 46022070 G A 0.0224674 +22 46155548 G C -0.0324747 +22 46207955 C T -0.001354554 +22 46236425 A G 0.08398423 +22 46275529 T C 0.0022643 +22 46287720 A G -0.02237482 +22 46289699 T C 0.01872124 +22 46303347 T C -0.01283734 +22 46316057 A G 0.02312579 +22 46337043 G C 0.01701173 +22 46347519 C T 0.01574289 +22 46364161 A G -0.04466341 +22 46381234 G A 0.04730559 +22 46396925 G A 0.001783944 +22 46403715 A G -0.02132589 +22 46406782 A C 0.08439466 +22 46445002 G C -0.07613496 +22 46458123 G T 0.03328073 +22 46482948 C T 0.04241879 +22 46486508 C T -0.00968439 +22 46493852 T C -0.00675858 +22 46499120 C G -0.009873118 +22 46502870 T C -0.0179214 +22 46561713 G A 0.02604703 +22 46586110 A G -0.001256735 +22 46592168 C T 0.01417055 +22 46614274 G C -0.05854014 +22 46627603 T C 0.08004024 +22 46760086 T C 0.003229515 +22 46782382 T C -0.02470821 +22 46807234 C T 0.002324176 +22 46837114 G A 0.000944073 +22 46888399 T C 0.009911095 +22 46907779 G A 0.00653144 +22 46909355 T G -0.004780494 +22 46914277 A C 0.009689535 +22 46943687 G A -0.0130366 +22 46985917 A G 0.01893397 +22 47021226 G A -0.01322949 +22 47095235 A C -0.1156013 +22 47109621 C T 0.0004322858 +22 47125474 G A -0.01746025 +22 47147117 T C -0.02418349 +22 47156703 C T 0.0262897 +22 47245836 A G 0.001880575 
+22 47271747 C T 0.001055264 +22 47301822 C T 0.003032158 +22 47345487 T C -0.002945945 +22 47372368 T C 0.02067644 +22 47380606 C T 0.04041426 +22 47437808 C T 0.001683027 +22 47450911 A G 0.01624479 +22 47511864 A C -0.004226735 +22 47519476 T C -0.003954111 +22 47529458 A G -0.0003602848 +22 47531320 T C -0.006899703 +22 47548321 T C 0.004925401 +22 47568291 C T 0.007726693 +22 47571203 A G -0.009744751 +22 47574009 C T -0.00532701 +22 47642100 T C 0.006976251 +22 47657635 T C 0.001798943 +22 47683805 C T -0.03475544 +22 47720973 T C -0.007868172 +22 47821952 G A -0.000885428 +22 47893053 A G -0.02449056 +22 47935365 C T -0.001599879 +22 47961708 G T -0.003593525 +22 47986332 T C -0.003976592 +22 48154645 C T 0.007608639 +22 48165452 C CT 0.002039503 +22 48207318 T C -0.009725168 +22 48213904 G C -0.01220367 +22 48215904 A G -2.488244E-05 +22 48220460 T C -0.002702163 +22 48230941 C A -0.001129522 +22 48271961 A G -0.005053446 +22 48284025 T C -0.003344182 +22 48297953 C T -0.01046958 +22 48362290 G A -0.02367254 +22 48362914 C A -0.003167719 +22 48387670 A G -0.008243989 +22 48415446 C T 0.002130715 +22 48460730 T C 0.002682476 +22 48491160 T C 0.001257794 +22 48519794 C T 0.003680757 +22 48537775 G A 0.002134692 +22 48543566 T C 0.007314089 +22 48593037 C T 0.009084708 +22 48687509 C T -0.0277196 +22 48692033 T C -0.02126264 +22 48699617 T C 0.0005093107 +22 48717568 T C -0.0008190281 +22 48811946 C T 0.007916515 +22 48823357 G A 0.01464317 +22 48840428 A C 0.003711229 +22 48851612 T C -0.005887765 +22 48874310 T C -0.01106607 +22 48968070 C T 0.01280691 +22 48991385 T C -0.01234119 +22 49004050 G A 0.02290755 +22 49014565 A G 0.001555565 +22 49086481 T C -0.006196369 +22 49107173 T C 0.01277272 +22 49180915 A G 0.006346977 +22 49262579 A G 0.02657134 +22 49270317 C T 0.001447665 +22 49313196 A G -0.007055532 +22 49335230 T C -0.006548281 +22 49366123 T C 0.01136486 +22 49372356 G C -0.02420841 +22 49443666 T C 0.01581736 +22 49496835 G A -0.01355414 +22 
49524428 A G -0.004228482 +22 49530553 G C 0.008197389 +22 49537845 T C 0.0111255 +22 49557457 G A 0.009401926 +22 49562666 C A 0.01271701 +22 49574509 C T 0.0004703177 +22 49579141 A G 0.02448619 +22 49650863 T C 0.006739571 +22 49662549 T G -0.005769464 +22 49665841 T C -0.0007037069 +22 49677464 A G -0.02177735 +22 49696067 C T -0.003309682 +22 49700272 T G -0.002541948 +22 49706433 T C -0.01719402 +22 49713835 G A -0.01370754 +22 49719264 A C -0.01067852 +22 49743627 G A -0.0005970581 +22 49800265 C T 0.03098582 +22 49806863 A G 0.003940447 +22 49830851 C T -0.002742706 +22 49834624 G A -0.002820163 +22 49843235 G C -0.0004458281 +22 49847501 T G 0.002235016 +22 49861033 C T 0.01721243 +22 49881321 A G -0.00051278 +22 49908804 G A -0.009455892 +22 49911222 G T -0.01389666 +22 49925268 A G 0.01679984 +22 49927332 T C 0.00039298 +22 50109212 T C 0.01610819 +22 50118149 G C 0.007024666 +22 50184484 G T 0.01222581 +22 50219447 T C 0.05091891 +22 50278568 G A -0.02340672 +22 50319170 G A 0.01669806 +22 50350971 A G 0.0264016 +22 50356693 C T 0.003851499 +22 50435480 G A 0.0166363 +22 50439626 A G -0.002722154 +22 50466542 C T -0.002560094 +22 50470516 T C -0.01621986 +22 50491150 G A 0.01828674 +22 50515270 C T 0.01439904 +22 50529850 C T 0.02054628 +22 50570755 C G 0.007077514 +22 50582626 G A -0.003588854 +22 50672154 A G 0.007660848 +22 50722134 C T -0.01747164 +22 50722408 C T -0.001063465 +22 50728062 C T 0.02159223 +22 50750481 T C 0.01877272 +22 50758873 T C 0.004001731 +22 50835040 A G -0.006374259 +22 50859049 C T 0.0003480749 +22 50885775 G A -0.01358311 +22 50926768 T C 0.001798498 +22 50928026 A G 0.004775504 +22 50971266 C T 0.02160893 +22 50989197 T C -0.01328884 +22 50989326 G A 0.01037054 +22 50999681 G A -0.01226224 +22 51046163 T C -0.02754002 +22 51117580 C T 0.03573542 +22 51171497 A G -0.01951606 +22 51174939 T C -0.006178519 diff --git a/tests/data/combine/scorefile_dominant_and_recessive.txt 
b/tests/data/combine/scorefile_dominant_and_recessive.txt new file mode 100644 index 0000000..bbf23f0 --- /dev/null +++ b/tests/data/combine/scorefile_dominant_and_recessive.txt @@ -0,0 +1,838 @@ +#pgs_name=PGS001229_22_DominantRecessiveExample +#genome_build=GRCh37 +chr_name chr_position effect_allele other_allele effect_weight is_dominant is_recessive +22 17080378 G A 0.01045457 TRUE FALSE +22 17300230 A G 0.0001411475 FALSE TRUE +22 17318864 A C 0.008166266 FALSE FALSE +22 17327595 T C 0.007791641 FALSE FALSE +22 17409813 A G 0.0003108784 FALSE FALSE +22 17450952 G A -0.03033983 FALSE FALSE +22 17492533 G A 0.00388999 FALSE FALSE +22 17542810 C T 0.00803629 FALSE FALSE +22 17565013 G A 0.02135621 FALSE FALSE +22 17589209 T C 0.003026491 FALSE FALSE +22 17600977 A G 0.01581277 FALSE FALSE +22 17625915 A G -0.1172964 FALSE FALSE +22 17630486 A C 0.01012909 FALSE FALSE +22 17633785 C T 0.0023255 FALSE FALSE +22 17643689 A G 0.003361814 FALSE FALSE +22 17669306 C T 0.0214506 FALSE FALSE +22 17677699 T C -0.0007031384 FALSE FALSE +22 17680519 C A 0.001079236 FALSE FALSE +22 17701234 G A 0.004477145 FALSE FALSE +22 17703119 A T 0.0007771872 FALSE FALSE +22 17718699 C A -0.01320632 FALSE FALSE +22 17721595 C T 0.009480363 FALSE FALSE +22 17727648 T C 0.007811685 FALSE FALSE +22 17738177 G A -0.004719812 FALSE FALSE +22 17749096 A G -0.005244795 FALSE FALSE +22 17770181 G T -0.03101703 FALSE FALSE +22 17793969 G A 0.01774444 FALSE FALSE +22 17815696 G C -0.00551609 FALSE FALSE +22 17827684 G A -0.005944752 FALSE FALSE +22 17831813 T C 0.01061587 FALSE FALSE +22 17844929 T G 0.001717643 FALSE FALSE +22 17850661 T C -0.02805489 FALSE FALSE +22 17887534 A G 0.0007723542 FALSE FALSE +22 17887725 A G 0.007472703 FALSE FALSE +22 17958221 C A -0.02098647 FALSE FALSE +22 18036253 G A -0.01772981 FALSE FALSE +22 18038786 A G -0.002119071 FALSE FALSE +22 18262301 A T -0.005065485 FALSE FALSE +22 18289204 A G 0.005306345 FALSE FALSE +22 18295575 C T 0.02300129 FALSE FALSE +22 
18296238 G A -0.005665446 FALSE FALSE +22 18319179 T C 0.03440642 FALSE FALSE +22 18393534 A C 0.01764269 FALSE FALSE +22 18439958 T C -0.002261707 FALSE FALSE +22 18483388 G A 0.03318724 FALSE FALSE +22 18488883 C G -0.0191918 FALSE FALSE +22 18489048 C A 0.01233198 FALSE FALSE +22 18495470 A G -0.005804926 FALSE FALSE +22 18537145 G A -0.004930116 FALSE FALSE +22 18571008 A G -8.844726E-05 FALSE FALSE +22 18584433 C T -0.001169893 FALSE FALSE +22 18631365 T C -0.001551714 FALSE FALSE +22 18650682 T C -0.01313784 FALSE FALSE +22 18890037 A G 0.05968921 FALSE FALSE +22 18891398 G A 0.006891943 FALSE FALSE +22 18892575 A G -0.00224447 FALSE FALSE +22 18915963 A G 0.003719756 FALSE FALSE +22 18959581 T C 0.006464581 FALSE FALSE +22 18963340 A G -0.01397565 FALSE FALSE +22 18970915 T C -0.001507131 FALSE FALSE +22 19024651 T C -0.00350575 FALSE FALSE +22 19121872 A G 0.01644046 FALSE FALSE +22 19135603 A G -0.02970077 FALSE FALSE +22 19190143 T C 0.003268027 FALSE FALSE +22 19263698 T C 0.02057255 FALSE FALSE +22 19292446 G T 0.01153989 FALSE FALSE +22 19371052 T C 0.01055134 FALSE FALSE +22 19420109 C T -0.008628228 FALSE FALSE +22 19451186 A C 0.02141029 FALSE FALSE +22 19518079 C T 0.005372247 FALSE FALSE +22 19581331 T C 0.01686942 FALSE FALSE +22 19593854 C A 0.0006544249 FALSE FALSE +22 19606703 G A 0.02070121 FALSE FALSE +22 19649005 A G 0.002868601 FALSE FALSE +22 19735854 C T 0.006262962 FALSE FALSE +22 19738355 T C 4.97384E-05 FALSE FALSE +22 19770886 A G -0.01013929 FALSE FALSE +22 19781823 T C 0.02481609 FALSE FALSE +22 19873357 T C 0.0116302 FALSE FALSE +22 19907099 A C -0.0267645 FALSE FALSE +22 19968597 T C -0.02203945 FALSE FALSE +22 20046344 G A -0.009801428 FALSE FALSE +22 20084821 C T -0.02232886 FALSE FALSE +22 20185457 A G 0.006892171 FALSE FALSE +22 20189077 T C 0.01738215 FALSE FALSE +22 20219648 A G 0.009307625 FALSE FALSE +22 20248391 A G -0.005405845 FALSE FALSE +22 20267213 A G 0.006713242 FALSE FALSE +22 20286099 G T 0.01574758 FALSE FALSE 
+22 20749042 G A 0.006603339 FALSE FALSE +22 20754039 A G -0.01181141 FALSE FALSE +22 20775167 T C 0.01160113 FALSE FALSE +22 20780296 A G 0.06735311 FALSE FALSE +22 20789074 C T 0.02844307 FALSE FALSE +22 20791438 A C 0.0473474 FALSE FALSE +22 20793914 C T 0.007009781 FALSE FALSE +22 20839810 T G 0.003947346 FALSE FALSE +22 20860931 T C 0.0005613511 FALSE FALSE +22 20979980 G A 0.003231665 FALSE FALSE +22 20991771 G A 0.004226765 FALSE FALSE +22 21075537 C A -0.002096453 FALSE FALSE +22 21154393 G T -0.004297086 FALSE FALSE +22 21323357 C T -0.006041745 FALSE FALSE +22 21331918 G C -0.002280912 FALSE FALSE +22 21334924 C G -0.02031369 FALSE FALSE +22 21356824 A G 0.01476577 FALSE FALSE +22 21386019 A G 0.01435557 FALSE FALSE +22 21449028 G A -0.01537701 FALSE FALSE +22 21463515 A G -0.01335614 FALSE FALSE +22 21982892 T C -0.06373335 FALSE FALSE +22 22001704 T G 0.02809584 FALSE FALSE +22 22062480 T C 0.0529113 FALSE FALSE +22 22080735 G A -0.0139426 FALSE FALSE +22 22151939 C A -0.008287849 FALSE FALSE +22 22163425 G A 0.05518983 FALSE FALSE +22 22307519 C G -0.003486191 FALSE FALSE +22 22351283 G A -0.0007483763 FALSE FALSE +22 22394291 AG A 0.004320583 FALSE FALSE +22 22395754 T C 0.002587971 FALSE FALSE +22 22424302 A C 0.0011408 FALSE FALSE +22 22473905 C A 0.01226009 FALSE FALSE +22 22550450 G C 0.01773244 FALSE FALSE +22 22561610 C T -0.006207024 FALSE FALSE +22 22581369 G A -0.006272413 FALSE FALSE +22 22584678 A G -0.00217647 FALSE FALSE +22 22711786 T C 0.007779875 FALSE FALSE +22 22726372 T C 0.00349632 FALSE FALSE +22 22762771 C T 0.01252501 FALSE FALSE +22 22769923 G A -0.01103632 FALSE FALSE +22 22869742 A C -0.002412657 FALSE FALSE +22 22871922 A G -0.002769974 FALSE FALSE +22 22929268 T C -0.007035723 FALSE FALSE +22 23001481 A G 0.007524178 FALSE FALSE +22 23022520 T C 0.002175257 FALSE FALSE +22 23064982 A C -0.01255076 FALSE FALSE +22 23249440 A C 0.02085816 FALSE FALSE +22 23268677 A G 0.01337349 FALSE FALSE +22 23279456 C G -0.01371401 FALSE 
FALSE +22 23282286 C T 0.004994329 FALSE FALSE +22 23325722 C T 0.0008506657 FALSE FALSE +22 23412058 A G -0.009545553 FALSE FALSE +22 23627369 G A -0.01900175 FALSE FALSE +22 23644425 G A -0.0009106953 FALSE FALSE +22 23649242 G T 0.001061643 FALSE FALSE +22 23794844 G A -0.01198736 FALSE FALSE +22 23804670 G T -0.001119846 FALSE FALSE +22 23819697 T G -0.01028722 FALSE FALSE +22 23873076 T C 0.009509027 FALSE FALSE +22 23892145 T C 0.0135128 FALSE FALSE +22 23925779 C T -0.004127647 FALSE FALSE +22 23960187 T C -0.008475905 FALSE FALSE +22 24035970 T C -0.001334318 FALSE FALSE +22 24086107 G A -0.01652957 FALSE FALSE +22 24105789 A G 0.01813091 FALSE FALSE +22 24106448 A G 0.001834095 FALSE FALSE +22 24186809 C T -0.01426541 FALSE FALSE +22 24235360 G A 0.0003168635 FALSE FALSE +22 24255296 T C 0.01624252 FALSE FALSE +22 24300540 T C -0.00322576 FALSE FALSE +22 24376584 A G -0.006223068 FALSE FALSE +22 24406778 A C 0.00304654 FALSE FALSE +22 24618331 G A -0.0006506681 FALSE FALSE +22 24802564 A G -0.006695797 FALSE FALSE +22 24912232 T C -0.01536303 FALSE FALSE +22 24943582 A G -0.001687764 FALSE FALSE +22 24995668 G A -0.03537331 FALSE FALSE +22 25123505 C T -0.0160099 FALSE FALSE +22 25145094 T C -0.005584047 FALSE FALSE +22 25145453 T C -0.001388536 FALSE FALSE +22 25185823 A G -0.009228375 FALSE FALSE +22 25265972 A G 0.01088906 FALSE FALSE +22 25309448 A G -0.002238693 FALSE FALSE +22 25363411 A G 0.004035775 FALSE FALSE +22 25410895 G A 0.0009720734 FALSE FALSE +22 25442369 C T 0.01660527 FALSE FALSE +22 25454658 C A 0.01200285 FALSE FALSE +22 25465065 C T 0.01320801 FALSE FALSE +22 25524916 C T 0.01147501 FALSE FALSE +22 25603008 T C -0.01262741 FALSE FALSE +22 25619025 G T -0.01212511 FALSE FALSE +22 25621591 T C 0.01051851 FALSE FALSE +22 25643483 T G 0.01373474 FALSE FALSE +22 25661725 A G -0.005936431 FALSE FALSE +22 25667883 G A 0.01547775 FALSE FALSE +22 25668730 A C 0.02616493 FALSE FALSE +22 25678577 T C 0.0304018 FALSE FALSE +22 25761309 T C 
-0.001760112 FALSE FALSE +22 25761936 T C -0.005171998 FALSE FALSE +22 25938977 T C 0.01966116 FALSE FALSE +22 25994013 A G 0.0006268228 FALSE FALSE +22 26081873 T C 0.05232603 FALSE FALSE +22 26132612 A G -0.006457239 FALSE FALSE +22 26133775 T C -0.001181527 FALSE FALSE +22 26159289 A G -0.008399401 FALSE FALSE +22 26181767 C T 0.01044769 FALSE FALSE +22 26190915 G A 0.004287533 FALSE FALSE +22 26218164 G A -0.002803502 FALSE FALSE +22 26231312 C G 0.006105629 FALSE FALSE +22 26237826 C T 0.004981479 FALSE FALSE +22 26239850 A C 0.004144037 FALSE FALSE +22 26273893 C G 0.005616213 FALSE FALSE +22 26278128 G T -0.003965338 FALSE FALSE +22 26280462 T C -0.0008324497 FALSE FALSE +22 26290588 T C -0.0130732 FALSE FALSE +22 26292659 G A 4.294309E-05 FALSE FALSE +22 26343593 G A 0.007813758 FALSE FALSE +22 26369358 T C -0.00483665 FALSE FALSE +22 26390964 A G -0.007849451 FALSE FALSE +22 26415475 T C -0.001219281 FALSE FALSE +22 26456367 G A -0.01285326 FALSE FALSE +22 26460519 T C -0.008695338 FALSE FALSE +22 26528054 A G 0.01973023 FALSE FALSE +22 26617260 T A -0.01384025 FALSE FALSE +22 26638906 G T 0.01229772 FALSE FALSE +22 26735648 A G 0.0007879673 FALSE FALSE +22 26782251 G A 0.0005096459 FALSE FALSE +22 26812632 C T -0.01850814 FALSE FALSE +22 26939781 C T -0.0009222796 FALSE FALSE +22 26960648 A C -0.005679255 FALSE FALSE +22 27038865 T G -0.0001487706 FALSE FALSE +22 27042828 A G 0.02957737 FALSE FALSE +22 27161060 A G 0.002844558 FALSE FALSE +22 27191643 T C 0.008953731 FALSE FALSE +22 27216426 G A 0.00912099 FALSE FALSE +22 27217018 A G 0.01510616 FALSE FALSE +22 27240025 T G -0.0297174 FALSE FALSE +22 27242642 G A -0.009822927 FALSE FALSE +22 27246070 C T -0.001554199 FALSE FALSE +22 27252454 C T -0.006560251 FALSE FALSE +22 27264880 G T -0.01323094 FALSE FALSE +22 27337886 A G -0.009600014 FALSE FALSE +22 27339284 T C -0.009944488 FALSE FALSE +22 27353810 T C -0.002171555 FALSE FALSE +22 27370273 T C -0.009798478 FALSE FALSE +22 27378884 A G 0.05145072 
FALSE FALSE +22 27398749 C T 0.001012263 FALSE FALSE +22 27403571 C T -0.01745865 FALSE FALSE +22 27405012 T C -0.005425419 FALSE FALSE +22 27415255 C T -0.01499362 FALSE FALSE +22 27426628 G C 0.0228946 FALSE FALSE +22 27430724 A G -0.007068064 FALSE FALSE +22 27435577 C T -0.008632412 FALSE FALSE +22 27487580 G A 0.003691502 FALSE FALSE +22 27498426 A G -0.006801544 FALSE FALSE +22 27526095 G A -0.0008086267 FALSE FALSE +22 27563274 C A 0.0136965 FALSE FALSE +22 27584680 A G -0.002139188 FALSE FALSE +22 27628151 C G 0.02130389 FALSE FALSE +22 27652290 T G 0.004815735 FALSE FALSE +22 27660675 A G 0.004899654 FALSE FALSE +22 27674832 G T 0.0001248065 FALSE FALSE +22 27718775 A G 0.02292384 FALSE FALSE +22 27729742 G A 0.004951261 FALSE FALSE +22 27762155 C T 0.00485666 FALSE FALSE +22 27781736 A C -0.008336242 FALSE FALSE +22 27829565 G A 0.00285409 FALSE FALSE +22 27832985 G C -0.01668955 FALSE FALSE +22 27836311 G A -0.00775625 FALSE FALSE +22 27839704 T C -0.02492106 FALSE FALSE +22 27864471 A C 0.00218995 FALSE FALSE +22 27873024 G A 0.002721729 FALSE FALSE +22 27883265 G A 0.02961735 FALSE FALSE +22 27890684 A G -0.008057355 FALSE FALSE +22 27927298 T C 0.02054268 FALSE FALSE +22 27934290 G A 0.004751755 FALSE FALSE +22 27951176 A G -0.0004329547 FALSE FALSE +22 27974819 C A 0.01439093 FALSE FALSE +22 27975451 G A -0.03648208 FALSE FALSE +22 28007741 C T -0.01635917 FALSE FALSE +22 28016883 C A 0.008564085 FALSE FALSE +22 28046561 T C 0.01535905 FALSE FALSE +22 28060034 A G 0.03097228 FALSE FALSE +22 28076058 C T 0.02848654 FALSE FALSE +22 28094845 G A -0.02659077 FALSE FALSE +22 28130130 C T -0.01640387 FALSE FALSE +22 28136977 A C -0.003962775 FALSE FALSE +22 28150109 G A 0.0006071392 FALSE FALSE +22 28150815 A G 0.01604724 FALSE FALSE +22 28151825 A G -0.005390282 FALSE FALSE +22 28155404 T C 0.005030388 FALSE FALSE +22 28172577 G T 0.005704168 FALSE FALSE +22 28185452 G T -0.006896853 FALSE FALSE +22 28200176 G A -0.006474674 FALSE FALSE +22 28206912 C A 
-0.006175542 FALSE FALSE +22 28270372 G T -0.0006768204 FALSE FALSE +22 28412908 G T 0.01763639 FALSE FALSE +22 28501414 T C -0.2304747 FALSE FALSE +22 29106733 C T -0.01074749 FALSE FALSE +22 29318724 T C 0.001743333 FALSE FALSE +22 29378610 C T 0.0006690876 FALSE FALSE +22 29478760 C T -0.03029428 FALSE FALSE +22 29533572 G C -0.01269604 FALSE FALSE +22 29626515 A G -0.0117113 FALSE FALSE +22 29630337 A G 0.02658049 FALSE FALSE +22 29669648 C G -0.008550535 FALSE FALSE +22 29692497 T G 0.001234896 FALSE FALSE +22 29837537 C T 0.01321112 FALSE FALSE +22 29961986 T G 0.001878853 FALSE FALSE +22 30151687 C T 0.003418302 FALSE FALSE +22 30163526 G A 0.01576261 FALSE FALSE +22 30494371 A G 0.007959801 FALSE FALSE +22 30592487 G C -0.1047403 FALSE FALSE +22 30621613 A C -0.01382104 FALSE FALSE +22 30658082 C T -0.03794014 FALSE FALSE +22 30688659 T C 0.0225714 FALSE FALSE +22 30762140 A G 0.02079806 FALSE FALSE +22 30793137 A G -0.004609306 FALSE FALSE +22 30901592 C T -0.00833404 FALSE FALSE +22 30927975 T C 0.003226189 FALSE FALSE +22 30953295 T C -0.00768579 FALSE FALSE +22 30992651 G A -0.025658 FALSE FALSE +22 31018975 C T 0.04241226 FALSE FALSE +22 31032920 G A -0.02311985 FALSE FALSE +22 31063804 G GT -0.0002081808 FALSE FALSE +22 31114086 G T 0.02825476 FALSE FALSE +22 31139653 A G 2.640129E-06 FALSE FALSE +22 31214382 G A 0.01137657 FALSE FALSE +22 31216506 C T 0.005531311 FALSE FALSE +22 31272930 T C -0.001056118 FALSE FALSE +22 31333631 C T -0.01235089 FALSE FALSE +22 31378447 A G 0.01020507 FALSE FALSE +22 31442308 A G -0.002479126 FALSE FALSE +22 31477361 C G -0.01263667 FALSE FALSE +22 31514348 G A 0.00580324 FALSE FALSE +22 31521404 A G 0.01097391 FALSE FALSE +22 31659495 C T 0.02663412 FALSE FALSE +22 31884405 C T -0.0003950834 FALSE FALSE +22 32200849 T C 0.01585735 FALSE FALSE +22 32341684 T C -0.02960328 FALSE FALSE +22 32559835 G A -0.02170436 FALSE FALSE +22 32569263 C T -0.001296006 FALSE FALSE +22 32624139 C T 0.005619574 FALSE FALSE +22 32702816 
A G -0.01534023 FALSE FALSE +22 32756652 G A 0.02512177 FALSE FALSE +22 32831540 T C 0.001868495 FALSE FALSE +22 32832874 T C 6.028815E-05 FALSE FALSE +22 32853660 G A 0.0138221 FALSE FALSE +22 32854391 C A 0.0001960825 FALSE FALSE +22 32875190 A G -0.006426637 FALSE FALSE +22 32934713 C CT -0.009057754 FALSE FALSE +22 32952012 A C -0.00380248 FALSE FALSE +22 32954443 G A 0.002210369 FALSE FALSE +22 32993032 C T -0.002429979 FALSE FALSE +22 32997766 T C -0.008424246 FALSE FALSE +22 33045573 T C -0.03107145 FALSE FALSE +22 33046110 G C -0.06954732 FALSE FALSE +22 33048039 T C 0.01138346 FALSE FALSE +22 33056341 C T -0.06477198 FALSE FALSE +22 33108536 T C -0.03426392 FALSE FALSE +22 33108981 T C -0.07404035 FALSE FALSE +22 33116435 T C 0.06542471 FALSE FALSE +22 33143528 G A 0.02195059 FALSE FALSE +22 33146363 A G 0.000810539 FALSE FALSE +22 33259625 C T 0.02309793 FALSE FALSE +22 33336039 T G -0.02554387 FALSE FALSE +22 33408519 T C -0.0075563 FALSE FALSE +22 33660345 C G 0.002190743 FALSE FALSE +22 33804893 C T 0.006680774 FALSE FALSE +22 33844303 C T 0.008923314 FALSE FALSE +22 33846914 T C 0.006295378 FALSE FALSE +22 33898906 A C 1.958759E-05 FALSE FALSE +22 34022284 A G -0.00257933 FALSE FALSE +22 34137784 G A 0.004460828 FALSE FALSE +22 34208570 T C -0.003365869 FALSE FALSE +22 34217757 T C 0.009289431 FALSE FALSE +22 34256923 A C 0.01439384 FALSE FALSE +22 34265402 G A -0.0163661 FALSE FALSE +22 34284173 G A -0.02315559 FALSE FALSE +22 34296093 C A -0.004688326 FALSE FALSE +22 34378012 A G 0.002276664 FALSE FALSE +22 34436795 C T 0.0001337033 FALSE FALSE +22 34488452 A G -0.000428831 FALSE FALSE +22 34501541 A G 0.002763614 FALSE FALSE +22 34514810 C A 0.003976601 FALSE FALSE +22 34526428 C T 0.01088864 FALSE FALSE +22 34583078 A G 0.001802495 FALSE FALSE +22 34620754 T C 0.01466546 FALSE FALSE +22 34691035 A G -0.0002082615 FALSE FALSE +22 34758540 T C 0.005165532 FALSE FALSE +22 34851377 A C 0.0137118 FALSE FALSE +22 35371707 T C -0.0004985554 FALSE FALSE 
+22 35382268 A C -0.004931336 FALSE FALSE +22 35419122 C T -0.01077953 FALSE FALSE +22 35478529 A G 0.0001760523 FALSE FALSE +22 35481493 T C 0.01056439 FALSE FALSE +22 35526281 G A -0.002766891 FALSE FALSE +22 35603836 A G -0.0001783939 FALSE FALSE +22 35660875 T G 0.03988231 FALSE FALSE +22 35745196 G T 0.0001750545 FALSE FALSE +22 35750980 A G -0.007651136 FALSE FALSE +22 35783413 G A 0.001649791 FALSE FALSE +22 35918270 C T 0.006918713 FALSE FALSE +22 35959242 A G 0.01697538 FALSE FALSE +22 35962060 G A 0.005181476 FALSE FALSE +22 35964158 G C 0.002769931 FALSE FALSE +22 35984385 A G -0.01280623 FALSE FALSE +22 36001258 C T 0.01342405 FALSE FALSE +22 36072262 T C 0.00489549 FALSE FALSE +22 36180535 G A -0.03250252 FALSE FALSE +22 36517307 C T 0.01366076 FALSE FALSE +22 36519596 A C -0.00349956 FALSE FALSE +22 36532058 A G -0.01214487 FALSE FALSE +22 36543489 C G 0.007838149 FALSE FALSE +22 36600841 G A 0.02644389 FALSE FALSE +22 36629633 C A -0.006871468 FALSE FALSE +22 36635967 G A -0.02634742 FALSE FALSE +22 36655735 A G -0.005385142 FALSE FALSE +22 36661646 A G -0.01560741 FALSE FALSE +22 36684354 C T -0.005170111 FALSE FALSE +22 36705622 A G 0.01713234 FALSE FALSE +22 36708049 C CTCCTGTGA -0.05187051 FALSE FALSE +22 36751101 A C -0.0244065 FALSE FALSE +22 36764788 G A 0.02784116 FALSE FALSE +22 36897427 C T 0.02603792 FALSE FALSE +22 36900806 G A 0.007366207 FALSE FALSE +22 36923144 T C -0.001875563 FALSE FALSE +22 36924714 G A -0.003632594 FALSE FALSE +22 36946643 T G 0.01333137 FALSE FALSE +22 36954939 T C 0.01105894 FALSE FALSE +22 36998907 T C -0.0006084687 FALSE FALSE +22 37001495 G T -0.01224147 FALSE FALSE +22 37013167 G C 0.01866849 FALSE FALSE +22 37077364 C T 0.007294257 FALSE FALSE +22 37080738 C G -0.004873355 FALSE FALSE +22 37101890 C T 0.03991764 FALSE FALSE +22 37118535 A G -0.001713909 FALSE FALSE +22 37184521 G A 0.006515894 FALSE FALSE +22 37206341 G T 0.0002566936 FALSE FALSE +22 37256262 A G 0.001152626 FALSE FALSE +22 37258503 C T 
-0.009761102 FALSE FALSE +22 37323988 T C -0.0073182 FALSE FALSE +22 37329545 G A 0.005775806 FALSE FALSE +22 37337409 T C -0.02534399 FALSE FALSE +22 37343000 A C -0.0004011777 FALSE FALSE +22 37398195 T C -0.01001198 FALSE FALSE +22 37401532 A G -0.003244795 FALSE FALSE +22 37407109 C G 0.04335972 FALSE FALSE +22 37477732 T C 0.0003669548 FALSE FALSE +22 37507019 A G -0.0009259451 FALSE FALSE +22 37513316 A G 0.001153887 FALSE FALSE +22 37532441 A G 0.01802306 FALSE FALSE +22 37571497 G A -0.005785311 FALSE FALSE +22 37581383 T C 0.03172492 FALSE FALSE +22 37621269 C A 0.004460405 FALSE FALSE +22 37644621 T C -0.008386907 FALSE FALSE +22 37671896 A G 0.02303688 FALSE FALSE +22 37679763 G A -0.002658396 FALSE FALSE +22 37720268 G A 0.02120184 FALSE FALSE +22 37753256 C T 0.008984539 FALSE FALSE +22 37757099 G A -0.01560347 FALSE FALSE +22 37780522 C G -0.01496708 FALSE FALSE +22 37800175 T C -0.005510833 FALSE FALSE +22 37846448 G A 0.01152963 FALSE FALSE +22 37896749 C T 0.005447068 FALSE FALSE +22 37908435 C T 0.001909131 FALSE FALSE +22 37977481 T C 0.01465308 FALSE FALSE +22 37992699 G A 0.0008339179 FALSE FALSE +22 38032762 G GA 0.01693041 FALSE FALSE +22 38054262 C A 0.04354146 FALSE FALSE +22 38083101 C T -0.02092117 FALSE FALSE +22 38119213 A G 0.03948165 FALSE FALSE +22 38122122 C T 0.04377277 FALSE FALSE +22 38204089 T C 0.02977743 FALSE FALSE +22 38435786 T G -0.007684278 FALSE FALSE +22 38544298 G A 0.05090446 FALSE FALSE +22 38597378 T G -0.01997927 FALSE FALSE +22 38606780 G A -0.009182016 FALSE FALSE +22 38630272 C T 0.007393137 FALSE FALSE +22 38663819 G A -0.006392021 FALSE FALSE +22 38673234 A G -0.01106705 FALSE FALSE +22 38685131 C T -0.004493352 FALSE FALSE +22 38695406 T C -0.01155972 FALSE FALSE +22 38708506 A G 0.01701713 FALSE FALSE +22 38744184 C T -0.02112956 FALSE FALSE +22 38819613 A G -0.005625806 FALSE FALSE +22 38877461 G T 0.001108728 FALSE FALSE +22 38918894 G T -0.008094286 FALSE FALSE +22 38928269 G T -0.02114917 FALSE FALSE +22 
39027286 C CAG 0.003840735 FALSE FALSE +22 39067524 G A 0.01200232 FALSE FALSE +22 39159201 C T 0.003096214 FALSE FALSE +22 39178701 G A 0.002148449 FALSE FALSE +22 39260032 T C 0.03574634 FALSE FALSE +22 39268785 T G 0.009377414 FALSE FALSE +22 39281774 G T 0.03816951 FALSE FALSE +22 39300265 C T 0.03540156 FALSE FALSE +22 39332623 T C -0.004449842 FALSE FALSE +22 39415780 G A 0.01479946 FALSE FALSE +22 39448465 A G 0.003065974 FALSE FALSE +22 39480697 G A -0.04005617 FALSE FALSE +22 39487665 G A -0.0001218988 FALSE FALSE +22 39493294 C T -0.03115929 FALSE FALSE +22 39510995 G A -0.02069106 FALSE FALSE +22 39542292 A G 0.009653575 FALSE FALSE +22 39543000 T C -0.004069841 FALSE FALSE +22 39573724 A C 0.02683694 FALSE FALSE +22 39575692 A C 0.01451305 FALSE FALSE +22 39581277 A C 0.01766406 FALSE FALSE +22 39626572 A G -0.02901981 FALSE FALSE +22 39658626 C T 0.004177065 FALSE FALSE +22 39665395 G A 0.01264611 FALSE FALSE +22 39687484 G A 0.005418141 FALSE FALSE +22 39708279 A G -0.04281532 FALSE FALSE +22 39708357 T C 0.008605574 FALSE FALSE +22 39793066 G T 0.03658209 FALSE FALSE +22 39798127 G A 0.002302129 FALSE FALSE +22 39843409 T C 0.01065699 FALSE FALSE +22 39865475 G A 0.001588501 FALSE FALSE +22 39932516 A G -0.01179841 FALSE FALSE +22 39963426 G A -0.01503908 FALSE FALSE +22 40023636 C T 0.006443146 FALSE FALSE +22 40046176 C T -0.0007416552 FALSE FALSE +22 40067818 T C 0.00455936 FALSE FALSE +22 40092864 G A 0.02400297 FALSE FALSE +22 40127293 T C -0.0008870038 FALSE FALSE +22 40358148 T C -0.01079902 FALSE FALSE +22 40420786 G C -0.008092115 FALSE FALSE +22 40454069 G T 0.00789888 FALSE FALSE +22 40541981 G A 0.0174264 FALSE FALSE +22 40652873 G A 0.005853057 FALSE FALSE +22 40676672 G T -0.001894274 FALSE FALSE +22 40729614 G A 0.0195994 FALSE FALSE +22 40820151 C T -0.01628066 FALSE FALSE +22 40986372 G C -0.01983507 FALSE FALSE +22 41494925 A G -0.02918069 FALSE FALSE +22 41646738 G A 0.0003521847 FALSE FALSE +22 41680898 T C 0.01402732 FALSE FALSE 
+22 41704872 T C 6.681484E-05 FALSE FALSE +22 41791536 C T -5.572333E-05 FALSE FALSE +22 41895409 A G -0.04407217 FALSE FALSE +22 41929175 G T -0.03186844 FALSE FALSE +22 42089623 C T 0.00532234 FALSE FALSE +22 42095658 G T 0.03846131 FALSE FALSE +22 42210985 C T -0.00313971 FALSE FALSE +22 42279653 G A -0.006596336 FALSE FALSE +22 42341308 G A -0.0006862491 FALSE FALSE +22 42524243 C CT -0.01181191 FALSE FALSE +22 42672124 G A -0.005278171 FALSE FALSE +22 42691238 T C -0.01642396 FALSE FALSE +22 42813753 C T -0.00386775 FALSE FALSE +22 42867898 G A -0.001352327 FALSE FALSE +22 42912097 T C -0.0007295657 FALSE FALSE +22 42932317 A G -0.05768556 FALSE FALSE +22 43010817 A G 0.01722077 FALSE FALSE +22 43080028 T C -0.0005527551 FALSE FALSE +22 43096507 T C -0.005556102 FALSE FALSE +22 43112475 T C -0.01350273 FALSE FALSE +22 43114824 G A -0.01963192 FALSE FALSE +22 43115576 C T -0.01880097 FALSE FALSE +22 43154299 G A -0.001621113 FALSE FALSE +22 43159948 T C -0.007980584 FALSE FALSE +22 43206950 C A -0.005783037 FALSE FALSE +22 43218397 C T -0.003976636 FALSE FALSE +22 43283255 C A -0.01426668 FALSE FALSE +22 43290583 C T -0.03955775 FALSE FALSE +22 43333156 A G -0.03127845 FALSE FALSE +22 43426262 G A -0.00366804 FALSE FALSE +22 43483242 T C -0.02540203 FALSE FALSE +22 43515108 C T -0.01570749 FALSE FALSE +22 43529314 C G 0.01738127 FALSE FALSE +22 43551513 G A 0.02565386 FALSE FALSE +22 43558972 A G -0.01962819 FALSE FALSE +22 43577214 T C -0.02270478 FALSE FALSE +22 43579049 C T -0.001193909 FALSE FALSE +22 43610207 G A -0.007621661 FALSE FALSE +22 43623395 G C -0.04852519 FALSE FALSE +22 43640512 C T -0.005533207 FALSE FALSE +22 43649701 C T 0.07724845 FALSE FALSE +22 43661080 T C -0.04251741 FALSE FALSE +22 43683088 A G -0.003582388 FALSE FALSE +22 43707996 A G -0.02547044 FALSE FALSE +22 43711080 C G -0.005784446 FALSE FALSE +22 43721519 C A 0.000365885 FALSE FALSE +22 43729401 C T 0.008557013 FALSE FALSE +22 43763757 T G -0.0178981 FALSE FALSE +22 43836198 G 
T 0.002427697 FALSE FALSE +22 43976396 A G -0.01277457 FALSE FALSE +22 44031042 C T 0.003593107 FALSE FALSE +22 44193626 C A -0.006865434 FALSE FALSE +22 44221247 G A 0.01833991 FALSE FALSE +22 44296372 T C 0.006169212 FALSE FALSE +22 44298838 A G 0.007441756 FALSE FALSE +22 44342116 G A 0.02810328 FALSE FALSE +22 44368122 G A 0.0129968 FALSE FALSE +22 44379838 G A 0.001648422 FALSE FALSE +22 44380033 C T -0.002136788 FALSE FALSE +22 44395451 C T -0.006698507 FALSE FALSE +22 44419871 C T 0.0181613 FALSE FALSE +22 44424108 T C 0.01036733 FALSE FALSE +22 44467899 C T -0.002592364 FALSE FALSE +22 44498134 T C 0.007281423 FALSE FALSE +22 44522312 C T -0.0002636447 FALSE FALSE +22 44526130 G A -0.00388298 FALSE FALSE +22 44530286 A G 0.02528159 FALSE FALSE +22 44530420 C T -0.01233654 FALSE FALSE +22 44548944 G A -0.003947209 FALSE FALSE +22 44551755 G A 0.01262458 FALSE FALSE +22 44566434 A G -0.004290306 FALSE FALSE +22 44581046 T C -0.0147995 FALSE FALSE +22 44643161 C T 0.01439493 FALSE FALSE +22 44677081 C T -0.01030513 FALSE FALSE +22 44681612 G A -0.001269762 FALSE FALSE +22 44695088 T C 0.006324859 FALSE FALSE +22 44707716 G T 0.002288939 FALSE FALSE +22 44725343 G A 0.003534678 FALSE FALSE +22 44738406 G A 0.02320049 FALSE FALSE +22 44746729 A G -0.01754216 FALSE FALSE +22 44751158 G A -0.006539695 FALSE FALSE +22 44757439 A G 0.02480295 FALSE FALSE +22 44759519 G A 0.002111274 FALSE FALSE +22 44761797 A T -0.00531172 FALSE FALSE +22 44763352 C G 0.01452737 FALSE FALSE +22 44783779 G A 0.009142699 FALSE FALSE +22 44791807 C T -0.02371876 FALSE FALSE +22 44818986 C T -0.006740622 FALSE FALSE +22 44894913 G A -5.179871E-05 FALSE FALSE +22 45058431 C T 0.01098259 FALSE FALSE +22 45066035 A G -0.01484374 FALSE FALSE +22 45069410 T C 0.01530441 FALSE FALSE +22 45081330 G A 0.00135012 FALSE FALSE +22 45082168 C A 0.003663354 FALSE FALSE +22 45090008 G A 0.002811861 FALSE FALSE +22 45116664 C T 0.01247728 FALSE FALSE +22 45244930 T C -0.01450041 FALSE FALSE +22 
45258457 G A -0.003500519 FALSE FALSE +22 45323989 T C 0.001111338 FALSE FALSE +22 45415987 A G -0.01398184 FALSE FALSE +22 45451355 G A -0.005566982 FALSE FALSE +22 45471607 C T 0.01148978 FALSE FALSE +22 45497738 C T -0.005029327 FALSE FALSE +22 45502829 C T -0.03893521 FALSE FALSE +22 45519040 T G 0.002377071 FALSE FALSE +22 45523391 A G 0.01318997 FALSE FALSE +22 45573450 C A 0.0043856 FALSE FALSE +22 45589490 G A -0.008350439 FALSE FALSE +22 45668012 T C 0.01286879 FALSE FALSE +22 45671343 G A -2.940682E-06 FALSE FALSE +22 45672574 T C 0.005743608 FALSE FALSE +22 45693923 A G -0.002675069 FALSE FALSE +22 45718743 G A -0.02092804 FALSE FALSE +22 45723807 C G 0.001670159 FALSE FALSE +22 45728370 A G 0.0001879231 FALSE FALSE +22 45741537 G T 0.01420045 FALSE FALSE +22 45749983 T G -0.04591012 FALSE FALSE +22 45809624 A C 0.002185772 FALSE FALSE +22 45821935 A G 0.02250782 FALSE FALSE +22 45837410 G A -0.002756449 FALSE FALSE +22 45846371 T C 0.07910102 FALSE FALSE +22 45864934 T C 0.008535181 FALSE FALSE +22 45871507 G C -0.007764056 FALSE FALSE +22 45892656 G T -0.003885653 FALSE FALSE +22 45897997 C T 0.0003935204 FALSE FALSE +22 45929577 C T -0.02532217 FALSE FALSE +22 45936350 A G -0.008001698 FALSE FALSE +22 45942726 T G -0.01415551 FALSE FALSE +22 45996298 G A 0.05643525 FALSE FALSE +22 46009063 G A 0.006464843 FALSE FALSE +22 46022070 G A 0.0224674 FALSE FALSE +22 46155548 G C -0.0324747 FALSE FALSE +22 46207955 C T -0.001354554 FALSE FALSE +22 46236425 A G 0.08398423 FALSE FALSE +22 46275529 T C 0.0022643 FALSE FALSE +22 46287720 A G -0.02237482 FALSE FALSE +22 46289699 T C 0.01872124 FALSE FALSE +22 46303347 T C -0.01283734 FALSE FALSE +22 46316057 A G 0.02312579 FALSE FALSE +22 46337043 G C 0.01701173 FALSE FALSE +22 46347519 C T 0.01574289 FALSE FALSE +22 46364161 A G -0.04466341 FALSE FALSE +22 46381234 G A 0.04730559 FALSE FALSE +22 46396925 G A 0.001783944 FALSE FALSE +22 46403715 A G -0.02132589 FALSE FALSE +22 46406782 A C 0.08439466 FALSE FALSE 
+22 46445002 G C -0.07613496 FALSE FALSE +22 46458123 G T 0.03328073 FALSE FALSE +22 46482948 C T 0.04241879 FALSE FALSE +22 46486508 C T -0.00968439 FALSE FALSE +22 46493852 T C -0.00675858 FALSE FALSE +22 46499120 C G -0.009873118 FALSE FALSE +22 46502870 T C -0.0179214 FALSE FALSE +22 46561713 G A 0.02604703 FALSE FALSE +22 46586110 A G -0.001256735 FALSE FALSE +22 46592168 C T 0.01417055 FALSE FALSE +22 46614274 G C -0.05854014 FALSE FALSE +22 46627603 T C 0.08004024 FALSE FALSE +22 46760086 T C 0.003229515 FALSE FALSE +22 46782382 T C -0.02470821 FALSE FALSE +22 46807234 C T 0.002324176 FALSE FALSE +22 46837114 G A 0.000944073 FALSE FALSE +22 46888399 T C 0.009911095 FALSE FALSE +22 46907779 G A 0.00653144 FALSE FALSE +22 46909355 T G -0.004780494 FALSE FALSE +22 46914277 A C 0.009689535 FALSE FALSE +22 46943687 G A -0.0130366 FALSE FALSE +22 46985917 A G 0.01893397 FALSE FALSE +22 47021226 G A -0.01322949 FALSE FALSE +22 47095235 A C -0.1156013 FALSE FALSE +22 47109621 C T 0.0004322858 FALSE FALSE +22 47125474 G A -0.01746025 FALSE FALSE +22 47147117 T C -0.02418349 FALSE FALSE +22 47156703 C T 0.0262897 FALSE FALSE +22 47245836 A G 0.001880575 FALSE FALSE +22 47271747 C T 0.001055264 FALSE FALSE +22 47301822 C T 0.003032158 FALSE FALSE +22 47345487 T C -0.002945945 FALSE FALSE +22 47372368 T C 0.02067644 FALSE FALSE +22 47380606 C T 0.04041426 FALSE FALSE +22 47437808 C T 0.001683027 FALSE FALSE +22 47450911 A G 0.01624479 FALSE FALSE +22 47511864 A C -0.004226735 FALSE FALSE +22 47519476 T C -0.003954111 FALSE FALSE +22 47529458 A G -0.0003602848 FALSE FALSE +22 47531320 T C -0.006899703 FALSE FALSE +22 47548321 T C 0.004925401 FALSE FALSE +22 47568291 C T 0.007726693 FALSE FALSE +22 47571203 A G -0.009744751 FALSE FALSE +22 47574009 C T -0.00532701 FALSE FALSE +22 47642100 T C 0.006976251 FALSE FALSE +22 47657635 T C 0.001798943 FALSE FALSE +22 47683805 C T -0.03475544 FALSE FALSE +22 47720973 T C -0.007868172 FALSE FALSE +22 47821952 G A -0.000885428 
FALSE FALSE +22 47893053 A G -0.02449056 FALSE FALSE +22 47935365 C T -0.001599879 FALSE FALSE +22 47961708 G T -0.003593525 FALSE FALSE +22 47986332 T C -0.003976592 FALSE FALSE +22 48154645 C T 0.007608639 FALSE FALSE +22 48165452 C CT 0.002039503 FALSE FALSE +22 48207318 T C -0.009725168 FALSE FALSE +22 48213904 G C -0.01220367 FALSE FALSE +22 48215904 A G -2.488244E-05 FALSE FALSE +22 48220460 T C -0.002702163 FALSE FALSE +22 48230941 C A -0.001129522 FALSE FALSE +22 48271961 A G -0.005053446 FALSE FALSE +22 48284025 T C -0.003344182 FALSE FALSE +22 48297953 C T -0.01046958 FALSE FALSE +22 48362290 G A -0.02367254 FALSE FALSE +22 48362914 C A -0.003167719 FALSE FALSE +22 48387670 A G -0.008243989 FALSE FALSE +22 48415446 C T 0.002130715 FALSE FALSE +22 48460730 T C 0.002682476 FALSE FALSE +22 48491160 T C 0.001257794 FALSE FALSE +22 48519794 C T 0.003680757 FALSE FALSE +22 48537775 G A 0.002134692 FALSE FALSE +22 48543566 T C 0.007314089 FALSE FALSE +22 48593037 C T 0.009084708 FALSE FALSE +22 48687509 C T -0.0277196 FALSE FALSE +22 48692033 T C -0.02126264 FALSE FALSE +22 48699617 T C 0.0005093107 FALSE FALSE +22 48717568 T C -0.0008190281 FALSE FALSE +22 48811946 C T 0.007916515 FALSE FALSE +22 48823357 G A 0.01464317 FALSE FALSE +22 48840428 A C 0.003711229 FALSE FALSE +22 48851612 T C -0.005887765 FALSE FALSE +22 48874310 T C -0.01106607 FALSE FALSE +22 48968070 C T 0.01280691 FALSE FALSE +22 48991385 T C -0.01234119 FALSE FALSE +22 49004050 G A 0.02290755 FALSE FALSE +22 49014565 A G 0.001555565 FALSE FALSE +22 49086481 T C -0.006196369 FALSE FALSE +22 49107173 T C 0.01277272 FALSE FALSE +22 49180915 A G 0.006346977 FALSE FALSE +22 49262579 A G 0.02657134 FALSE FALSE +22 49270317 C T 0.001447665 FALSE FALSE +22 49313196 A G -0.007055532 FALSE FALSE +22 49335230 T C -0.006548281 FALSE FALSE +22 49366123 T C 0.01136486 FALSE FALSE +22 49372356 G C -0.02420841 FALSE FALSE +22 49443666 T C 0.01581736 FALSE FALSE +22 49496835 G A -0.01355414 FALSE FALSE +22 
49524428 A G -0.004228482 FALSE FALSE +22 49530553 G C 0.008197389 FALSE FALSE +22 49537845 T C 0.0111255 FALSE FALSE +22 49557457 G A 0.009401926 FALSE FALSE +22 49562666 C A 0.01271701 FALSE FALSE +22 49574509 C T 0.0004703177 FALSE FALSE +22 49579141 A G 0.02448619 FALSE FALSE +22 49650863 T C 0.006739571 FALSE FALSE +22 49662549 T G -0.005769464 FALSE FALSE +22 49665841 T C -0.0007037069 FALSE FALSE +22 49677464 A G -0.02177735 FALSE FALSE +22 49696067 C T -0.003309682 FALSE FALSE +22 49700272 T G -0.002541948 FALSE FALSE +22 49706433 T C -0.01719402 FALSE FALSE +22 49713835 G A -0.01370754 FALSE FALSE +22 49719264 A C -0.01067852 FALSE FALSE +22 49743627 G A -0.0005970581 FALSE FALSE +22 49800265 C T 0.03098582 FALSE FALSE +22 49806863 A G 0.003940447 FALSE FALSE +22 49830851 C T -0.002742706 FALSE FALSE +22 49834624 G A -0.002820163 FALSE FALSE +22 49843235 G C -0.0004458281 FALSE FALSE +22 49847501 T G 0.002235016 FALSE FALSE +22 49861033 C T 0.01721243 FALSE FALSE +22 49881321 A G -0.00051278 FALSE FALSE +22 49908804 G A -0.009455892 FALSE FALSE +22 49911222 G T -0.01389666 FALSE FALSE +22 49925268 A G 0.01679984 FALSE FALSE +22 49927332 T C 0.00039298 FALSE FALSE +22 50109212 T C 0.01610819 FALSE FALSE +22 50118149 G C 0.007024666 FALSE FALSE +22 50184484 G T 0.01222581 FALSE FALSE +22 50219447 T C 0.05091891 FALSE FALSE +22 50278568 G A -0.02340672 FALSE FALSE +22 50319170 G A 0.01669806 FALSE FALSE +22 50350971 A G 0.0264016 FALSE FALSE +22 50356693 C T 0.003851499 FALSE FALSE +22 50435480 G A 0.0166363 FALSE FALSE +22 50439626 A G -0.002722154 FALSE FALSE +22 50466542 C T -0.002560094 FALSE FALSE +22 50470516 T C -0.01621986 FALSE FALSE +22 50491150 G A 0.01828674 FALSE FALSE +22 50515270 C T 0.01439904 FALSE FALSE +22 50529850 C T 0.02054628 FALSE FALSE +22 50570755 C G 0.007077514 FALSE FALSE +22 50582626 G A -0.003588854 FALSE FALSE +22 50672154 A G 0.007660848 FALSE FALSE +22 50722134 C T -0.01747164 FALSE FALSE +22 50722408 C T -0.001063465 FALSE 
FALSE +22 50728062 C T 0.02159223 FALSE FALSE +22 50750481 T C 0.01877272 FALSE FALSE +22 50758873 T C 0.004001731 FALSE FALSE +22 50835040 A G -0.006374259 FALSE FALSE +22 50859049 C T 0.0003480749 FALSE FALSE +22 50885775 G A -0.01358311 FALSE FALSE +22 50926768 T C 0.001798498 FALSE FALSE +22 50928026 A G 0.004775504 FALSE FALSE +22 50971266 C T 0.02160893 FALSE FALSE +22 50989197 T C -0.01328884 FALSE FALSE +22 50989326 G A 0.01037054 FALSE FALSE +22 50999681 G A -0.01226224 FALSE FALSE +22 51046163 T C -0.02754002 FALSE FALSE +22 51117580 C T 0.03573542 FALSE FALSE +22 51171497 A G -0.01951606 FALSE FALSE +22 51174939 T C -0.006178519 FALSE FALSE diff --git a/tests/test_combine.py b/tests/test_combine.py index db92cc9..edfa2c2 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -1,39 +1,135 @@ +import importlib.resources +import json from unittest.mock import patch -import jq -import pandas as pd import pytest -from pgscatalog_utils.download.Catalog import CatalogQuery, CatalogResult -from pgscatalog_utils.download.CatalogCategory import CatalogCategory from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles +from tests.data import combine -def test_combine_scorefiles(combined_scorefile, _n_variants): - df = pd.read_table(combined_scorefile) - cols = {'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', - 'is_duplicated', 'accession', 'row_nr'} - assert set(df.columns).issubset(cols) - assert df.shape[0] == _n_variants +def test_pgscatalog_combine(pgscatalog_path, tmp_path_factory, combine_output_header): + out_path = tmp_path_factory.mktemp("scores") / "combined.txt" + args: list[str] = ( + ["combine_scorefiles", "-t", "GRCh37", "-s"] + + [str(pgscatalog_path)] + + ["-o", str(out_path.resolve())] + ) + with patch("sys.argv", args): + combine_scorefiles() + with open(out_path) as f: + for i, line in enumerate(f): + if i == 0: + cols = line.strip().split("\t") + else: + break + assert 
not set(cols).difference(set(combine_output_header)) -def test_liftover(lifted_scorefiles): - df = pd.read_table(lifted_scorefiles) - assert df.shape[0] == 832 # approx size + with open(out_path.parent / "log_combined.json") as f: + header = json.load(f)[0] + assert header["PGS001229_22"]["pgs_id"] == "PGS001229" + assert header["PGS001229_22"]["pgs_name"] == "GBE_INI50" + assert header["PGS001229_22"]["trait_mapped"] == ["body height"] + assert header["PGS001229_22"]["trait_efo"] == ["EFO_0004339"] + assert header["PGS001229_22"]["genome_build"] == "GRCh37" + assert not header["PGS001229_22"]["use_harmonised"] -def test_fail_combine(scorefiles, tmp_path_factory): +def test_effect_type_combine(effect_type_path, tmp_path_factory, combine_output_header): # these genomes are in build GRCh37, so combining with -t GRCh38 will raise an exception - with pytest.raises(Exception): - out_path = tmp_path_factory.mktemp("scores") / "combined.txt" - args: list[str] = ['combine_scorefiles', '-t', 'GRCh38', '-s'] + scorefiles + ['-o', str(out_path.resolve())] - with patch('sys.argv', args): - combine_scorefiles() + out_path = tmp_path_factory.mktemp("scores") / "combined.txt" + args: list[str] = ( + ["combine_scorefiles", "-t", "GRCh37", "-s"] + + [str(effect_type_path)] + + ["-o", str(out_path.resolve())] + ) + with patch("sys.argv", args): + combine_scorefiles() + + n = -1 # skip header line + with open(out_path) as f: + for i, line in enumerate(f): + if i == 0: + cols = line.strip().split("\t") + + if i == 1: + assert line.strip().split("\t")[-3] == "dominant" + + if i == 2: + assert line.strip().split("\t")[-3] == "recessive" + + n += 1 + + assert not set(cols).difference(set(combine_output_header)) + + with open(out_path.parent / "log_combined.json") as f: + header = json.load(f)[0] + assert ( + header["scorefile_dominant_and_recessive"]["pgs_name"] + == "PGS001229_22_DominantRecessiveExample" + ) + assert header["scorefile_dominant_and_recessive"]["genome_build"] == 
"GRCh37" + assert header["scorefile_dominant_and_recessive"]["variants_number"] == n + assert not header["scorefile_dominant_and_recessive"]["use_harmonised"] + + +def test_custom_combine(custom_score_path, tmp_path_factory, combine_output_header): + # these genomes are in build GRCh37, so combining with -t GRCh38 will raise an exception + out_path = tmp_path_factory.mktemp("scores") / "combined.txt" + args: list[str] = ( + ["combine_scorefiles", "-t", "GRCh37", "-s"] + + [str(custom_score_path)] + + ["-o", str(out_path.resolve())] + ) + + with patch("sys.argv", args): + combine_scorefiles() + + # read combined file + n = -1 # skip header line + with open(out_path) as f: + for i, line in enumerate(f): + if i == 0: + cols = line.strip().split("\t") + n += 1 + assert not set(cols).difference(set(combine_output_header)) + + with open(out_path.parent / "log_combined.json") as f: + header = json.load(f)[0] + assert header["scorefile"]["pgs_name"] == "PGS001229_22" + assert header["scorefile"]["genome_build"] == "GRCh37" + assert header["scorefile"]["variants_number"] == n + assert not header["scorefile"]["use_harmonised"] + + +@pytest.fixture +def pgscatalog_path(scope="session"): + path = importlib.resources.files(combine) / "PGS001229_22.txt" + return path @pytest.fixture -def _n_variants(pgs_accessions): - result = CatalogQuery(CatalogCategory.SCORE, accession=pgs_accessions).get()[0] - json = result.response - n: list[int] = jq.compile("[.results][][].variants_number").input(json).all() - return sum(n) +def effect_type_path(scope="session"): + path = importlib.resources.files(combine) / "scorefile_dominant_and_recessive.txt" + return path + + +@pytest.fixture(scope="session") +def custom_score_path(tmp_path_factory): + path = importlib.resources.files(combine) / "scorefile.txt" + return path + + +@pytest.fixture(scope="session") +def combine_output_header(): + return [ + "chr_name", + "chr_position", + "effect_allele", + "other_allele", + "effect_weight", + 
"effect_type", + "accession", + "row_nr", + ] diff --git a/tests/test_liftover.py b/tests/test_liftover.py index b2f03a0..71f5d5f 100644 --- a/tests/test_liftover.py +++ b/tests/test_liftover.py @@ -1,9 +1,41 @@ -import pandas as pd +import copy -from pgscatalog_utils.scorefile.liftover import liftover +from pgscatalog_utils.scorefile.config import Config + +from pgscatalog_utils.download.GenomeBuild import GenomeBuild +from pgscatalog_utils.scorefile.liftover import liftover, create_liftover def test_liftover(hg38_coords, hg19_coords, chain_files): - lifted = liftover(hg38_coords, chain_files, min_lift=0.9, target_build='GRCh37') - coords: pd.DataFrame = hg19_coords[['lifted_pos', 'lifted_chr']] == lifted[['lifted_pos', 'lifted_chr']] - assert coords.all(axis=None) + Config.chain_dir = chain_files + Config.lo = create_liftover() + Config.min_lift = 0.95 + + hg38 = copy.deepcopy(hg38_coords) + lifted = list( + liftover( + hg38, + harmonised=False, + current_build=GenomeBuild.GRCh38, + target_build=GenomeBuild.GRCh37, + ) + ) + + assert [x["chr_position"] for x in lifted] == [ + x["chr_position"] for x in hg19_coords + ] + assert [x["chr_name"] for x in lifted] == [x["chr_name"] for x in hg19_coords] + + hg19 = copy.deepcopy(hg19_coords) + lift_back = list( + liftover( + hg19, + harmonised=False, + current_build=GenomeBuild.GRCh37, + target_build=GenomeBuild.GRCh38, + ) + ) + assert [x["chr_position"] for x in lift_back] == [ + x["chr_position"] for x in hg38_coords + ] + assert [x["chr_name"] for x in lift_back] == [x["chr_name"] for x in hg38_coords] From 032b1f70e067b280ee842fb5f9256dc8765d958a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 3 Nov 2023 13:13:18 +0000 Subject: [PATCH 13/40] fix test --- conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index ba3e065..d5d037c 100644 --- a/conftest.py +++ b/conftest.py @@ -25,7 +25,7 @@ def pgs_accessions(): @pytest.fixture(scope="session") def 
mini_score_path(tmp_path_factory): path = importlib.resources.files(combine) / "PGS001229_22.txt" - return path + return str(path) @pytest.fixture(scope="session") From 7864b7df689d6c9d995a55030f8c1ee68e880a50 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 3 Nov 2023 16:54:07 +0000 Subject: [PATCH 14/40] sqlite support and add log data --- .../scorefile/combine_scorefiles.py | 16 +-- pgscatalog_utils/scorefile/scoringfile.py | 14 +- pgscatalog_utils/scorefile/write.py | 130 +++++++++++++----- 3 files changed, 114 insertions(+), 46 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index fbd3082..fef762a 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -4,7 +4,6 @@ import pathlib import sys import textwrap -import time from pgscatalog_utils.config import set_logging_level from pgscatalog_utils.download.GenomeBuild import GenomeBuild @@ -34,7 +33,6 @@ def combine_scorefiles(): paths: list[str] = list(set(args.scorefiles)) # unique paths only logger.debug(f"Input scorefiles: {paths}") - start_time = time.time() sfs = [ScoringFile.from_path(x) for x in paths] target_build = GenomeBuild.from_string(args.target_build) @@ -46,20 +44,16 @@ def combine_scorefiles(): else: logger.info(f"All builds match target build {target_build}") - line_counts: dict[str, int] = write_combined(sfs, args.outfile) # provide line counts when making the scoring files - log = [] - for (k, v), sf in zip(line_counts.items(), sfs): - log.append(sf.generate_log(v)) + logs: dict[str, int] = write_combined(sfs, args.outfile) + json_log = [] + for (k, v), sf in zip(logs.items(), sfs): + json_log.append(sf.generate_log(v)) log_out_path = pathlib.Path(args.outfile).parent / args.logfile with open(log_out_path, "w") as f: logger.info(f"Writing log to {f.name}") - json.dump(log, f, indent=4) - - end_time = time.time() - elapsed_time = end_time - 
start_time - print(f"Elapsed time: {elapsed_time} seconds") + json.dump(json_log, f, indent=4) def _description_text() -> str: diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index 1b44c6c..f3eec11 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -77,7 +77,7 @@ def from_path(cls, path: pathlib.Path): accession=name, ) - def generate_log(self, line_count: int): + def generate_log(self, counted: typing.Counter): log = { key: str(value) if value is not None else None for key, value in self.header.__dict__.items() @@ -85,7 +85,15 @@ def generate_log(self, line_count: int): if log["variants_number"] is None: # custom scoring files might not have this information - log["variants_number"] = line_count + 1 # (0 indexed) + log["variants_number"] = counted["n_variants"] + 1 # (0 indexed) + + if ( + int(log["variants_number"]) != counted["n_variants"] + and not Config.drop_missing + ): + raise Exception( + f"Mismatch between variants_number and counted output {self.accession}" + ) # multiple terms may be separated with a pipe if log["trait_mapped"]: @@ -97,6 +105,7 @@ def generate_log(self, line_count: int): log["columns"] = self.fields log["use_liftover"] = Config.liftover log["use_harmonised"] = self.harmonised + log["sources"] = [k for k, v in counted.items() if k != "n_variants"] return {self.accession: log} @@ -147,6 +156,7 @@ def read_rows(csv_reader, fields: list[str], name: str, row_nr: int, wide: bool) "hm_chr", "hm_pos", "hm_inferOtherAllele", + "hm_source", "is_dominant", "is_recessive", "accession", diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index d0a32bb..1cd8826 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -2,6 +2,10 @@ import functools import gzip import logging +import os +import sqlite3 +import typing +from collections import Counter from itertools import islice 
import pgzip @@ -12,22 +16,10 @@ logger = logging.getLogger(__name__) -def write_combined(scoring_files: list[ScoringFile], out_path: str): - # compresslevel can be really slow, default is 9 - if out_path.endswith("gz") and Config.threads == 1: - logger.info("Writing with gzip (slow)") - open_function = functools.partial(gzip.open, compresslevel=6) - elif Config.threads > 1: - logger.info("Writing with pgzip (fast)") - open_function = functools.partial( - pgzip.open, compresslevel=6, thread=Config.threads, blocksize=2 * 10**8 - ) - else: - logger.info("Writing text file (fast)") - open_function = open - - with open_function(out_path, mode="wt") as f: - fieldnames = [ +class DataWriter: + def __init__(self, filename): + self.filename = filename + self.fieldnames = [ "chr_name", "chr_position", "effect_allele", @@ -37,21 +29,93 @@ def write_combined(scoring_files: list[ScoringFile], out_path: str): "accession", "row_nr", ] - writer = csv.DictWriter( - f, fieldnames=fieldnames, delimiter="\t", extrasaction="ignore" + + def write(self, batch): + pass + + +class TextFileWriter(DataWriter): + def __init__(self, compress, filename): + super().__init__(filename) + self.compress = compress + + def write(self, batch): + if self.compress and Config.threads == 1: + logger.info("Writing with gzip (slow)") + open_function = functools.partial(gzip.open, compresslevel=6) + elif self.compress and Config.threads > 1: + logger.info("Writing with pgzip (fast)") + open_function = functools.partial( + pgzip.open, compresslevel=6, thread=Config.threads, blocksize=2 * 10**8 + ) + else: + logger.info("Writing text file (fast)") + open_function = open + + mode = "at" if os.path.exists(self.filename) else "wt" + with open_function(self.filename, mode) as f: + writer = csv.DictWriter( + f, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore" + ) + if mode == "w": + writer.writeheader() + writer.writerows(batch) + + +class SqliteWriter(DataWriter): + def __init__(self, filename): 
+ super().__init__(filename) + + def write(self, batch): + conn = sqlite3.connect(self.filename) + cursor = conn.cursor() + placeholders = ", ".join("?" for _ in self.fieldnames) + + values = [ + tuple(row[key] for key in self.fieldnames if key in row) for row in batch + ] + + cursor.execute( + f"CREATE TABLE IF NOT EXISTS variants ({', '.join(self.fieldnames)})" ) - writer.writeheader() - - line_counts = {} - # write out in batches for compression efficiency and speed - for scoring_file in scoring_files: - logger.info(f"Writing {scoring_file.accession} variants") - while True: - batch = list(islice(scoring_file.variants, Config.batch_size)) - if not batch: - break - # calculate max row_nr now because it's when we finally generate variants - line_counts[scoring_file.accession] = max(x["row_nr"] for x in batch) - writer.writerows(batch) - - return line_counts + cursor.executemany(f"INSERT INTO variants VALUES ({placeholders})", values) + conn.commit() + conn.close() + + +def write_combined( + scoring_files: list[ScoringFile], out_path: str +) -> dict[str : typing.Counter]: + # compresslevel can be really slow, default is 9 + if out_path.endswith("gz"): + writer = TextFileWriter(compress=True, filename=out_path) + elif out_path.endswith("txt"): + writer = TextFileWriter(compress=False, filename=out_path) + elif out_path.endswith(".sqlite"): + writer = SqliteWriter(filename=out_path) + else: + raise Exception("Can't configure writer, please check out_path") + + counts = [] + log = {} + for scoring_file in scoring_files: + logger.info(f"Writing {scoring_file.accession} variants") + while True: + batch = list(islice(scoring_file.variants, Config.batch_size)) + if not batch: + break + writer.write(batch=batch) + counts = calculate_log(batch, counts) + + log[scoring_file.accession] = sum(counts, Counter()) + counts = [] + + return log + + +def calculate_log(batch, log: list[Counter]) -> list[Counter]: + # these statistics can only be generated while iterating through 
variants + n_variants = Counter("n_variants" for item in batch) + hm_source = Counter(item["hm_source"] for item in batch if "hm_source" in item) + log.extend([n_variants, hm_source]) + return log From afdc0b138c4ab6bedfec47fa77cac66c883c99be Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 6 Nov 2023 10:51:58 +0000 Subject: [PATCH 15/40] fix tests --- pgscatalog_utils/scorefile/qc.py | 21 ++++++------- pgscatalog_utils/scorefile/scoringfile.py | 10 ++++-- pgscatalog_utils/scorefile/write.py | 2 +- tests/test_combine.py | 38 ++++++++--------------- 4 files changed, 30 insertions(+), 41 deletions(-) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 8454fe4..7db5da0 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -19,9 +19,7 @@ def quality_control(variants, header: ScoringFileHeader, harmonised: bool, wide: variants = assign_other_allele(variants) if wide: - # wide data must be sorted because: - # - check_duplicates requires sorted input - # - output would be unsorted, which looks a little bit messy + # wide data must be sorted because check_duplicates requires sorted input variants = (x for x in sorted(variants, key=lambda x: x["accession"])) variants = check_duplicates(variants) @@ -89,10 +87,10 @@ def check_effect_weight(variants): for variant in variants: try: variant["effect_weight"] = float(variant["effect_weight"]) + yield variant except ValueError: logger.critical(f"{variant} has bad effect weight") raise ValueError - yield variant def assign_other_allele(variants): @@ -115,17 +113,16 @@ def assign_other_allele(variants): def assign_effect_type(variants): for variant in variants: - if "is_recessive" not in variant and "is_dominant" not in variant: - variant["effect_type"] = "additive" - else: - if variant["is_recessive"] == "TRUE": - variant["effect_type"] = "recessive" - elif variant["is_dominant"] == "TRUE": + match (variant.get("is_recessive"), 
variant.get("is_dominant")): + case (None, None) | ("FALSE", "FALSE"): + variant["effect_type"] = "additive" + case ("FALSE", "TRUE"): variant["effect_type"] = "dominant" - elif variant["is_recessive"] == "TRUE" and variant["is_dominant"] == "TRUE": + case ("TRUE", "FALSE"): + variant["effect_type"] = "recessive" + case _: logger.critical(f"Bad effect type setting: {variant}") raise Exception - yield variant diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index f3eec11..2b85e4c 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -6,9 +6,8 @@ from dataclasses import dataclass from itertools import islice -from pgscatalog_utils.scorefile.config import Config - from pgscatalog_utils.download.GenomeBuild import GenomeBuild +from pgscatalog_utils.scorefile.config import Config from pgscatalog_utils.scorefile.header import ScoringFileHeader, auto_open from pgscatalog_utils.scorefile.qc import quality_control @@ -85,7 +84,7 @@ def generate_log(self, counted: typing.Counter): if log["variants_number"] is None: # custom scoring files might not have this information - log["variants_number"] = counted["n_variants"] + 1 # (0 indexed) + log["variants_number"] = counted["n_variants"] if ( int(log["variants_number"]) != counted["n_variants"] @@ -171,6 +170,11 @@ def read_rows(csv_reader, fields: list[str], name: str, row_nr: int, wide: bool) row_nr += 1 +def parse_dict(variants): + # TODO: use best data types when parsing lines + pass + + def get_columns(path) -> tuple[int, list[str]]: open_function = auto_open(path) with open_function(path, mode="rt") as f: diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 1cd8826..54214fb 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -57,7 +57,7 @@ def write(self, batch): writer = csv.DictWriter( f, fieldnames=self.fieldnames, delimiter="\t", 
extrasaction="ignore" ) - if mode == "w": + if mode == "wt": writer.writeheader() writer.writerows(batch) diff --git a/tests/test_combine.py b/tests/test_combine.py index edfa2c2..109ad6d 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -8,37 +8,25 @@ from tests.data import combine -def test_pgscatalog_combine(pgscatalog_path, tmp_path_factory, combine_output_header): - out_path = tmp_path_factory.mktemp("scores") / "combined.txt" +def test_pgscatalog_combine(pgscatalog_path, tmp_path, combine_output_header): + out_path = tmp_path / "combined.txt" args: list[str] = ( ["combine_scorefiles", "-t", "GRCh37", "-s"] + [str(pgscatalog_path)] + ["-o", str(out_path.resolve())] ) - with patch("sys.argv", args): - combine_scorefiles() - with open(out_path) as f: - for i, line in enumerate(f): - if i == 0: - cols = line.strip().split("\t") - else: - break - assert not set(cols).difference(set(combine_output_header)) + # this mismatch occurs because header is from original PGS (~50,000) + # but variants are only from chr22 (~850) + with pytest.raises(Exception) as e: + with patch("sys.argv", args): + combine_scorefiles() + assert "Mismatch between variants_number and counted output" in str(e.value) - with open(out_path.parent / "log_combined.json") as f: - header = json.load(f)[0] - assert header["PGS001229_22"]["pgs_id"] == "PGS001229" - assert header["PGS001229_22"]["pgs_name"] == "GBE_INI50" - assert header["PGS001229_22"]["trait_mapped"] == ["body height"] - assert header["PGS001229_22"]["trait_efo"] == ["EFO_0004339"] - assert header["PGS001229_22"]["genome_build"] == "GRCh37" - assert not header["PGS001229_22"]["use_harmonised"] - -def test_effect_type_combine(effect_type_path, tmp_path_factory, combine_output_header): +def test_effect_type_combine(effect_type_path, tmp_path, combine_output_header): # these genomes are in build GRCh37, so combining with -t GRCh38 will raise an exception - out_path = tmp_path_factory.mktemp("scores") / "combined.txt" + 
out_path = tmp_path / "combined.txt" args: list[str] = ( ["combine_scorefiles", "-t", "GRCh37", "-s"] + [str(effect_type_path)] @@ -74,9 +62,9 @@ def test_effect_type_combine(effect_type_path, tmp_path_factory, combine_output_ assert not header["scorefile_dominant_and_recessive"]["use_harmonised"] -def test_custom_combine(custom_score_path, tmp_path_factory, combine_output_header): +def test_custom_combine(custom_score_path, tmp_path, combine_output_header): # these genomes are in build GRCh37, so combining with -t GRCh38 will raise an exception - out_path = tmp_path_factory.mktemp("scores") / "combined.txt" + out_path = tmp_path / "combined.txt" args: list[str] = ( ["combine_scorefiles", "-t", "GRCh37", "-s"] + [str(custom_score_path)] @@ -92,8 +80,8 @@ def test_custom_combine(custom_score_path, tmp_path_factory, combine_output_head for i, line in enumerate(f): if i == 0: cols = line.strip().split("\t") + assert not set(cols).difference(set(combine_output_header)) n += 1 - assert not set(cols).difference(set(combine_output_header)) with open(out_path.parent / "log_combined.json") as f: header = json.load(f)[0] From 1302d7a27cba02a48e0c0d7a64a258fca3b0827b Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 6 Nov 2023 12:55:06 +0000 Subject: [PATCH 16/40] fix tests --- .gitignore | 3 ++- tests/data/combine/PGS001229_22.txt | 2 +- tests/test_combine.py | 24 ++++++++++++++++++------ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index b0b6f3a..5ee9a36 100644 --- a/.gitignore +++ b/.gitignore @@ -157,4 +157,5 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-.idea/ \ No newline at end of file +.idea/ +.DS_Store diff --git a/tests/data/combine/PGS001229_22.txt b/tests/data/combine/PGS001229_22.txt index 5f791f4..4084c13 100644 --- a/tests/data/combine/PGS001229_22.txt +++ b/tests/data/combine/PGS001229_22.txt @@ -8,7 +8,7 @@ #trait_efo=EFO_0004339 #weight_type=NR #genome_build=GRCh37 -#variants_number=51209 +#variants_number=835 ##SOURCE INFORMATION #pgp_id=PGP000244 #citation=Tanigawa Y et al. medRxiv (2021). doi:10.1101/2021.09.02.21262942 diff --git a/tests/test_combine.py b/tests/test_combine.py index 109ad6d..f87b5ca 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -16,12 +16,24 @@ def test_pgscatalog_combine(pgscatalog_path, tmp_path, combine_output_header): + ["-o", str(out_path.resolve())] ) - # this mismatch occurs because header is from original PGS (~50,000) - # but variants are only from chr22 (~850) - with pytest.raises(Exception) as e: - with patch("sys.argv", args): - combine_scorefiles() - assert "Mismatch between variants_number and counted output" in str(e.value) + with patch("sys.argv", args): + combine_scorefiles() + + n = -1 # skip header line + with open(out_path) as f: + for i, line in enumerate(f): + if i == 0: + cols = line.strip().split("\t") + assert not set(cols).difference(set(combine_output_header)) + n += 1 + + with open(out_path.parent / "log_combined.json") as f: + header = json.load(f)[0] + assert header["PGS001229_22"]["pgs_id"] == "PGS001229" + assert header["PGS001229_22"]["pgs_name"] == "GBE_INI50" + assert header["PGS001229_22"]["genome_build"] == "GRCh37" + assert int(header["PGS001229_22"]["variants_number"]) == n + assert not header["PGS001229_22"]["use_harmonised"] def test_effect_type_combine(effect_type_path, tmp_path, combine_output_header): From 953497ac8542c0156880075bd0070e37a1d30ebd Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 6 Nov 2023 16:03:10 +0000 Subject: [PATCH 17/40] fixes to make old and new output consistent --- 
.../scorefile/combine_scorefiles.py | 3 +++ pgscatalog_utils/scorefile/qc.py | 2 +- pgscatalog_utils/scorefile/scoringfile.py | 11 ++++----- pgscatalog_utils/scorefile/write.py | 24 ++++++++++++------- 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index fef762a..5757017 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -30,6 +30,9 @@ def combine_scorefiles(): Config.chain_dir = args.chain_dir Config.lo = create_liftover() + if pathlib.Path(args.outfile).exists(): + raise FileExistsError(f"{args.outfile}") + paths: list[str] = list(set(args.scorefiles)) # unique paths only logger.debug(f"Input scorefiles: {paths}") diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 7db5da0..e881082 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -86,7 +86,7 @@ def drop_hla(variants): def check_effect_weight(variants): for variant in variants: try: - variant["effect_weight"] = float(variant["effect_weight"]) + float(variant["effect_weight"]) yield variant except ValueError: logger.critical(f"{variant} has bad effect weight") diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index 2b85e4c..2479801 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -111,9 +111,7 @@ def generate_log(self, counted: typing.Counter): @staticmethod def read_variants(path, fields, start_line, name: str, is_wide: bool): open_function = auto_open(path) - # row_nr and cum_batch are equivalent but - row_nr = 0 # important to increment in sub-generator for each line - cum_batch = 0 # sums batches in this function + row_nr = 0 with open_function(path, mode="rt") as f: for _ in range(start_line + 1): @@ -122,15 +120,16 @@ def read_variants(path, fields, 
start_line, name: str, is_wide: bool): while True: batch = list(islice(f, Config.batch_size)) - cum_batch += len(batch) if not batch: break csv_reader = csv.reader(batch, delimiter="\t") - yield from read_rows(csv_reader, fields, name, row_nr, is_wide) + yield from read_rows(csv_reader, fields, name, is_wide, row_nr) + # this is important for row_nr resets for each batch + row_nr += len(batch) -def read_rows(csv_reader, fields: list[str], name: str, row_nr: int, wide: bool): +def read_rows(csv_reader, fields: list[str], name: str, wide: bool, row_nr: int): for row in csv_reader: variant = dict(zip(fields, row)) diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 54214fb..1404560 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -26,9 +26,11 @@ def __init__(self, filename): "other_allele", "effect_weight", "effect_type", + "is_duplicated", "accession", "row_nr", ] + logger.info(f"Output filename: {filename}") def write(self, batch): pass @@ -39,23 +41,27 @@ def __init__(self, compress, filename): super().__init__(filename) self.compress = compress - def write(self, batch): if self.compress and Config.threads == 1: - logger.info("Writing with gzip (slow)") - open_function = functools.partial(gzip.open, compresslevel=6) + logger.info("Writing with gzip") + self.open_function = functools.partial(gzip.open, compresslevel=6) elif self.compress and Config.threads > 1: - logger.info("Writing with pgzip (fast)") - open_function = functools.partial( + logger.info("Writing with pgzip") + self.open_function = functools.partial( pgzip.open, compresslevel=6, thread=Config.threads, blocksize=2 * 10**8 ) else: - logger.info("Writing text file (fast)") - open_function = open + logger.info("Writing text file") + self.open_function = open + def write(self, batch): mode = "at" if os.path.exists(self.filename) else "wt" - with open_function(self.filename, mode) as f: + with 
self.open_function(self.filename, mode) as f: writer = csv.DictWriter( - f, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore" + f, + fieldnames=self.fieldnames, + delimiter="\t", + extrasaction="ignore", + lineterminator="\n", ) if mode == "wt": writer.writeheader() From cf3fc8bb9303e0af62bd75b9f5253b253494d34e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 6 Nov 2023 16:41:04 +0000 Subject: [PATCH 18/40] update tests --- tests/test_combine.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/test_combine.py b/tests/test_combine.py index f87b5ca..bc82faf 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -1,3 +1,4 @@ +import csv import importlib.resources import json from unittest.mock import patch @@ -47,17 +48,16 @@ def test_effect_type_combine(effect_type_path, tmp_path, combine_output_header): with patch("sys.argv", args): combine_scorefiles() - n = -1 # skip header line with open(out_path) as f: - for i, line in enumerate(f): - if i == 0: - cols = line.strip().split("\t") + n = 0 + for line in csv.DictReader(f, delimiter="\t"): + cols = list(line.keys()) - if i == 1: - assert line.strip().split("\t")[-3] == "dominant" + if int(line["row_nr"]) == 0: + assert line["effect_type"] == "dominant" - if i == 2: - assert line.strip().split("\t")[-3] == "recessive" + if int(line["row_nr"]) == 1: + assert line["effect_type"] == "recessive" n += 1 @@ -130,6 +130,7 @@ def combine_output_header(): "other_allele", "effect_weight", "effect_type", + "is_duplicated", "accession", "row_nr", ] From d0fcb8da4d13589b6e956e5db976c6882d46e977 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 7 Nov 2023 11:43:13 +0000 Subject: [PATCH 19/40] drop parallel gzip and --threads --- pgscatalog_utils/scorefile/combine_scorefiles.py | 4 ---- pgscatalog_utils/scorefile/config.py | 1 - pgscatalog_utils/scorefile/header.py | 15 ++------------- pgscatalog_utils/scorefile/write.py | 9 +-------- 4 files 
changed, 3 insertions(+), 26 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 5757017..2532f8b 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -19,7 +19,6 @@ def combine_scorefiles(): logger = logging.getLogger(__name__) set_logging_level(args.verbose) - Config.threads = args.threads Config.batch_size = 20000 Config.drop_missing = args.drop_missing Config.target_build = GenomeBuild.from_string(args.target_build) @@ -126,9 +125,6 @@ def _parse_args(args=None) -> argparse.Namespace: default=0.95, type=float, ) - parser.add_argument( - "--threads", dest="threads", required=False, default=1, type=int - ) parser.add_argument( "--drop_missing", dest="drop_missing", diff --git a/pgscatalog_utils/scorefile/config.py b/pgscatalog_utils/scorefile/config.py index 2725110..a7540fc 100644 --- a/pgscatalog_utils/scorefile/config.py +++ b/pgscatalog_utils/scorefile/config.py @@ -7,7 +7,6 @@ @dataclass class Config: - threads: int drop_missing: bool liftover: bool lo: pyliftover.liftover diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py index e9a03e4..82ea79d 100644 --- a/pgscatalog_utils/scorefile/header.py +++ b/pgscatalog_utils/scorefile/header.py @@ -1,11 +1,7 @@ -import functools import gzip import pathlib from dataclasses import dataclass -from pgscatalog_utils.scorefile.config import Config -from pgzip import pgzip - from pgscatalog_utils.download.GenomeBuild import GenomeBuild @@ -80,13 +76,6 @@ def _gen_header_lines(f): def auto_open(filepath): with open(filepath, "rb") as test_f: if test_f.read(2) == b"\x1f\x8b": - gzipped = True + return gzip.open else: - gzipped = False - - if gzipped and Config.threads > 1: - return functools.partial(pgzip.open, thread=Config.threads) - elif gzipped: - return gzip.open - elif not gzipped: - return open + return open diff --git 
a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 1404560..57ceb91 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -8,8 +8,6 @@ from collections import Counter from itertools import islice -import pgzip - from pgscatalog_utils.scorefile.config import Config from pgscatalog_utils.scorefile.scoringfile import ScoringFile @@ -41,14 +39,9 @@ def __init__(self, compress, filename): super().__init__(filename) self.compress = compress - if self.compress and Config.threads == 1: + if self.compress: logger.info("Writing with gzip") self.open_function = functools.partial(gzip.open, compresslevel=6) - elif self.compress and Config.threads > 1: - logger.info("Writing with pgzip") - self.open_function = functools.partial( - pgzip.open, compresslevel=6, thread=Config.threads, blocksize=2 * 10**8 - ) else: logger.info("Writing text file") self.open_function = open From e9e06e1dacf861ac7eeba64dd2b376b789a5e874 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 16 Nov 2023 16:30:26 +0000 Subject: [PATCH 20/40] create ScoreVariant and EffectType classes --- pgscatalog_utils/scorefile/effecttype.py | 10 +++ pgscatalog_utils/scorefile/scorevariant.py | 72 ++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 pgscatalog_utils/scorefile/effecttype.py create mode 100644 pgscatalog_utils/scorefile/scorevariant.py diff --git a/pgscatalog_utils/scorefile/effecttype.py b/pgscatalog_utils/scorefile/effecttype.py new file mode 100644 index 0000000..0d51f14 --- /dev/null +++ b/pgscatalog_utils/scorefile/effecttype.py @@ -0,0 +1,10 @@ +from enum import Enum + + +class EffectType(Enum): + RECESSIVE = "recessive" + DOMINANT = "dominant" + ADDITIVE = "additive" + + def __str__(self): + return str(self.value) diff --git a/pgscatalog_utils/scorefile/scorevariant.py b/pgscatalog_utils/scorefile/scorevariant.py new file mode 100644 index 0000000..3b258a6 --- /dev/null +++ 
b/pgscatalog_utils/scorefile/scorevariant.py @@ -0,0 +1,72 @@ +""" +This module contains the class ScoreVariant, which is a custom dictionary used to consistently represent rows in a PGS Catalog scoring file +""" +import collections + +from pgscatalog_utils.scorefile.effecttype import EffectType + + +class ScoreVariant(collections.UserDict): + """A single variant from a scoring file structured to follow PGS Catalog standards, + typically extracted from a row in a scoring file. + + See https://www.pgscatalog.org/downloads/#dl_scoring_files for field descriptions. + + This class is intentionally simple (a dict that checks for mandatory keys and fills + optional keys) because a more complicated __init__ will be slow when lots of variants + are read from a file. dicts use fast C magic, so try not to interfere too much. + + Some additional keys are included for quality control: + - accession: a unique identifier to group variants in the same score) + - row_nr: an incrementing integer, used to track the number of variants in an accession + - is_duplicated: a label to mark variants with the same coordinates and alleles + - effect_type: additive, recessive, or dominant + + >>> variant = ScoreVariant(**{"chr_name": "1", "chr_position": 1, "effect_allele": "A", "other_allele": "G", "effect_weight": 0.5, "accession": "PGS000822", "row_nr": 0}) + >>> variant + {'chr_name': '1', 'chr_position': 1, 'effect_allele': 'A', 'other_allele': 'G', 'effect_weight': 0.5, 'accession': 'PGS000822', 'row_nr': 0, 'rsID': None, 'hm_chr': None, 'hm_pos': None, 'hm_inferOtherAllele': None, 'hm_source': None, 'is_dominant': None, 'is_recessive': None, 'hm_rsID': None, 'hm_match_chr': None, 'hm_match_pos': None, 'is_duplicated': None, 'effect_type': } + + Mandatory data fields match PGS Catalog harmonised data standards: + + >>> ScoreVariant(**{"chr_name": "1", "chr_position": 1}) + Traceback (most recent call last): + ... + ValueError: Mandatory field 'effect_allele' is missing. 
+ """ + + mandatory_fields: tuple[str] = ( + "chr_name", + "chr_position", + "effect_allele", + "effect_weight", + "accession", + "row_nr", + ) + optional_fields: tuple[str] = ( + "rsID", + "other_allele", + "hm_chr", + "hm_pos", + "hm_inferOtherAllele", + "hm_source", + "is_dominant", + "is_recessive", + "hm_rsID", + "hm_match_chr", + "hm_match_pos", + "is_duplicated", + ) + + def __init__(self, **kwargs): + super().__init__(**kwargs) # creates the dict + + for field in self.mandatory_fields: + if field not in self.data: + raise ValueError(f"Mandatory field '{field}' is missing.") + + # set most optional fields to None... + for field in self.optional_fields: + self.data.setdefault(field, None) + + # ... except effect type, as the vast majority of variants are additive + self.data.setdefault("effect_type", EffectType.ADDITIVE) From eef2da6736cbc3126ecde102f7b85e7f11c0f3f7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 16 Nov 2023 16:31:37 +0000 Subject: [PATCH 21/40] review comments --- pgscatalog_utils/scorefile/liftover.py | 2 + pgscatalog_utils/scorefile/qc.py | 97 ++++++++++++----------- pgscatalog_utils/scorefile/scoringfile.py | 45 +++-------- 3 files changed, 65 insertions(+), 79 deletions(-) diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index 8097b70..255fcf2 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -46,6 +46,8 @@ def liftover( yield variant n_lifted += 1 else: + variant["chr_name"] = None + variant["chr_position"] = None variant["lifted"] = False yield variant n += 1 diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index e881082..d98acee 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -2,6 +2,7 @@ import typing from pgscatalog_utils.scorefile.config import Config +from pgscatalog_utils.scorefile.effecttype import EffectType from pgscatalog_utils.scorefile.header 
import ScoringFileHeader from pgscatalog_utils.scorefile.liftover import liftover @@ -9,7 +10,23 @@ def quality_control(variants, header: ScoringFileHeader, harmonised: bool, wide: bool): + # order is important for: + # 1. liftover non-harmonised data (quite rare), failed lifts get None'd + # 2. remap harmonised data, failed harmonisations get None'd + # 3. check and optionally drop bad variants + # where a bad variant has None in a mandatory ScoreVariant field + # then continue with other QC + + if Config.liftover: + variants = liftover( + variants, + harmonised=harmonised, + current_build=header.genome_build, + target_build=Config.target_build, + ) + variants = remap_harmonised(variants, harmonised) + variants = check_bad_variant(variants) if Config.drop_missing: variants = drop_hla(variants) @@ -24,14 +41,6 @@ def quality_control(variants, header: ScoringFileHeader, harmonised: bool, wide: variants = check_duplicates(variants) - if Config.liftover: - variants = liftover( - variants, - harmonised=harmonised, - current_build=header.genome_build, - target_build=Config.target_build, - ) - return variants @@ -75,10 +84,12 @@ def check_duplicates(variants): def drop_hla(variants): n_dropped = 0 for variant in variants: - if variant["effect_allele"] != "P" or variant["effect_allele"] != "N": - yield variant - else: - n_dropped += 1 + match variant: + case {"effect_allele": "P"} | {"effect_allele": "N"}: + n_dropped += 1 + continue + case _: + yield variant logger.warning(f"{n_dropped} HLA alleles detected and dropped") @@ -96,12 +107,8 @@ def check_effect_weight(variants): def assign_other_allele(variants): n_dropped = 0 for variant in variants: - if "other_allele" in variant: - if "/" in variant["other_allele"]: - # drop multiple other alleles - n_dropped += 1 - variant["other_allele"] = None - else: + if "/" in variant["other_allele"]: + n_dropped += 1 variant["other_allele"] = None yield variant @@ -115,11 +122,11 @@ def assign_effect_type(variants): for variant in 
variants: match (variant.get("is_recessive"), variant.get("is_dominant")): case (None, None) | ("FALSE", "FALSE"): - variant["effect_type"] = "additive" + pass # default value is additive case ("FALSE", "TRUE"): - variant["effect_type"] = "dominant" + variant["effect_type"] = EffectType.DOMINANT case ("TRUE", "FALSE"): - variant["effect_type"] = "recessive" + variant["effect_type"] = EffectType.RECESSIVE case _: logger.critical(f"Bad effect type setting: {variant}") raise Exception @@ -127,37 +134,33 @@ def assign_effect_type(variants): def remap_harmonised(variants, harmonised: bool): - n_bad = 0 if harmonised: for variant in variants: - if variant["hm_chr"]: - variant["chr_name"] = variant["hm_chr"] - - if variant["hm_pos"]: - variant["chr_position"] = variant["hm_pos"] - - if "hm_inferOtherAllele" in variant and variant.get("other_allele") is None: + # using the harmonised field in the header to make sure we don't accidentally overwrite + # positions with empty data (e.g. in an unharmonised file) + # if harmonisation has failed we _always_ want to use that information + variant["chr_name"] = variant["hm_chr"] + variant["chr_position"] = variant["hm_pos"] + if variant["other_allele"] is None: variant["other_allele"] = variant["hm_inferOtherAllele"] - - if ( - "chr_name" in variant - and "chr_position" in variant - and "effect_weight" in variant - ): - yield variant - elif Config.drop_missing: - continue - # (don't yield anything, filtering out missing variants) - else: - # assume a bad harmonisation with no genomic coordinates - # these will get labelled as duplicates eventually (probably) - variant["chr_name"] = None - variant["chr_position"] = None - yield variant - n_bad += 1 + yield variant else: for variant in variants: + # can't remap, so don't try yield variant + +def check_bad_variant(variants): + n_bad = 0 + for variant in variants: + match variant: + case {"chr_name": None} | {"chr_position": None} | {"effect_allele": None}: + # (effect weight checked 
separately) + n_bad += 1 + if not Config.drop_missing: + yield variant + case _: + yield variant + if n_bad > 1: - logger.warning(f"{n_bad} variants failed harmonisation") + logger.warning(f"{n_bad} bad variants") diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index 2479801..e18ae00 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -10,6 +10,7 @@ from pgscatalog_utils.scorefile.config import Config from pgscatalog_utils.scorefile.header import ScoringFileHeader, auto_open from pgscatalog_utils.scorefile.qc import quality_control +from pgscatalog_utils.scorefile.scorevariant import ScoreVariant logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -129,7 +130,9 @@ def read_variants(path, fields, start_line, name: str, is_wide: bool): row_nr += len(batch) -def read_rows(csv_reader, fields: list[str], name: str, wide: bool, row_nr: int): +def read_rows( + csv_reader, fields: list[str], name: str, wide: bool, row_nr: int +) -> typing.Generator[ScoreVariant, None, None]: for row in csv_reader: variant = dict(zip(fields, row)) @@ -138,42 +141,20 @@ def read_rows(csv_reader, fields: list[str], name: str, wide: bool, row_nr: int) i for i, x in enumerate(["effect_weight_" in x for x in fields]) if x ] for i, weight_name in zip(ew_col_idxs, [fields[i] for i in ew_col_idxs]): - keys = ["chr_name", "chr_position", "effect_allele", "other_allele"] - yield {k: variant[k] for k in keys if k in variant} | { - "accession": weight_name, - "row_nr": row_nr, - "effect_weight": variant[weight_name], - } + yield ScoreVariant( + **variant, + **{ + "accession": weight_name, + "row_nr": row_nr, + "effect_weight": variant[weight_name], + }, + ) else: - keys = [ - "chr_name", - "chr_position", - "effect_allele", - "other_allele", - "effect_weight", - "hm_chr", - "hm_pos", - "hm_inferOtherAllele", - "hm_source", - "is_dominant", - "is_recessive", - "accession", 
- "row_nr", - ] - - yield {k: variant[k] for k in keys if k in variant} | { - "accession": name, - "row_nr": row_nr, - } + yield ScoreVariant(**variant, **{"accession": name, "row_nr": row_nr}) row_nr += 1 -def parse_dict(variants): - # TODO: use best data types when parsing lines - pass - - def get_columns(path) -> tuple[int, list[str]]: open_function = auto_open(path) with open_function(path, mode="rt") as f: From ee16684aa0f508bb76c9d3f1af39019e2699e608 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 16 Nov 2023 16:48:50 +0000 Subject: [PATCH 22/40] add type hints --- pgscatalog_utils/scorefile/liftover.py | 9 ++++-- pgscatalog_utils/scorefile/qc.py | 38 +++++++++++++++++------ pgscatalog_utils/scorefile/scoringfile.py | 17 +++++----- 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index 255fcf2..7ccd5a0 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -1,17 +1,22 @@ import logging import os +import typing import pyliftover from pgscatalog_utils.download.GenomeBuild import GenomeBuild from pgscatalog_utils.scorefile.config import Config +from pgscatalog_utils.scorefile.scorevariant import ScoreVariant logger = logging.getLogger(__name__) def liftover( - variants, harmonised: bool, current_build: GenomeBuild, target_build: GenomeBuild -): + variants: typing.Generator[ScoreVariant, None, None], + harmonised: bool, + current_build: GenomeBuild, + target_build: GenomeBuild, +) -> typing.Generator[ScoreVariant, None, None]: if harmonised: skip_lo = True elif target_build == current_build: diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index d98acee..75f50ba 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -5,11 +5,17 @@ from pgscatalog_utils.scorefile.effecttype import EffectType from pgscatalog_utils.scorefile.header import 
ScoringFileHeader from pgscatalog_utils.scorefile.liftover import liftover +from pgscatalog_utils.scorefile.scorevariant import ScoreVariant logger = logging.getLogger(__name__) -def quality_control(variants, header: ScoringFileHeader, harmonised: bool, wide: bool): +def quality_control( + variants: typing.Generator[ScoreVariant, None, None], + header: ScoringFileHeader, + harmonised: bool, + wide: bool, +) -> typing.Generator[ScoreVariant, None, None]: # order is important for: # 1. liftover non-harmonised data (quite rare), failed lifts get None'd # 2. remap harmonised data, failed harmonisations get None'd @@ -44,7 +50,9 @@ def quality_control(variants, header: ScoringFileHeader, harmonised: bool, wide: return variants -def check_duplicates(variants): +def check_duplicates( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: seen_ids: dict = {} current_accession: typing.Union[str, None] = None n_duplicates: int = 0 @@ -81,7 +89,9 @@ def check_duplicates(variants): ) -def drop_hla(variants): +def drop_hla( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: n_dropped = 0 for variant in variants: match variant: @@ -94,7 +104,9 @@ def drop_hla(variants): logger.warning(f"{n_dropped} HLA alleles detected and dropped") -def check_effect_weight(variants): +def check_effect_weight( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: for variant in variants: try: float(variant["effect_weight"]) @@ -104,7 +116,9 @@ def check_effect_weight(variants): raise ValueError -def assign_other_allele(variants): +def assign_other_allele( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: n_dropped = 0 for variant in variants: if "/" in variant["other_allele"]: @@ -118,11 +132,13 @@ def assign_other_allele(variants): logger.warning("Other allele for these variants is set to 
missing") -def assign_effect_type(variants): +def assign_effect_type( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: for variant in variants: match (variant.get("is_recessive"), variant.get("is_dominant")): case (None, None) | ("FALSE", "FALSE"): - pass # default value is additive + pass # default value is additive, pass to break match and yield case ("FALSE", "TRUE"): variant["effect_type"] = EffectType.DOMINANT case ("TRUE", "FALSE"): @@ -133,7 +149,9 @@ def assign_effect_type(variants): yield variant -def remap_harmonised(variants, harmonised: bool): +def remap_harmonised( + variants: typing.Generator[ScoreVariant, None, None], harmonised: bool +) -> typing.Generator[ScoreVariant, None, None]: if harmonised: for variant in variants: # using the harmonised field in the header to make sure we don't accidentally overwrite @@ -150,7 +168,9 @@ def remap_harmonised(variants, harmonised: bool): yield variant -def check_bad_variant(variants): +def check_bad_variant( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: n_bad = 0 for variant in variants: match variant: diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index e18ae00..976b969 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -12,7 +12,6 @@ from pgscatalog_utils.scorefile.qc import quality_control from pgscatalog_utils.scorefile.scorevariant import ScoreVariant -logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -24,7 +23,7 @@ class ScoringFile: genome_build: typing.Union[GenomeBuild, None] harmonised: bool fields: list[str] - variants: typing.Generator + variants: typing.Generator[ScoreVariant, None, None] def __post_init__(self): if self.header.HmPOS_build: @@ -57,13 +56,15 @@ def from_path(cls, path: pathlib.Path): is_wide = detect_wide(cols) logger.info(f"Lazily 
reading variants from {path}") - variants = ScoringFile.read_variants( + variants: typing.Generator[ + ScoreVariant, None, None + ] = ScoringFile.read_variants( path=path, start_line=start_line, fields=cols, name=name, is_wide=is_wide ) - # note: these generator expressions aren't doing a bunch of iterations + # note: the qc generators aren't doing a bunch of nested iterations # it's just a data processing pipeline - variants = quality_control( + variants: typing.Generator[ScoreVariant, None, None] = quality_control( variants, header=header, harmonised=harmonised, wide=is_wide ) @@ -110,7 +111,9 @@ def generate_log(self, counted: typing.Counter): return {self.accession: log} @staticmethod - def read_variants(path, fields, start_line, name: str, is_wide: bool): + def read_variants( + path, fields, start_line, name: str, is_wide: bool + ) -> typing.Generator[ScoreVariant, None, None]: open_function = auto_open(path) row_nr = 0 @@ -126,7 +129,7 @@ def read_variants(path, fields, start_line, name: str, is_wide: bool): csv_reader = csv.reader(batch, delimiter="\t") yield from read_rows(csv_reader, fields, name, is_wide, row_nr) - # this is important for row_nr resets for each batch + # this is important because row_nr resets for each batch row_nr += len(batch) From 6be1dd3571c37d71e3910486aca1b1c9ced1b492 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 16 Nov 2023 17:05:37 +0000 Subject: [PATCH 23/40] remove coordinates from mandatory fields --- pgscatalog_utils/scorefile/scorevariant.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/scorefile/scorevariant.py b/pgscatalog_utils/scorefile/scorevariant.py index 3b258a6..5bbc307 100644 --- a/pgscatalog_utils/scorefile/scorevariant.py +++ b/pgscatalog_utils/scorefile/scorevariant.py @@ -35,14 +35,14 @@ class ScoreVariant(collections.UserDict): """ mandatory_fields: tuple[str] = ( - "chr_name", - "chr_position", "effect_allele", "effect_weight", 
"accession", "row_nr", ) optional_fields: tuple[str] = ( + "chr_name", + "chr_position", "rsID", "other_allele", "hm_chr", @@ -64,6 +64,20 @@ def __init__(self, **kwargs): if field not in self.data: raise ValueError(f"Mandatory field '{field}' is missing.") + # note on coordinates / rsID not being mandatory + # ---------------------------------------------- + # according to PGS Catalog scoring file standards: + # - rsID is mandatory if genomic coordinates are missing + # - genomic coordinates are mandatory if rsIDs are missing + # however I want to keep __init__ as simple (and fast) as possible + # millions of ScoreVariants may be instantiated + # so don't check, just initialise to None if missing + + # practically speaking: + # 1) harmonised files may be missing coordinates, but have hm columns which we then use + # 2) we loudly warn about variants that are missing coordinates + # 3) custom scorefiles are expected to supply coordinates + # set most optional fields to None... for field in self.optional_fields: self.data.setdefault(field, None) From ea16f4e8f74fb5dd6f25100d97ae1192188cab92 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 20 Nov 2023 10:04:57 +0000 Subject: [PATCH 24/40] fix old scoring files --- pgscatalog_utils/download/GenomeBuild.py | 10 +++++++--- pgscatalog_utils/scorefile/qc.py | 2 +- pgscatalog_utils/scorefile/scoringfile.py | 2 +- .../scorefile/{header.py => scoringfileheader.py} | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) rename pgscatalog_utils/scorefile/{header.py => scoringfileheader.py} (96%) diff --git a/pgscatalog_utils/download/GenomeBuild.py b/pgscatalog_utils/download/GenomeBuild.py index 23c8984..69fd8ab 100644 --- a/pgscatalog_utils/download/GenomeBuild.py +++ b/pgscatalog_utils/download/GenomeBuild.py @@ -4,6 +4,8 @@ class GenomeBuild(Enum): GRCh37 = "GRCh37" GRCh38 = "GRCh38" + # just included to handle older files, incompatible unless harmonised: + NCBI36 = "NCBI36" # ew def __str__(self): return 
str(self.value) @@ -11,11 +13,13 @@ def __str__(self): @classmethod def from_string(cls, build): match build: - case "GRCh37" | "hg18": + case "GRCh37" | "hg19": return cls(GenomeBuild.GRCh37) - case "GRCh38" | "hg19": + case "GRCh38" | "hg38": return cls(GenomeBuild.GRCh38) case "NR": return None + case "NCBI36" | "hg18": + return cls(GenomeBuild.NCBI36) case _: - raise Exception + raise Exception(f"Can't match {build=}") diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 75f50ba..5282bdd 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -3,7 +3,7 @@ from pgscatalog_utils.scorefile.config import Config from pgscatalog_utils.scorefile.effecttype import EffectType -from pgscatalog_utils.scorefile.header import ScoringFileHeader +from pgscatalog_utils.scorefile.scoringfileheader import ScoringFileHeader from pgscatalog_utils.scorefile.liftover import liftover from pgscatalog_utils.scorefile.scorevariant import ScoreVariant diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index 976b969..95dff01 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -8,7 +8,7 @@ from pgscatalog_utils.download.GenomeBuild import GenomeBuild from pgscatalog_utils.scorefile.config import Config -from pgscatalog_utils.scorefile.header import ScoringFileHeader, auto_open +from pgscatalog_utils.scorefile.scoringfileheader import ScoringFileHeader, auto_open from pgscatalog_utils.scorefile.qc import quality_control from pgscatalog_utils.scorefile.scorevariant import ScoreVariant diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/scoringfileheader.py similarity index 96% rename from pgscatalog_utils/scorefile/header.py rename to pgscatalog_utils/scorefile/scoringfileheader.py index 82ea79d..07cf663 100644 --- a/pgscatalog_utils/scorefile/header.py +++ 
b/pgscatalog_utils/scorefile/scoringfileheader.py @@ -45,7 +45,7 @@ def from_path(cls, path: pathlib.Path): return ScoringFileHeader(**header_dict) else: # no header available - raise Exception("No header detected in scoring file") + raise Exception(f"No header detected in scoring file {path=}") def raw_header_to_dict(header): From 7c4c84843d98d1d38a611e3b8b395b9b6fa74b88 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 20 Nov 2023 17:59:43 +0000 Subject: [PATCH 25/40] check effect alleles and compelx scoring files --- pgscatalog_utils/scorefile/effectallele.py | 12 ++++++++++ pgscatalog_utils/scorefile/qc.py | 18 +++++++++++++++ pgscatalog_utils/scorefile/scoringfile.py | 3 ++- pgscatalog_utils/scorefile/write.py | 26 ++++++++++++++++++++-- 4 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 pgscatalog_utils/scorefile/effectallele.py diff --git a/pgscatalog_utils/scorefile/effectallele.py b/pgscatalog_utils/scorefile/effectallele.py new file mode 100644 index 0000000..92dafd9 --- /dev/null +++ b/pgscatalog_utils/scorefile/effectallele.py @@ -0,0 +1,12 @@ +import logging + +logger = logging.getLogger(__name__) + + +class EffectAllele: + # (class attribute, so shared) + _valid_bases = frozenset({"A", "C", "T", "G"}) + + @classmethod + def is_valid(cls, effect_allele: str) -> bool: + return not frozenset(effect_allele) - cls._valid_bases diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 5282bdd..d014aa0 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -1,6 +1,8 @@ import logging import typing +from pgscatalog_utils.scorefile.effectallele import EffectAllele + from pgscatalog_utils.scorefile.config import Config from pgscatalog_utils.scorefile.effecttype import EffectType from pgscatalog_utils.scorefile.scoringfileheader import ScoringFileHeader @@ -22,6 +24,7 @@ def quality_control( # 3. 
check and optionally drop bad variants # where a bad variant has None in a mandatory ScoreVariant field # then continue with other QC + logger.info(f"Starting quality control checks for {header.pgs_id=}") if Config.liftover: variants = liftover( @@ -40,6 +43,7 @@ def quality_control( variants = assign_effect_type(variants) variants = check_effect_weight(variants) variants = assign_other_allele(variants) + variants = check_effect_allele(variants) if wide: # wide data must be sorted because check_duplicates requires sorted input @@ -184,3 +188,17 @@ def check_bad_variant( if n_bad > 1: logger.warning(f"{n_bad} bad variants") + + +def check_effect_allele( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: + n_bad = 0 + for variant in variants: + if not EffectAllele.is_valid(variant["effect_allele"]): + n_bad += 1 + + yield variant + + if n_bad > 1: + logger.warning(f"{n_bad} variants have invalid effect alleles (not ACTG)") diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index 95dff01..15267d5 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -91,9 +91,10 @@ def generate_log(self, counted: typing.Counter): if ( int(log["variants_number"]) != counted["n_variants"] and not Config.drop_missing + and counted.get("complex", 0) == 0 ): raise Exception( - f"Mismatch between variants_number and counted output {self.accession}" + f"Mismatch between header ({log['variants_number']}) and counted output ({counted['n_variants']}) for {self.accession}" ) # multiple terms may be separated with a pipe diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 57ceb91..db8fd31 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -9,6 +9,8 @@ from itertools import islice from pgscatalog_utils.scorefile.config import Config +from 
pgscatalog_utils.scorefile.effectallele import EffectAllele +from pgscatalog_utils.scorefile.scorevariant import ScoreVariant from pgscatalog_utils.scorefile.scoringfile import ScoringFile logger = logging.getLogger(__name__) @@ -112,9 +114,29 @@ def write_combined( return log -def calculate_log(batch, log: list[Counter]) -> list[Counter]: +def calculate_log(batch: list[ScoreVariant], log: list[Counter]) -> list[Counter]: # these statistics can only be generated while iterating through variants n_variants = Counter("n_variants" for item in batch) + complex_scorefile = Counter(detect_complex(batch)) hm_source = Counter(item["hm_source"] for item in batch if "hm_source" in item) - log.extend([n_variants, hm_source]) + log.extend([n_variants + hm_source + complex_scorefile]) return log + + +def detect_complex(batch: list[ScoreVariant]) -> typing.Generator[str, None, None]: + """Some older scoring files in the PGS Catalog are complicated + We agreed to skip some checks on these odd files and just reproduce them faithfully + They often require bespoke set up to support interaction terms, etc + """ + complex_keys = {"is_haplotype", "is_diplotype", "is_interaction"} + + for key in complex_keys: + for variant in batch: + if not EffectAllele.is_valid(variant["effect_allele"]): + yield "complex" + + if variant.get(key, False) == "True": + # explicitly check string value with == because + # a scoring file with a column with all false values is valid + # (i.e. 
don't just check key presence) + yield "complex" From 51aa0f219a10b90569552e6784750d78b47ce31a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 21 Nov 2023 11:48:57 +0000 Subject: [PATCH 26/40] don't access __annotations__ directly --- pgscatalog_utils/scorefile/scoringfileheader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pgscatalog_utils/scorefile/scoringfileheader.py b/pgscatalog_utils/scorefile/scoringfileheader.py index 07cf663..a06622a 100644 --- a/pgscatalog_utils/scorefile/scoringfileheader.py +++ b/pgscatalog_utils/scorefile/scoringfileheader.py @@ -1,4 +1,5 @@ import gzip +import inspect import pathlib from dataclasses import dataclass @@ -33,7 +34,7 @@ def __post_init__(self): def from_path(cls, path: pathlib.Path): raw_header: dict = raw_header_to_dict(read_header(path)) # only keep keys needed by class but support partial headers with None values - keep_keys = ScoringFileHeader.__annotations__.keys() + keep_keys = inspect.get_annotations(ScoringFileHeader).keys() header_dict = {k: raw_header.get(k) for k in keep_keys} # ... 
so we can unpack the dict into a dataclass From 9938bbdfaa052644c24057e764c96652a6453ad2 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 21 Nov 2023 11:49:09 +0000 Subject: [PATCH 27/40] remove logger --- pgscatalog_utils/scorefile/effectallele.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pgscatalog_utils/scorefile/effectallele.py b/pgscatalog_utils/scorefile/effectallele.py index 92dafd9..14412a3 100644 --- a/pgscatalog_utils/scorefile/effectallele.py +++ b/pgscatalog_utils/scorefile/effectallele.py @@ -1,8 +1,3 @@ -import logging - -logger = logging.getLogger(__name__) - - class EffectAllele: # (class attribute, so shared) _valid_bases = frozenset({"A", "C", "T", "G"}) From af1eef223c0440e00d2e69aa21cdbcd546a8546a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 22 Nov 2023 11:27:28 +0000 Subject: [PATCH 28/40] warn about complex files and variant mismatch --- pgscatalog_utils/scorefile/qc.py | 25 +++++++++++++++++++++-- pgscatalog_utils/scorefile/scoringfile.py | 8 +++++--- pgscatalog_utils/scorefile/write.py | 23 +-------------------- 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index d014aa0..a99ed40 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -24,8 +24,6 @@ def quality_control( # 3. 
check and optionally drop bad variants # where a bad variant has None in a mandatory ScoreVariant field # then continue with other QC - logger.info(f"Starting quality control checks for {header.pgs_id=}") - if Config.liftover: variants = liftover( variants, @@ -44,6 +42,7 @@ def quality_control( variants = check_effect_weight(variants) variants = assign_other_allele(variants) variants = check_effect_allele(variants) + variants = detect_complex(variants) if wide: # wide data must be sorted because check_duplicates requires sorted input @@ -202,3 +201,25 @@ def check_effect_allele( if n_bad > 1: logger.warning(f"{n_bad} variants have invalid effect alleles (not ACTG)") + + +def detect_complex( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: + """Some older scoring files in the PGS Catalog are complicated. + They often require bespoke set up to support interaction terms, etc + """ + complex_keys = {"is_haplotype", "is_diplotype", "is_interaction"} + is_complex = False + + for variant in variants: + if not is_complex: + is_complex = any(key in variant for key in complex_keys) + + yield variant + + if is_complex: + logger.warning("Complex scoring file detected") + logger.warning( + "Complex files are difficult to calculate properly and may require manual intervention" + ) diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py index 15267d5..ca02b28 100644 --- a/pgscatalog_utils/scorefile/scoringfile.py +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -91,10 +91,12 @@ def generate_log(self, counted: typing.Counter): if ( int(log["variants_number"]) != counted["n_variants"] and not Config.drop_missing - and counted.get("complex", 0) == 0 ): - raise Exception( - f"Mismatch between header ({log['variants_number']}) and counted output ({counted['n_variants']}) for {self.accession}" + logger.warning( + f"Mismatch between header ({log['variants_number']}) and output row count 
({counted['n_variants']}) for {self.accession}" + ) + logger.warning( + "This can happen with older scoring files in the PGS Catalog (e.g. PGS000028)" ) # multiple terms may be separated with a pipe diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index db8fd31..efe158f 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -9,7 +9,6 @@ from itertools import islice from pgscatalog_utils.scorefile.config import Config -from pgscatalog_utils.scorefile.effectallele import EffectAllele from pgscatalog_utils.scorefile.scorevariant import ScoreVariant from pgscatalog_utils.scorefile.scoringfile import ScoringFile @@ -117,26 +116,6 @@ def write_combined( def calculate_log(batch: list[ScoreVariant], log: list[Counter]) -> list[Counter]: # these statistics can only be generated while iterating through variants n_variants = Counter("n_variants" for item in batch) - complex_scorefile = Counter(detect_complex(batch)) hm_source = Counter(item["hm_source"] for item in batch if "hm_source" in item) - log.extend([n_variants + hm_source + complex_scorefile]) + log.extend([n_variants + hm_source]) return log - - -def detect_complex(batch: list[ScoreVariant]) -> typing.Generator[str, None, None]: - """Some older scoring files in the PGS Catalog are complicated - We agreed to skip some checks on these odd files and just reproduce them faithfully - They often require bespoke set up to support interaction terms, etc - """ - complex_keys = {"is_haplotype", "is_diplotype", "is_interaction"} - - for key in complex_keys: - for variant in batch: - if not EffectAllele.is_valid(variant["effect_allele"]): - yield "complex" - - if variant.get(key, False) == "True": - # explicitly check string value with == because - # a scoring file with a column with all false values is valid - # (i.e. 
don't just check key presence) - yield "complex" From 54fd6eab2ac6d86dc72524a88bcc6cd7a59bb37b Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 6 Dec 2023 11:56:45 +0000 Subject: [PATCH 29/40] refactor scorevariant from userdict to class with __slots__ --- pgscatalog_utils/scorefile/effectallele.py | 17 ++- pgscatalog_utils/scorefile/effecttype.py | 4 + pgscatalog_utils/scorefile/qc.py | 47 +++---- pgscatalog_utils/scorefile/scorevariant.py | 150 +++++++++++++-------- pgscatalog_utils/scorefile/write.py | 9 +- 5 files changed, 137 insertions(+), 90 deletions(-) diff --git a/pgscatalog_utils/scorefile/effectallele.py b/pgscatalog_utils/scorefile/effectallele.py index 14412a3..6f0dfcb 100644 --- a/pgscatalog_utils/scorefile/effectallele.py +++ b/pgscatalog_utils/scorefile/effectallele.py @@ -1,7 +1,16 @@ class EffectAllele: - # (class attribute, so shared) _valid_bases = frozenset({"A", "C", "T", "G"}) + __slots__ = ("allele", "is_valid") - @classmethod - def is_valid(cls, effect_allele: str) -> bool: - return not frozenset(effect_allele) - cls._valid_bases + def __init__(self, allele: str): + self.allele = allele + self.is_valid = self.is_valid_allele() + + def __repr__(self): + return f'{type(self).__name__}("{self.allele}")' + + def __str__(self): + return self.allele + + def is_valid_allele(self) -> bool: + return not frozenset(self.allele) - self._valid_bases diff --git a/pgscatalog_utils/scorefile/effecttype.py b/pgscatalog_utils/scorefile/effecttype.py index 0d51f14..4878072 100644 --- a/pgscatalog_utils/scorefile/effecttype.py +++ b/pgscatalog_utils/scorefile/effecttype.py @@ -8,3 +8,7 @@ class EffectType(Enum): def __str__(self): return str(self.value) + + def __repr__(self): + # pasting __repr__ output should be sufficient to construct the class + return f"{type(self).__name__}.{self.name}" diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index a99ed40..50fcb52 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ 
b/pgscatalog_utils/scorefile/qc.py @@ -1,7 +1,6 @@ import logging import typing -from pgscatalog_utils.scorefile.effectallele import EffectAllele from pgscatalog_utils.scorefile.config import Config from pgscatalog_utils.scorefile.effecttype import EffectType @@ -61,27 +60,25 @@ def check_duplicates( n_duplicates: int = 0 n_variants: int = 0 for variant in variants: - accession: str = variant["accession"] + accession: str = variant.accession if accession != current_accession: seen_ids = {} current_accession = accession # None other allele -> empty string - id: str = ":".join( + variant_id: str = ":".join( [ - str(variant[k] or "") + str(getattr(variant, k) or "") for k in ["chr_name", "chr_position", "effect_allele", "other_allele"] ] ) - if id in seen_ids: - variant["is_duplicated"] = True + if variant_id in seen_ids: + variant.is_duplicated = True n_duplicates += 1 - else: - variant["is_duplicated"] = False - seen_ids[id] = True + seen_ids[variant_id] = True yield variant n_variants += 1 @@ -112,7 +109,7 @@ def check_effect_weight( ) -> typing.Generator[ScoreVariant, None, None]: for variant in variants: try: - float(variant["effect_weight"]) + float(variant.effect_weight) yield variant except ValueError: logger.critical(f"{variant} has bad effect weight") @@ -124,9 +121,9 @@ def assign_other_allele( ) -> typing.Generator[ScoreVariant, None, None]: n_dropped = 0 for variant in variants: - if "/" in variant["other_allele"]: + if "/" in variant.other_allele: n_dropped += 1 - variant["other_allele"] = None + variant.other_allele = None yield variant @@ -139,13 +136,13 @@ def assign_effect_type( variants: typing.Generator[ScoreVariant, None, None] ) -> typing.Generator[ScoreVariant, None, None]: for variant in variants: - match (variant.get("is_recessive"), variant.get("is_dominant")): + match (variant.is_recessive, variant.is_dominant): case (None, None) | ("FALSE", "FALSE"): pass # default value is additive, pass to break match and yield case ("FALSE", "TRUE"): - 
variant["effect_type"] = EffectType.DOMINANT + variant.effect_type = EffectType.DOMINANT case ("TRUE", "FALSE"): - variant["effect_type"] = EffectType.RECESSIVE + variant.effect_type = EffectType.RECESSIVE case _: logger.critical(f"Bad effect type setting: {variant}") raise Exception @@ -160,10 +157,10 @@ def remap_harmonised( # using the harmonised field in the header to make sure we don't accidentally overwrite # positions with empty data (e.g. in an unharmonised file) # if harmonisation has failed we _always_ want to use that information - variant["chr_name"] = variant["hm_chr"] - variant["chr_position"] = variant["hm_pos"] - if variant["other_allele"] is None: - variant["other_allele"] = variant["hm_inferOtherAllele"] + variant.chr_name = variant.hm_chr + variant.chr_position = variant.hm_pos + if variant.other_allele is None: + variant.other_allele = variant.hm_inferOtherAllele yield variant else: for variant in variants: @@ -177,7 +174,11 @@ def check_bad_variant( n_bad = 0 for variant in variants: match variant: - case {"chr_name": None} | {"chr_position": None} | {"effect_allele": None}: + case ( + ScoreVariant(chr_name=None) + | ScoreVariant(chr_position=None) + | ScoreVariant(effect_allele=None) + ): # (effect weight checked separately) n_bad += 1 if not Config.drop_missing: @@ -194,7 +195,7 @@ def check_effect_allele( ) -> typing.Generator[ScoreVariant, None, None]: n_bad = 0 for variant in variants: - if not EffectAllele.is_valid(variant["effect_allele"]): + if not variant.effect_allele.is_valid: n_bad += 1 yield variant @@ -209,12 +210,12 @@ def detect_complex( """Some older scoring files in the PGS Catalog are complicated. 
They often require bespoke set up to support interaction terms, etc """ - complex_keys = {"is_haplotype", "is_diplotype", "is_interaction"} is_complex = False for variant in variants: if not is_complex: - is_complex = any(key in variant for key in complex_keys) + if variant.is_complex: + is_complex = True yield variant diff --git a/pgscatalog_utils/scorefile/scorevariant.py b/pgscatalog_utils/scorefile/scorevariant.py index 5bbc307..094c6d9 100644 --- a/pgscatalog_utils/scorefile/scorevariant.py +++ b/pgscatalog_utils/scorefile/scorevariant.py @@ -1,39 +1,8 @@ -""" -This module contains the class ScoreVariant, which is a custom dictionary used to consistently represent rows in a PGS Catalog scoring file -""" -import collections - +from pgscatalog_utils.scorefile.effectallele import EffectAllele from pgscatalog_utils.scorefile.effecttype import EffectType -class ScoreVariant(collections.UserDict): - """A single variant from a scoring file structured to follow PGS Catalog standards, - typically extracted from a row in a scoring file. - - See https://www.pgscatalog.org/downloads/#dl_scoring_files for field descriptions. - - This class is intentionally simple (a dict that checks for mandatory keys and fills - optional keys) because a more complicated __init__ will be slow when lots of variants - are read from a file. dicts use fast C magic, so try not to interfere too much. 
- - Some additional keys are included for quality control: - - accession: a unique identifier to group variants in the same score) - - row_nr: an incrementing integer, used to track the number of variants in an accession - - is_duplicated: a label to mark variants with the same coordinates and alleles - - effect_type: additive, recessive, or dominant - - >>> variant = ScoreVariant(**{"chr_name": "1", "chr_position": 1, "effect_allele": "A", "other_allele": "G", "effect_weight": 0.5, "accession": "PGS000822", "row_nr": 0}) - >>> variant - {'chr_name': '1', 'chr_position': 1, 'effect_allele': 'A', 'other_allele': 'G', 'effect_weight': 0.5, 'accession': 'PGS000822', 'row_nr': 0, 'rsID': None, 'hm_chr': None, 'hm_pos': None, 'hm_inferOtherAllele': None, 'hm_source': None, 'is_dominant': None, 'is_recessive': None, 'hm_rsID': None, 'hm_match_chr': None, 'hm_match_pos': None, 'is_duplicated': None, 'effect_type': } - - Mandatory data fields match PGS Catalog harmonised data standards: - - >>> ScoreVariant(**{"chr_name": "1", "chr_position": 1}) - Traceback (most recent call last): - ... - ValueError: Mandatory field 'effect_allele' is missing. 
- """ - +class ScoreVariant: mandatory_fields: tuple[str] = ( "effect_allele", "effect_weight", @@ -55,32 +24,97 @@ class ScoreVariant(collections.UserDict): "hm_match_chr", "hm_match_pos", "is_duplicated", + "effect_type", ) - def __init__(self, **kwargs): - super().__init__(**kwargs) # creates the dict - - for field in self.mandatory_fields: - if field not in self.data: - raise ValueError(f"Mandatory field '{field}' is missing.") - - # note on coordinates / rsID not being mandatory - # ---------------------------------------------- - # according to PGS Catalog scoring file standards: - # - rsID is mandatory if genomic coordinates are missing - # - genomic coordinates are mandatory if rsIDs are missing - # however I want to keep __init__ as simple (and fast) as possible - # millions of ScoreVariants may be instantiated - # so don't check, just initialise to None if missing + complex_fields: tuple[str] = ("is_haplotype", "is_diplotype", "is_interaction") - # practically speaking: - # 1) harmonised files may be missing coordinates, but have hm columns which we then use - # 2) we loudly warn about variants that are missing coordinates - # 3) custom scorefiles are expected to supply coordinates - - # set most optional fields to None... - for field in self.optional_fields: - self.data.setdefault(field, None) + # column names for output are used by __iter__ and when writing out + output_fields: tuple[str] = ( + "chr_name", + "chr_position", + "effect_allele", + "other_allele", + "effect_weight", + "effect_type", + "is_duplicated", + "accession", + "row_nr", + ) - # ... 
except effect type, as the vast majority of variants are additive - self.data.setdefault("effect_type", EffectType.ADDITIVE) + # slots uses magic to improve speed and memory when making millions of objects + __slots__ = mandatory_fields + optional_fields + ("is_complex",) + + # __init__ is intentionally verbose and avoids using loops or trickery to work: + # - attributes won't change often + # - class accepts keyword parameters only to init (not positional) + # - type hints are helpful in parameters + # - setting sensible defaults for optional fields is clear + # - being verbose helps prevent IDE warnings + # extra kwargs are silently ignored + # (yes, effect_weight is treated as a str, want to avoid rounding errors at this stage) + def __init__( + self, + *, + effect_allele: str, + effect_weight: str, + accession: str, + row_nr: int, + chr_name: str = None, + chr_position: int = None, + rsID: str = None, + other_allele: str = None, + hm_chr: str = None, + hm_pos: int = None, + hm_inferOtherAllele: str = None, + hm_source: str = None, + is_dominant: str = None, + is_recessive: str = None, + hm_rsID: str = None, + hm_match_chr: str = None, + hm_match_pos: str = None, + is_duplicated: bool = False, + effect_type: EffectType = EffectType.ADDITIVE, + is_complex: bool = False, + **kwargs, + ): + # start with mandatory attributes + self.effect_allele: EffectAllele = EffectAllele(effect_allele) + self.effect_weight: str = effect_weight + self.accession = accession + self.row_nr = row_nr + + # now set optional fields + self.chr_name = chr_name + self.chr_position = chr_position + self.rsID = rsID + self.other_allele = other_allele + self.hm_chr = hm_chr + self.hm_pos = hm_pos + self.hm_inferOtherAllele = hm_inferOtherAllele + self.hm_source = hm_source + self.is_dominant = is_dominant + self.is_recessive = is_recessive + self.hm_rsID = hm_rsID + self.hm_match_chr = hm_match_chr + self.hm_match_pos = hm_match_pos + self.is_duplicated = is_duplicated + self.effect_type = 
effect_type + + # these fields are important to check if variants are complex + if any([x in kwargs for x in self.complex_fields]): + is_complex = True + self.is_complex = is_complex + + def __repr__(self): + class_name = type(self).__name__ + values = {} + + for key in ScoreVariant.__slots__: + values[key] = getattr(self, key, None) + + return f"{class_name}({values})" + + def __iter__(self): + for attr in self.output_fields: + yield getattr(self, attr) diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index efe158f..9424fbc 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -50,15 +50,14 @@ def __init__(self, compress, filename): def write(self, batch): mode = "at" if os.path.exists(self.filename) else "wt" with self.open_function(self.filename, mode) as f: - writer = csv.DictWriter( + writer = csv.writer( f, - fieldnames=self.fieldnames, delimiter="\t", - extrasaction="ignore", lineterminator="\n", ) if mode == "wt": - writer.writeheader() + writer.writerow(ScoreVariant.output_fields) + writer.writerows(batch) @@ -116,6 +115,6 @@ def write_combined( def calculate_log(batch: list[ScoreVariant], log: list[Counter]) -> list[Counter]: # these statistics can only be generated while iterating through variants n_variants = Counter("n_variants" for item in batch) - hm_source = Counter(item["hm_source"] for item in batch if "hm_source" in item) + hm_source = Counter(getattr(item, "hm_source") for item in batch) log.extend([n_variants + hm_source]) return log From 42a580fbc3815de3de8eef2f22a16e188e332b22 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 7 Dec 2023 17:19:23 +0000 Subject: [PATCH 30/40] fix __repr__ and type hints --- pgscatalog_utils/scorefile/scorevariant.py | 45 ++++++++++++---------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/pgscatalog_utils/scorefile/scorevariant.py b/pgscatalog_utils/scorefile/scorevariant.py index 094c6d9..0d367df 100644 
--- a/pgscatalog_utils/scorefile/scorevariant.py +++ b/pgscatalog_utils/scorefile/scorevariant.py @@ -1,3 +1,5 @@ +from typing import Optional + from pgscatalog_utils.scorefile.effectallele import EffectAllele from pgscatalog_utils.scorefile.effecttype import EffectType @@ -26,7 +28,6 @@ class ScoreVariant: "is_duplicated", "effect_type", ) - complex_fields: tuple[str] = ("is_haplotype", "is_diplotype", "is_interaction") # column names for output are used by __iter__ and when writing out @@ -81,30 +82,30 @@ def __init__( # start with mandatory attributes self.effect_allele: EffectAllele = EffectAllele(effect_allele) self.effect_weight: str = effect_weight - self.accession = accession - self.row_nr = row_nr + self.accession: str = accession + self.row_nr: int = int(row_nr) # now set optional fields - self.chr_name = chr_name - self.chr_position = chr_position - self.rsID = rsID - self.other_allele = other_allele - self.hm_chr = hm_chr - self.hm_pos = hm_pos - self.hm_inferOtherAllele = hm_inferOtherAllele - self.hm_source = hm_source - self.is_dominant = is_dominant - self.is_recessive = is_recessive - self.hm_rsID = hm_rsID - self.hm_match_chr = hm_match_chr - self.hm_match_pos = hm_match_pos - self.is_duplicated = is_duplicated - self.effect_type = effect_type + self.chr_name: Optional[str] = chr_name + self.chr_position: Optional[str] = chr_position + self.rsID: Optional[str] = rsID + self.other_allele: Optional[str] = other_allele + self.hm_chr: Optional[str] = hm_chr + self.hm_pos: Optional[int] = hm_pos + self.hm_inferOtherAllele: Optional[str] = hm_inferOtherAllele + self.hm_source: Optional[str] = hm_source + self.is_dominant: Optional[bool] = is_dominant + self.is_recessive: Optional[bool] = is_recessive + self.hm_rsID: Optional[str] = hm_rsID + self.hm_match_chr: Optional[str] = hm_match_chr + self.hm_match_pos: Optional[str] = hm_match_pos + self.is_duplicated: Optional[bool] = is_duplicated + self.effect_type: EffectType = effect_type # these fields are 
important to check if variants are complex if any([x in kwargs for x in self.complex_fields]): is_complex = True - self.is_complex = is_complex + self.is_complex: bool = is_complex def __repr__(self): class_name = type(self).__name__ @@ -113,7 +114,11 @@ def __repr__(self): for key in ScoreVariant.__slots__: values[key] = getattr(self, key, None) - return f"{class_name}({values})" + # extract str parameter for effect allele + values["effect_allele"] = values["effect_allele"].allele + + params = ",".join([f"{k}={repr(v)}" for k, v in values.items()]) + return f"{class_name}({params})" def __iter__(self): for attr in self.output_fields: From 3c0444597259a19e91e708c4a185d3d5c2e2883b Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 11 Dec 2023 15:31:38 +0000 Subject: [PATCH 31/40] add pyarrow support --- .../scorefile/combine_scorefiles.py | 2 +- pgscatalog_utils/scorefile/scorevariant.py | 16 +++- pgscatalog_utils/scorefile/write.py | 79 +++++++++++++++++-- poetry.lock | 50 +++++++++++- pyproject.toml | 1 + 5 files changed, 136 insertions(+), 12 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 2532f8b..bce5565 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -19,7 +19,7 @@ def combine_scorefiles(): logger = logging.getLogger(__name__) set_logging_level(args.verbose) - Config.batch_size = 20000 + Config.batch_size = 100000 Config.drop_missing = args.drop_missing Config.target_build = GenomeBuild.from_string(args.target_build) Config.liftover = args.liftover diff --git a/pgscatalog_utils/scorefile/scorevariant.py b/pgscatalog_utils/scorefile/scorevariant.py index 0d367df..38135dc 100644 --- a/pgscatalog_utils/scorefile/scorevariant.py +++ b/pgscatalog_utils/scorefile/scorevariant.py @@ -87,11 +87,23 @@ def __init__( # now set optional fields self.chr_name: Optional[str] = chr_name - self.chr_position: 
Optional[str] = chr_position + + # casting to int is important for arrow export + try: + self.chr_position: Optional[int] = int(chr_position) + except (ValueError, TypeError): + self.chr_position = None + self.rsID: Optional[str] = rsID self.other_allele: Optional[str] = other_allele self.hm_chr: Optional[str] = hm_chr - self.hm_pos: Optional[int] = hm_pos + + # casting to int is important when harmonised data may replace chr_position + try: + self.hm_pos: Optional[int] = int(hm_pos) + except (ValueError, TypeError): + self.hm_pos = None + self.hm_inferOtherAllele: Optional[str] = hm_inferOtherAllele self.hm_source: Optional[str] = hm_source self.is_dominant: Optional[bool] = is_dominant diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 9424fbc..1e43594 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -12,6 +12,13 @@ from pgscatalog_utils.scorefile.scorevariant import ScoreVariant from pgscatalog_utils.scorefile.scoringfile import ScoringFile +try: + import pyarrow as pa + + PYARROW_AVAILABLE = True +except ImportError: + PYARROW_AVAILABLE = False + logger = logging.getLogger(__name__) @@ -82,18 +89,74 @@ def write(self, batch): conn.close() +class PyarrowWriter(DataWriter): + if PYARROW_AVAILABLE: + schema = pa.schema( + [ + pa.field("chr_name", pa.string()), + pa.field("chr_position", pa.uint64()), + pa.field("effect_allele", pa.string()), + pa.field("other_allele", pa.string()), + pa.field("effect_weight", pa.string()), + pa.field("effect_type", pa.string()), + pa.field("is_duplicated", pa.bool_()), + pa.field("accession", pa.string()), + pa.field("row_nr", pa.uint64()), + ] + ) + + def __init__(self, filename): + if not PYARROW_AVAILABLE: + # TODO: provide a pip command + raise ImportError( + "pyarrow output not available, please install pyarrow as listed in the pyproject.toml extras section" + ) + super().__init__(filename) + + self._sink = pa.OSFile(self.filename, "wb") + 
self._writer: pa.RecordBatchFileWriter = pa.ipc.new_file( + self._sink, self.schema + ) + + def write(self, batch: list[ScoreVariant]): + batch_dict = { + "chr_name": [x.chr_name for x in batch], + "chr_position": [x.chr_position for x in batch], + "effect_allele": [str(x.effect_allele) for x in batch], + "other_allele": [x.other_allele for x in batch], + "effect_weight": [x.effect_weight for x in batch], + "effect_type": [str(x.effect_type) for x in batch], + "is_duplicated": [x.is_duplicated for x in batch], + "accession": [x.accession for x in batch], + "row_nr": [x.row_nr for x in batch], + } + + record_batch = pa.RecordBatch.from_pydict(batch_dict, schema=self.schema) + self._writer.write(record_batch) + + def __del__(self): + # it's very important to close the writer and file, or it gets corrupted + # can't use a with statement, so close when the object gets deleted + self._writer.close() + if not self._sink.closed: + self._sink.close() + + def write_combined( scoring_files: list[ScoringFile], out_path: str ) -> dict[str : typing.Counter]: # compresslevel can be really slow, default is 9 - if out_path.endswith("gz"): - writer = TextFileWriter(compress=True, filename=out_path) - elif out_path.endswith("txt"): - writer = TextFileWriter(compress=False, filename=out_path) - elif out_path.endswith(".sqlite"): - writer = SqliteWriter(filename=out_path) - else: - raise Exception("Can't configure writer, please check out_path") + match fn := out_path.lower(): + case _ if fn.endswith("gz"): + writer = TextFileWriter(compress=True, filename=out_path) + case _ if fn.endswith("txt"): + writer = TextFileWriter(compress=False, filename=out_path) + case _ if fn.endswith("sqlite"): + writer = SqliteWriter(filename=out_path) + case _ if fn.endswith("ipc"): + writer = PyarrowWriter(filename=out_path) + case _: + raise ValueError(f"Unsupported file extension: {out_path}") counts = [] log = {} diff --git a/poetry.lock b/poetry.lock index 4c1fda4..05b2c77 100644 --- a/poetry.lock 
+++ b/poetry.lock @@ -2245,6 +2245,54 @@ files = [ [package.extras] tests = ["pytest"] +[[package]] +name = "pyarrow" +version = "14.0.1" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyarrow-14.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:96d64e5ba7dceb519a955e5eeb5c9adcfd63f73a56aea4722e2cc81364fc567a"}, + {file = "pyarrow-14.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a8ae88c0038d1bc362a682320112ee6774f006134cd5afc291591ee4bc06505"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f6f053cb66dc24091f5511e5920e45c83107f954a21032feadc7b9e3a8e7851"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:906b0dc25f2be12e95975722f1e60e162437023f490dbd80d0deb7375baf3171"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:78d4a77a46a7de9388b653af1c4ce539350726cd9af62e0831e4f2bd0c95a2f4"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:06ca79080ef89d6529bb8e5074d4b4f6086143b2520494fcb7cf8a99079cde93"}, + {file = "pyarrow-14.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:32542164d905002c42dff896efdac79b3bdd7291b1b74aa292fac8450d0e4dcd"}, + {file = "pyarrow-14.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:c7331b4ed3401b7ee56f22c980608cf273f0380f77d0f73dd3c185f78f5a6220"}, + {file = "pyarrow-14.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:922e8b49b88da8633d6cac0e1b5a690311b6758d6f5d7c2be71acb0f1e14cd61"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58c889851ca33f992ea916b48b8540735055201b177cb0dcf0596a495a667b00"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30d8494870d9916bb53b2a4384948491444741cb9a38253c590e21f836b01222"}, + {file = 
"pyarrow-14.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:be28e1a07f20391bb0b15ea03dcac3aade29fc773c5eb4bee2838e9b2cdde0cb"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:981670b4ce0110d8dcb3246410a4aabf5714db5d8ea63b15686bce1c914b1f83"}, + {file = "pyarrow-14.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:4756a2b373a28f6166c42711240643fb8bd6322467e9aacabd26b488fa41ec23"}, + {file = "pyarrow-14.0.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:cf87e2cec65dd5cf1aa4aba918d523ef56ef95597b545bbaad01e6433851aa10"}, + {file = "pyarrow-14.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:470ae0194fbfdfbf4a6b65b4f9e0f6e1fa0ea5b90c1ee6b65b38aecee53508c8"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6263cffd0c3721c1e348062997babdf0151301f7353010c9c9a8ed47448f82ab"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8089d7e77d1455d529dbd7cff08898bbb2666ee48bc4085203af1d826a33cc"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fada8396bc739d958d0b81d291cfd201126ed5e7913cb73de6bc606befc30226"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:2a145dab9ed7849fc1101bf03bcdc69913547f10513fdf70fc3ab6c0a50c7eee"}, + {file = "pyarrow-14.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:05fe7994745b634c5fb16ce5717e39a1ac1fac3e2b0795232841660aa76647cd"}, + {file = "pyarrow-14.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:a8eeef015ae69d104c4c3117a6011e7e3ecd1abec79dc87fd2fac6e442f666ee"}, + {file = "pyarrow-14.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3c76807540989fe8fcd02285dd15e4f2a3da0b09d27781abec3adc265ddbeba1"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:450e4605e3c20e558485f9161a79280a61c55efe585d51513c014de9ae8d393f"}, + {file = 
"pyarrow-14.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:323cbe60210173ffd7db78bfd50b80bdd792c4c9daca8843ef3cd70b186649db"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0140c7e2b740e08c5a459439d87acd26b747fc408bde0a8806096ee0baaa0c15"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:e592e482edd9f1ab32f18cd6a716c45b2c0f2403dc2af782f4e9674952e6dd27"}, + {file = "pyarrow-14.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:d264ad13605b61959f2ae7c1d25b1a5b8505b112715c961418c8396433f213ad"}, + {file = "pyarrow-14.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:01e44de9749cddc486169cb632f3c99962318e9dacac7778315a110f4bf8a450"}, + {file = "pyarrow-14.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0351fecf0e26e152542bc164c22ea2a8e8c682726fce160ce4d459ea802d69c"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c1f6110c386464fd2e5e4ea3624466055bbe681ff185fd6c9daa98f30a3f9a"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11e045dfa09855b6d3e7705a37c42e2dc2c71d608fab34d3c23df2e02df9aec3"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:097828b55321897db0e1dbfc606e3ff8101ae5725673498cbfa7754ee0da80e4"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1daab52050a1c48506c029e6fa0944a7b2436334d7e44221c16f6f1b2cc9c510"}, + {file = "pyarrow-14.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3f6d5faf4f1b0d5a7f97be987cf9e9f8cd39902611e818fe134588ee99bf0283"}, + {file = "pyarrow-14.0.1.tar.gz", hash = "sha256:b8b3f4fe8d4ec15e1ef9b599b94683c5216adaed78d5cb4c606180546d1e2ee1"}, +] + +[package.dependencies] +numpy = ">=1.16.6" + [[package]] name = "pycparser" version = "2.21" @@ -3261,4 +3309,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = 
"b9985d182b0c350a39e12aeae274f2e809d1454f47b58b2d2a5fe8b8264418b7" +content-hash = "397df0f3e64b00fabebb36bf3c3576d94c2f34c2f34dcec223973a19e525d2e6" diff --git a/pyproject.toml b/pyproject.toml index 15a3b9b..0ea7b13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ zstandard = "^0.18.0" pgzip = "^0.3.2" scikit-learn = "^1.2.1" pre-commit = "^3.5.0" +pyarrow = "^14.0.1" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" From 980940f83366fed49e26f944d39a941b7872b36d Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 13 Dec 2023 11:23:10 +0000 Subject: [PATCH 32/40] add license data to log --- pgscatalog_utils/scorefile/scoringfileheader.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pgscatalog_utils/scorefile/scoringfileheader.py b/pgscatalog_utils/scorefile/scoringfileheader.py index a06622a..06d7f10 100644 --- a/pgscatalog_utils/scorefile/scoringfileheader.py +++ b/pgscatalog_utils/scorefile/scoringfileheader.py @@ -21,6 +21,11 @@ class ScoringFileHeader: HmPOS_build: GenomeBuild HmPOS_date: str format_version: str + license: str = ( + "PGS obtained from the Catalog should be cited appropriately, and " + "used in accordance with any licensing restrictions set by the authors. See EBI " + "Terms of Use (https://www.ebi.ac.uk/about/terms-of-use/) for additional details." + ) def __post_init__(self): if self.variants_number: @@ -38,6 +43,11 @@ def from_path(cls, path: pathlib.Path): header_dict = {k: raw_header.get(k) for k in keep_keys} # ... 
so we can unpack the dict into a dataclass + if header_dict.get("license") is None: + # missing license data in header means default license + # (this may change in the future) + header_dict["license"] = cls.license + if "HmPOS_build" not in header_dict: # working with pgs catalog formatted header but unharmonised data header_dict["HmPOS_build"] = None From 7d39e0859eff484ead5c595173b204d43d7fcdd2 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 14 Dec 2023 10:30:03 +0000 Subject: [PATCH 33/40] add custom exceptions --- pgscatalog_utils/pgsexceptions.py | 116 ++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 pgscatalog_utils/pgsexceptions.py diff --git a/pgscatalog_utils/pgsexceptions.py b/pgscatalog_utils/pgsexceptions.py new file mode 100644 index 0000000..e57e4bf --- /dev/null +++ b/pgscatalog_utils/pgsexceptions.py @@ -0,0 +1,116 @@ +""" This module defines a custom PGS exception hierarchy. There are a lot of exceptions for specific failure states, +which can be a bad approach and too complex. However, we did this anyway for a few reasons: + +1. There are only a few types of common errors (around a dozen, with 3-4 very common) +2. We want to exit the program with custom exit codes to simplify communicating program +state with external processes (e.g. PGS Catalog Calculator, web platforms) without doing +complicated things like logging to an external location +3. This approach should make maintaining exit codes simple + +So the plan is to override sys.excepthook, intercept errors defined here, and map them +to custom exit codes defined below +""" + +from types import MappingProxyType + + +class BasePGSError(Exception): + """The base class from which all PGS errors must inherit.
+ The purpose of this class is to simplify catching PGS exceptions and exiting python with a custom exit code.""" + + +class MatchError(BasePGSError): + """The base class for errors that are raised during variant matching""" + + +class CombineError(BasePGSError): + """The base class for errors that are raised when combining scorefiles""" + + +class CatalogError(BasePGSError): + """The base class for errors when querying or downloading from the PGS Catalog""" + + +class SamplesheetError(BasePGSError): + """The base class for errors related to samplesheet parsing""" + + +class ScoreDownloadError(CatalogError): + """Raised when a scoring file can't be downloaded""" + + +class ScoreChecksumError(CatalogError): + """Raised when a scoring file fails checksum validation""" + + +class QueryError(CatalogError): + """Raised when the Catalog API doesn't return a valid response""" + + +class InvalidAccessionError(CatalogError): + """Raised when an invalid term is used to query the Catalog""" + + +class DuplicateMatchError(MatchError): + """Raised when a matched variant has been duplicated, so that a variant with the same ID + would be split across two rows in an output scoring file. + """ + + +class MatchRateError(MatchError): + """Raised when match rate is below match threshold for one or more scoring files""" + + +class ZeroMatchesError(MatchError): + """Raised when zero matches are found for one or more scoring files. + + Distinct from MatchRateError because it's very common, and caused by bad input data or parameters.""" + + +class MatchValueError(MatchError): + """Raised when a match function receives inappropriate values. 
+ + e.g., Multiple chromosomes detected in variant data but data is split per-chromosome""" + + +class BuildError(CombineError): + """Raised when there's a problem with a scoring file genome build.""" + + +class ScoreFormatError(CombineError): + """Raised when there's a problem with a scoring file.""" + + +class GenomesNotFound(SamplesheetError): + """Raised when FileNotFound""" + + +class SamplesheetFormatError(SamplesheetError): + """Raised when a samplesheet is badly formatted""" + + +class ExceptionExitCodeMap: + """A read only map to get exit codes for custom exceptions""" + + # https://unix.stackexchange.com/a/604262 + _mapping = { + ScoreDownloadError: 8, + ScoreFormatError: 9, + ScoreChecksumError: 10, + QueryError: 11, + InvalidAccessionError: 12, + DuplicateMatchError: 13, + MatchRateError: 14, + ZeroMatchesError: 15, + MatchValueError: 16, + BuildError: 17, + GenomesNotFound: 19, + SamplesheetFormatError: 20, + } + + code_map = MappingProxyType(_mapping) + + def get_exit_code(self, exception_type): + # if an exception can't be found in the map, return an error code (> 0) but default + # max possible value 255 + return self.code_map.get(exception_type, 255) From b92615006f38a7de007600892eb6facf65c62d30 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 14 Dec 2023 10:42:32 +0000 Subject: [PATCH 34/40] add custom exit code --- pgscatalog_utils/pgsexceptions.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/pgsexceptions.py b/pgscatalog_utils/pgsexceptions.py index e57e4bf..09b02d5 100644 --- a/pgscatalog_utils/pgsexceptions.py +++ b/pgscatalog_utils/pgsexceptions.py @@ -10,7 +10,7 @@ So the plan is to override sys.excepthook, intercept errors defined here, and map them to custom exit codes defined below """ - +import sys from types import MappingProxyType @@ -110,7 +110,17 @@ class ExceptionExitCodeMap: code_map = MappingProxyType(_mapping) - def get_exit_code(self, exception_type): + def 
__getitem__(self, exception_type): # if an exception can't be found in the map, return an error code (> 0) but default # max possible value 255 return self.code_map.get(exception_type, 255) + + +def handle_uncaught_exception(exctype, value, trace): + code_map = ExceptionExitCodeMap() + oldHook(exctype, value, trace) + if isinstance(value, BasePGSError): + sys.exit(code_map[exctype]) + + +sys.excepthook, oldHook = handle_uncaught_exception, sys.excepthook From 499ef557af020dd1e15ebc105ed0d36403aa25f7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 14 Dec 2023 13:31:54 +0000 Subject: [PATCH 35/40] move class definitions --- pgscatalog_utils/pgsexceptions.py | 56 +++++++++++++++---------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/pgscatalog_utils/pgsexceptions.py b/pgscatalog_utils/pgsexceptions.py index 09b02d5..393ad7a 100644 --- a/pgscatalog_utils/pgsexceptions.py +++ b/pgscatalog_utils/pgsexceptions.py @@ -23,34 +23,6 @@ class MatchError(BasePGSError): """The base class for errors that are raised during variant matching""" -class CombineError(BasePGSError): - """The base class for errors that are raised when combining scorefiles""" - - -class CatalogError(BasePGSError): - """The base class for errors when querying or downloading from the PGS Catalog""" - - -class SamplesheetError(BasePGSError): - """The base class for errors related to samplesheet parsing""" - - -class ScoreDownloadError(CatalogError): - """Raised when a scoring file can't be downloaded""" - - -class ScoreChecksumError(CatalogError): - """Raised when a scoring file fails checksum validation""" - - -class QueryError(CatalogError): - """Raised when the Catalog API doesn't return a valid response""" - - -class InvalidAccessionError(CatalogError): - """Raised when an invalid term is used to query the Catalog""" - - class DuplicateMatchError(MatchError): """Raised when a matched variant has been duplicated, so that a variant with the same ID would be split across 
two rows in an output scoring file. @@ -73,6 +45,10 @@ class MatchValueError(MatchError): e.g., Multiple chromosomes detected in variant data but data is split per-chromosome""" +class CombineError(BasePGSError): + """The base class for errors that are raised when combining scorefiles""" + + class BuildError(CombineError): """Raised when there's a problem with a scoring file genome build.""" @@ -81,6 +57,30 @@ class ScoreFormatError(CombineError): """Raised when there's a problem with a scoring file.""" +class CatalogError(BasePGSError): + """The base class for errors when querying or downloading from the PGS Catalog""" + + +class ScoreDownloadError(CatalogError): + """Raised when a scoring file can't be downloaded""" + + +class ScoreChecksumError(CatalogError): + """Raised when a scoring file fails checksum validation""" + + +class QueryError(CatalogError): + """Raised when the Catalog API doesn't return a valid response""" + + +class InvalidAccessionError(CatalogError): + """Raised when an invalid term is used to query the Catalog""" + + +class SamplesheetError(BasePGSError): + """The base class for errors related to samplesheet parsing""" + + class GenomesNotFound(SamplesheetError): """Raised when FileNotFound""" From 064813a0d2e38cb79534b31253e50eec91632197 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 14 Dec 2023 13:54:33 +0000 Subject: [PATCH 36/40] rename --- pgscatalog_utils/pgsexceptions.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pgscatalog_utils/pgsexceptions.py b/pgscatalog_utils/pgsexceptions.py index 393ad7a..224025e 100644 --- a/pgscatalog_utils/pgsexceptions.py +++ b/pgscatalog_utils/pgsexceptions.py @@ -14,12 +14,13 @@ from types import MappingProxyType -class BasePGSError(Exception): +class BasePGSException(Exception): """The base class from which all PGS errors must inherit. 
- The purpose of this class is to simplify catching PGS exceptions and exiting python with a custom exit code.""" + The purpose of this class is to simplify finding PGS exceptions and exiting python + with a matching custom exit code.""" -class MatchError(BasePGSError): +class MatchError(BasePGSException): """The base class for errors that are raised during variant matching""" @@ -45,7 +46,7 @@ class MatchValueError(MatchError): e.g., Multiple chromosomes detected in variant data but data is split per-chromosome""" -class CombineError(BasePGSError): +class CombineError(BasePGSException): """The base class for errors that are raised when combining scorefiles""" @@ -57,7 +58,7 @@ class ScoreFormatError(CombineError): """Raised when there's a problem with a scoring file.""" -class CatalogError(BasePGSError): +class CatalogError(BasePGSException): """The base class for errors when querying or downloading from the PGS Catalog""" @@ -77,7 +78,7 @@ class InvalidAccessionError(CatalogError): """Raised when an invalid term is used to query the Catalog""" -class SamplesheetError(BasePGSError): +class SamplesheetError(BasePGSException): """The base class for errors related to samplesheet parsing""" @@ -119,7 +120,7 @@ def __getitem__(self, exception_type): def handle_uncaught_exception(exctype, value, trace): code_map = ExceptionExitCodeMap() oldHook(exctype, value, trace) - if isinstance(value, BasePGSError): + if isinstance(value, BasePGSException): sys.exit(code_map[exctype]) From 21873655c7bf961a707ec683614ace6527e3f536 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 15 Dec 2023 14:13:30 +0000 Subject: [PATCH 37/40] update effect allele class --- pgscatalog_utils/scorefile/effectallele.py | 43 ++++++++++++++++++---- pgscatalog_utils/scorefile/qc.py | 2 +- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/pgscatalog_utils/scorefile/effectallele.py b/pgscatalog_utils/scorefile/effectallele.py index 6f0dfcb..a72e3d1 100644 --- 
a/pgscatalog_utils/scorefile/effectallele.py +++ b/pgscatalog_utils/scorefile/effectallele.py @@ -1,10 +1,31 @@ class EffectAllele: - _valid_bases = frozenset({"A", "C", "T", "G"}) - __slots__ = ("allele", "is_valid") + """A class that represents an effect allele found in PGS Catalog scoring files - def __init__(self, allele: str): - self.allele = allele - self.is_valid = self.is_valid_allele() + The allele whose dosage is counted (e.g. {0, 1, 2}) and multiplied by the variant's + weight (effect_weight) when calculating a score. The effect allele is also known as + the 'risk allele'. + >>> simple_ea = EffectAllele("A") + >>> simple_ea + EffectAllele("A") + >>> simple_ea.is_snp + True + >>> str(simple_ea) + 'A' + >>> EffectAllele("AG") + EffectAllele("AG") + >>> hla_example = EffectAllele("+") + >>> hla_example + EffectAllele("+") + >>> hla_example.is_snp + False + """ + + _valid_snp_bases = frozenset({"A", "C", "T", "G"}) + __slots__ = ("allele", "is_snp") + + def __init__(self, allele): + self.allele = str(allele) + self.is_snp = self._is_snp() def __repr__(self): return f'{type(self).__name__}("{self.allele}")' @@ -12,5 +33,13 @@ def __repr__(self): def __str__(self): return self.allele - def is_valid_allele(self) -> bool: - return not frozenset(self.allele) - self._valid_bases + def _is_snp(self) -> bool: + """SNPs are the most common type of effect allele. More complex effect + alleles, like HLAs or APOE genes, often require extra work to represent in + genomes. Users should be warned about complex effect alleles.
+ >>> EffectAllele("+")._is_snp() + False + >>> EffectAllele("A")._is_snp() + True + """ + return not frozenset(self.allele) - self._valid_snp_bases diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 50fcb52..526fda2 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -195,7 +195,7 @@ def check_effect_allele( ) -> typing.Generator[ScoreVariant, None, None]: n_bad = 0 for variant in variants: - if not variant.effect_allele.is_valid: + if not variant.effect_allele.is_snp: n_bad += 1 yield variant From 60e150b98688dfdacff92d5210d07c3e2c205bb0 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 15 Dec 2023 14:14:45 +0000 Subject: [PATCH 38/40] tidy up docstring --- pgscatalog_utils/scorefile/effectallele.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/scorefile/effectallele.py b/pgscatalog_utils/scorefile/effectallele.py index a72e3d1..be9ecb2 100644 --- a/pgscatalog_utils/scorefile/effectallele.py +++ b/pgscatalog_utils/scorefile/effectallele.py @@ -34,9 +34,10 @@ def __str__(self): return self.allele def _is_snp(self) -> bool: - """SNPs are the most common type of effect allele. More complex effect - alleles, like HLAs or APOE genes, often require extra work to represent in - genomes. Users should be warned about complex effect alleles. + """SNPs are the most common type of effect allele in PGS Catalog scoring + files. More complex effect alleles, like HLAs or APOE genes, often require + extra work to represent in genomes. Users should be warned about complex + effect alleles. 
>>> EffectAllele("+")._is_snp() False >>> EffectAllele("A")._is_snp() From 9d6e258e989a39f748930122314ed74fb994b485 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 15 Dec 2023 14:33:22 +0000 Subject: [PATCH 39/40] add docstrings to pytest --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 0ea7b13..bd94cb8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,3 +45,5 @@ seaborn = "^0.12.2" requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" +[tool.pytest.ini_options] +addopts = --doctest-modules \ No newline at end of file From 207ecd475030e9298970f4596b804a50cd992d1a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 15 Dec 2023 14:34:34 +0000 Subject: [PATCH 40/40] fix pyproject --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bd94cb8..0f82b6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,4 +46,4 @@ requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] -addopts = --doctest-modules \ No newline at end of file +addopts = "--doctest-modules" \ No newline at end of file