From 52a4eaac27ced3ee8f15ee8c2a1ac0034c74860f Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Mon, 30 Oct 2023 16:50:33 +0000
Subject: [PATCH 01/40] draft streaming with generators

---
 pgscatalog_utils/download/GenomeBuild.py      |  12 ++
 .../scorefile/combine_scorefiles.py           | 157 +++---------------
 pgscatalog_utils/scorefile/config.py          |   9 +
 pgscatalog_utils/scorefile/effect_type.py     |  34 ----
 pgscatalog_utils/scorefile/effect_weight.py   |  49 ------
 pgscatalog_utils/scorefile/genome_build.py    |  24 ---
 pgscatalog_utils/scorefile/harmonised.py      |  30 ----
 pgscatalog_utils/scorefile/header.py          |  80 +++++++++
 pgscatalog_utils/scorefile/liftover.py        | 103 ------------
 pgscatalog_utils/scorefile/qc.py              | 103 +++---------
 pgscatalog_utils/scorefile/read.py            |  79 ---------
 pgscatalog_utils/scorefile/scoringfile.py     | 144 ++++++++++++++++
 pgscatalog_utils/scorefile/write.py           |  43 -----
 13 files changed, 292 insertions(+), 575 deletions(-)
 create mode 100644 pgscatalog_utils/scorefile/config.py
 delete mode 100644 pgscatalog_utils/scorefile/effect_type.py
 delete mode 100644 pgscatalog_utils/scorefile/effect_weight.py
 delete mode 100644 pgscatalog_utils/scorefile/genome_build.py
 delete mode 100644 pgscatalog_utils/scorefile/harmonised.py
 create mode 100644 pgscatalog_utils/scorefile/header.py
 delete mode 100644 pgscatalog_utils/scorefile/liftover.py
 delete mode 100644 pgscatalog_utils/scorefile/read.py
 create mode 100644 pgscatalog_utils/scorefile/scoringfile.py
 delete mode 100644 pgscatalog_utils/scorefile/write.py

diff --git a/pgscatalog_utils/download/GenomeBuild.py b/pgscatalog_utils/download/GenomeBuild.py
index 419c3f2..893bf97 100644
--- a/pgscatalog_utils/download/GenomeBuild.py
+++ b/pgscatalog_utils/download/GenomeBuild.py
@@ -4,3 +4,15 @@
 class GenomeBuild(Enum):
     GRCh37 = auto()
     GRCh38 = auto()
+
+    @classmethod
+    def from_string(cls, build):
+        """Map a build label (GRCh37/hg19, GRCh38/hg38) to a member; 'NR' gives None."""
+        if build == 'NR':  # 'NR' carries no usable build, so there is no member to return
+            return None
+        match build:
+            case 'GRCh37' | 'hg19':  # hg19 is the alias of GRCh37 (was mislabelled hg18)
+                return cls.GRCh37
+            case 'GRCh38' | 'hg38':  # hg38 is the alias of GRCh38 (was mislabelled hg19)
+                return cls.GRCh38
+        raise Exception(f"Unsupported genome build: {build!r}")
\ No newline at end of file
diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py
index 9465484..7dd5fc8 100644
--- a/pgscatalog_utils/scorefile/combine_scorefiles.py
+++ b/pgscatalog_utils/scorefile/combine_scorefiles.py
@@ -1,39 +1,13 @@
 import argparse
 import logging
-import os
 import sys
 import textwrap
-import json
+import time
 
 from pgscatalog_utils.config import set_logging_level
-from pgscatalog_utils.scorefile.effect_type import set_effect_type
-from pgscatalog_utils.scorefile.effect_weight import melt_effect_weights
-from pgscatalog_utils.scorefile.genome_build import build2GRC
-from pgscatalog_utils.scorefile.harmonised import remap_harmonised
-from pgscatalog_utils.scorefile.liftover import liftover
-from pgscatalog_utils.scorefile.qc import quality_control
-from pgscatalog_utils.scorefile.read import load_scorefile, get_scorefile_basename
-from pgscatalog_utils.scorefile.write import write_scorefile
-
-
-headers2logs = [
-    'pgs_id',
-    'pgp_id',
-    'pgs_name',
-    'genome_build',
-    'variants_number',
-    'trait_reported',
-    'trait_efo',
-    'trait_mapped',
-    'weight_type',
-    'citation'
-]
-headers2logs_harmonisation = [
-    'HmPOS_build',
-    'HmPOS_date',
-    'HmPOS_match_chr',
-    'HmPOS_match_pos'
-]
+from pgscatalog_utils.download.GenomeBuild import GenomeBuild
+from pgscatalog_utils.scorefile.scoringfile import ScoringFile
+
 
 def combine_scorefiles():
     args = _parse_args()
@@ -44,103 +18,21 @@ def combine_scorefiles():
     paths: list[str] = list(set(args.scorefiles))  # unique paths only
     logger.debug(f"Input scorefiles: {paths}")
 
-    if os.path.exists(args.outfile):
-        logger.critical(f"Output file {args.outfile} already exists")
+    start_time = time.time()
+    sfs = [ScoringFile.from_path(x) for x in paths]
+
+    target_build = GenomeBuild.from_string(args.target_build)
+    bad_builds = [x.name for x in sfs if x.genome_build != target_build]
+    for bad_file in bad_builds:
+        logger.critical(f"{bad_file} doesn't match {target_build}, can't combine")
         raise Exception
+    else:
+        logger.info(f"All builds match target build {target_build}")
 
-    # Score header logs - init
-    score_logs = {}
-    dir_output = os.path.dirname(args.outfile)
-    if dir_output == '':
-        dir_output = './'
-    elif dir_output.endswith('/') is False:
-        dir_output += '/'
-    json_logs_file =  dir_output + args.logfile
-
-    for x in paths:
-        # Read scorefile df and header
-        h, score = load_scorefile(x)
-        score_shape_original = score.shape
-
-        if score.empty:
-            logger.critical(f"Empty scorefile {x} detected! Please check the input data")
-            raise Exception
-
-        # Check if we should use the harmonized positions
-        use_harmonised = False
-        current_build = None
-        if h.get('HmPOS_build') is not None:
-            if h.get('HmPOS_build') == args.target_build:
-                use_harmonised = True
-                current_build = h.get('HmPOS_build')
-            else:
-                logger.error(
-                    f"Cannot combine {x} (harmonized to {h.get('HmPOS_build')}) in target build {args.target_build}")
-                raise Exception
-
-        # Process/QC score and check variant columns
-        score = (score.pipe(remap_harmonised, use_harmonised=use_harmonised)
-                 .pipe(quality_control, drop_missing=args.drop_missing)
-                 .pipe(melt_effect_weights)
-                 .pipe(set_effect_type))
-
-        # Annotate score with the genome_build (in GRCh notation)
-        if current_build is None:
-            current_build = build2GRC(h.get('genome_build'))
-            if current_build is None:
-                logger.error("Scorefile has no build information, "
-                             "please add the build to the header with "
-                             "('#genome_build=[insert variant build]")
-                raise Exception
-
-        score = score.assign(genome_build=current_build)
-
-        if (current_build != args.target_build) and (args.liftover is False):
-            logger.error(
-                f"Cannot combine {x} (build={h.get('genome_build')}) with target build {args.target_build} without liftover")
-            logger.error("Try running with --liftover and specifying the --chain_dir")
-            raise Exception
-
-        if args.liftover:
-            logger.debug("Annotating scorefile with liftover parameters")
-            score = liftover(score, args.chain_dir, args.min_lift, args.target_build)
-
-        if score.empty and (args.drop_missing is False):
-            logger.critical("Empty output score detected, something went wrong while combining")
-            raise Exception
-
-        write_scorefile(score, args.outfile)
-
-        # Build Score header logs
-        score_id = get_scorefile_basename(x)
-        score_header = score_logs[score_id] = {}
-        # Scoring file header information
-        for header in headers2logs:
-            header_val = h.get(header)
-            if (header in ['trait_efo', 'trait_mapped']) and (header_val is not None):
-                header_val = header_val.split('|')
-            score_header[header] = header_val
-        # Other header information
-        score_header['columns'] = list(score.columns)
-        score_header['use_liftover'] = False
-        if args.liftover:
-             score_header['use_liftover'] = True
-        # Harmonized header information
-        score_header['use_harmonised'] = use_harmonised
-        if use_harmonised:
-            score_header['sources'] = sorted(score['hm_source'].unique().tolist())
-            for hm_header in headers2logs_harmonisation:
-                hm_header_val = h.get(hm_header)
-                if hm_header_val:
-                    if hm_header.startswith('HmPOS_match'):
-                        hm_header_val = json.loads(hm_header_val)
-                    score_header[hm_header] = hm_header_val
-        if score_header['variants_number'] is None:
-            score_header['variants_number'] = score_shape_original[0]
-
-    # Write Score header logs file
-    with open(json_logs_file, 'w') as fp:
-        json.dump(score_logs, fp, indent=4)
+    ScoringFile.write_combined(sfs, args.outfile)
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    print(f"Elapsed time: {elapsed_time} seconds")
 
 
 def _description_text() -> str:
@@ -164,16 +56,21 @@ def _epilog_text() -> str:
 
 
 def _parse_args(args=None) -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(),
+    parser = argparse.ArgumentParser(description=_description_text(),
+                                     epilog=_epilog_text(),
                                      formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.add_argument('-s', '--scorefiles', dest='scorefiles', nargs='+',
-                        help='<Required> Scorefile path (wildcard * is OK)', required=True)
+                        help='<Required> Scorefile path (wildcard * is OK)',
+                        required=True)
     parser.add_argument('--liftover', dest='liftover',
-                        help='<Optional> Convert scoring file variants to target genome build?', action='store_true')
+                        help='<Optional> Convert scoring file variants to target genome build?',
+                        action='store_true')
     parser.add_argument('-t', '--target_build', dest='target_build',
-                        choices=['GRCh37', 'GRCh38'], help='<Required> Build of target genome',
+                        choices=['GRCh37', 'GRCh38'],
+                        help='<Required> Build of target genome',
                         required=True)
-    parser.add_argument('-c', '--chain_dir', dest='chain_dir', help='Path to directory containing chain files',
+    parser.add_argument('-c', '--chain_dir', dest='chain_dir',
+                        help='Path to directory containing chain files',
                         required="--liftover" in sys.argv)
     parser.add_argument('-m', '--min_lift', dest='min_lift',
                         help='<Optional> If liftover, minimum proportion of variants lifted over',
diff --git a/pgscatalog_utils/scorefile/config.py b/pgscatalog_utils/scorefile/config.py
new file mode 100644
index 0000000..8dee364
--- /dev/null
+++ b/pgscatalog_utils/scorefile/config.py
@@ -0,0 +1,9 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class Config:  # plain container for four settings; nothing in this class validates them
+    drop_missing: bool  # NOTE(review): presumably the "drop variants with missing data" switch - confirm
+    liftover: bool  # NOTE(review): presumably mirrors the --liftover CLI flag - confirm
+    chain_dir: str  # NOTE(review): presumably the --chain_dir chain file directory - confirm
+    min_lift: float  # NOTE(review): presumably the --min_lift minimum lifted proportion - confirm
\ No newline at end of file
diff --git a/pgscatalog_utils/scorefile/effect_type.py b/pgscatalog_utils/scorefile/effect_type.py
deleted file mode 100644
index 50c8c73..0000000
--- a/pgscatalog_utils/scorefile/effect_type.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import logging
-
-import pandas as pd
-
-logger = logging.getLogger(__name__)
-
-
-def set_effect_type(df: pd.DataFrame) -> pd.DataFrame:
-    if {'is_recessive', 'is_dominant'}.issubset(df.columns):
-        _check_effect_types(df)
-        return (df.assign(additive=lambda x: ~x["is_recessive"] & ~x["is_dominant"])
-                .assign(effect_type=lambda x: x[["is_recessive", "is_dominant", "additive"]].idxmax(1)))
-    else:
-        return _set_default_effect_type(df)
-
-
-def _check_effect_types(df: pd.DataFrame):
-    """ Check that only one effect type is set per variant """
-    bad_rows: pd.DataFrame = df[['is_dominant', 'is_recessive']].all(axis=1).any()
-
-    error = ''' ERROR: Bad variants in scorefile
-    is_recessive and is_dominant columns are both TRUE for a variant
-    These columns are mutually exclusive (both can't be true)
-    However, both can be FALSE for additive variant scores
-    '''
-    if bad_rows:
-        logger.error(error)
-        logger.error(bad_rows)
-        raise Exception
-
-
-def _set_default_effect_type(df: pd.DataFrame, effect_type: str = "additive") -> pd.DataFrame:
-    logger.debug(f'No effect types set, using default ({effect_type})')
-    return df.assign(effect_type=effect_type)
diff --git a/pgscatalog_utils/scorefile/effect_weight.py b/pgscatalog_utils/scorefile/effect_weight.py
deleted file mode 100644
index 4b95e0f..0000000
--- a/pgscatalog_utils/scorefile/effect_weight.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import logging
-import re
-
-import pandas as pd
-
-logger = logging.getLogger(__name__)
-
-
-def melt_effect_weights(df: pd.DataFrame) -> pd.DataFrame:
-    """ Ensure all dataframes are in long format, with one effect weight column and a score accession column """
-    elongate = _detect_multiple_weight_columns(df)
-
-    if elongate:
-        logger.debug("Melting effect weights")
-        return _melt(df)
-    else:
-        logger.debug("Skipping melt")
-        df['accession'] = df['filename']
-        return df
-
-
-def _detect_multiple_weight_columns(df: pd.DataFrame) -> bool:
-    """ Detect if multiple effect weight columns are present
-
-    Single weight format:
-    | chr_name | chr_pos | effect_allele | effect_weight
-
-    Multiple weight format:
-    | chr_name | chr_pos | effect_allele | effect_weight_score_1 | ... | effect_weight_score_n
-    """
-    columns: list[re.match | None] = [re.search("^effect_weight$", x) for x in df.columns.to_list()]
-    columns_suffix: list[re.match | None] = [re.search("^effect_weight_[A-Za-z0-9]+$", x) for x
-                                             in df.columns.to_list()]
-
-    if any([col for col in columns]):
-        logger.debug("Single effect weight column detected")
-        return False
-    elif any([col for col in columns_suffix]):
-        logger.debug("Multiple weight weight columns detected")
-        return True
-    else:
-        logger.error("ERROR: Missing valid effect weight columns")
-        raise Exception("Bad effect weights")
-
-
-def _melt(df: pd.DataFrame) -> pd.DataFrame:
-    """ Melt a multiple effect weight format """
-    ew_cols: list[str] = df.filter(regex="effect_weight_*").columns.to_list()
-    return df.melt(value_vars=ew_cols, value_name="effect_weight", var_name="accession")
diff --git a/pgscatalog_utils/scorefile/genome_build.py b/pgscatalog_utils/scorefile/genome_build.py
deleted file mode 100644
index 7ea4f09..0000000
--- a/pgscatalog_utils/scorefile/genome_build.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import logging
-
-import pandas as pd
-
-logger = logging.getLogger(__name__)
-
-
-def annotate_build(df: pd.DataFrame, target_build: str) -> pd.DataFrame:
-    """ Annotate the dataframe with genome build data  """
-    logger.debug(f"Annotating target build: {target_build}")
-    build_dict: dict = {'GRCh37': 'hg19', 'GRCh38': 'hg38', 'hg19': 'hg19', 'hg38': 'hg38'}  # standardise build names
-    df['chain_target_build'] = build_dict[target_build]
-    df = df.assign(chain_genome_build=[build_dict[x] for x in df['genome_build']])
-    return df
-
-
-def build2GRC(build):
-    """Map build names so they can be compared with GRCh37 and 38"""
-    build_2_GRC_dict = {'GRCh37': 'GRCh37', 'GRCh38': 'GRCh38', 'hg19': 'GRCh37',
-                        'hg38': 'GRCh38'}  # standardise build names
-    if pd.isnull(build):
-        return None
-    else:
-        return build_2_GRC_dict.get(build)
diff --git a/pgscatalog_utils/scorefile/harmonised.py b/pgscatalog_utils/scorefile/harmonised.py
deleted file mode 100644
index b56fb93..0000000
--- a/pgscatalog_utils/scorefile/harmonised.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import logging
-import re
-
-import pandas as pd
-
-logger = logging.getLogger(__name__)
-
-
-def remap_harmonised(df: pd.DataFrame, use_harmonised) -> pd.DataFrame:
-    """ Replace original columns with harmonised data, if available and appropriate """
-
-    if any([re.match("hm_\\w+", x) for x in df.columns]) and use_harmonised:
-        logger.debug("Harmonised columns detected and used")
-        hm_colnames: dict[str: str] = {'hm_chr': 'chr_name', 'hm_pos': 'chr_position',
-                                       'hm_inferOtherAllele': 'other_allele'}
-
-        if 'other_allele' not in df or all(df['other_allele'].isnull()):
-            logger.debug("other_allele column contains no information, replacing with hm_inferOtherAllele")
-            return (df.drop(['chr_name', 'chr_position', 'other_allele'], axis=1, errors='ignore')
-                    .rename(hm_colnames, axis=1))
-        else:
-            logger.debug("other_allele column contains information, dropping hm_inferOtherAllele")
-            return (df.drop(['chr_name', 'chr_position', 'hm_inferOtherAllele'], axis=1, errors='ignore')
-                    .rename(hm_colnames, axis=1))
-    elif any([re.match("hm_\\w+", x) for x in df.columns]) and not use_harmonised:
-        logger.debug(f"Harmonised columns detected but not used (use_harmonised={use_harmonised})")
-        return df
-    else:
-        logger.debug("Harmonised columns not detected")
-        return df
diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py
new file mode 100644
index 0000000..7fc0e4e
--- /dev/null
+++ b/pgscatalog_utils/scorefile/header.py
@@ -0,0 +1,80 @@
+import gzip
+import pathlib
+from dataclasses import dataclass
+
+from pgscatalog_utils.download.GenomeBuild import GenomeBuild
+
+
+@dataclass
+class ScoringFileHeader:  # typed view of the '#key=value' header fields of a scoring file
+    pgs_id: str
+    pgp_id: str
+    trait_efo: str
+    trait_reported: str
+    trait_mapped: str
+    pgs_name: str
+    genome_build: GenomeBuild  # passed in as a string; converted in __post_init__
+    HmPOS_build: GenomeBuild  # passed in as a string or None; converted only when truthy
+    variants_number: int  # coerced with int() in __post_init__
+    format_version: str  # must equal '2.0', otherwise __post_init__ raises
+    citation: str
+
+    def __post_init__(self):  # coerce the raw header values, then validate the format version
+        self.variants_number = int(self.variants_number)
+        self.genome_build = GenomeBuild.from_string(self.genome_build)
+        if self.HmPOS_build:
+            self.HmPOS_build = GenomeBuild.from_string(self.HmPOS_build)
+
+        if self.format_version != '2.0':
+            raise Exception("Only support v2 format")
+
+    @classmethod
+    def from_path(cls, path: pathlib.Path):  # returns None when no recognised header key is found
+        raw_header: dict = raw_header_to_dict(read_header(path))
+        # only keep keys that are also dataclass fields (set intersection) ...
+        keep_keys = ScoringFileHeader.__annotations__.keys()
+        header_dict = {k: raw_header[k] for k in raw_header.keys() & keep_keys}
+        # ... so we can unpack the dict into a dataclass
+
+        if len(header_dict) > 1 and 'HmPOS_build' not in header_dict:
+            # working with pgs catalog formatted header but unharmonised data
+            header_dict['HmPOS_build'] = None
+
+        if header_dict:
+            return ScoringFileHeader(**header_dict)  # TypeError if any other field is missing
+        else:
+            # no header available
+            return None
+
+
+def raw_header_to_dict(header):
+    """Build {key: value} from '#key=value' lines, splitting on the first '=' only."""
+    d = {}
+    for key, value in (item.split('=', 1) for item in header):  # values may contain '='
+        d[key[1:]] = value  # drop # character from key
+    return d
+
+
+def read_header(path: pathlib.Path):
+    """Lazily yield the stripped '#key=value' header lines of a scoring file"""
+    with auto_open(path)(path, 'rt') as handle:  # auto_open picks gzip.open or open
+        for header_line in _gen_header_lines(handle):
+            yield header_line
+
+
+def _gen_header_lines(f):
+    """Yield stripped '#' lines that contain '=', stopping at the first non-'#' line"""
+    for raw_line in f:
+        if not raw_line.startswith('#'):
+            # the header block is over: stop reading lines
+            break
+        if '=' in raw_line:
+            yield raw_line.strip()
+
+
+def auto_open(filepath):
+    """Return gzip.open when filepath starts with bytes 1f 8b, otherwise the builtin open"""
+    with open(filepath, 'rb') as test_f:
+        magic = test_f.read(2)
+    # only the two leading bytes are inspected; the file is closed before returning
+    return gzip.open if magic == b'\x1f\x8b' else open
diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py
deleted file mode 100644
index 45258b1..0000000
--- a/pgscatalog_utils/scorefile/liftover.py
+++ /dev/null
@@ -1,103 +0,0 @@
-import logging
-import os
-
-import pandas as pd
-import pyliftover
-
-from pgscatalog_utils.scorefile.genome_build import annotate_build
-
-logger = logging.getLogger(__name__)
-
-
-def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: str) -> pd.DataFrame:
-    """ Liftover genomic coordinates to a different genome build """
-    df = annotate_build(df, target_build)  # get chain_target_build (e.g. in hg notation to match chain files)
-
-    mapped, unmapped = pd.DataFrame(), pd.DataFrame()
-    no_liftover: pd.DataFrame = df.query('chain_target_build == chain_genome_build')
-    to_liftover: pd.DataFrame = df.query('chain_target_build != chain_genome_build')
-
-    if no_liftover.empty:
-        logger.debug("Liftover required for all scorefile variants")
-    else:
-        logger.debug("Skipping liftover for scorefiles with same build as target genome")
-        no_liftover.loc[:, ['lifted_chr', 'lifted_pos']] = no_liftover[
-            ['chr_name', 'chr_position']]  # assume col structure
-        no_liftover.assign(liftover=None)
-
-    if to_liftover.empty:
-        logger.debug("Liftover skipped because no variants required it")
-    else:
-        lo: dict[str, pyliftover.LiftOver] = _create_liftover(chain_dir)  # loads chain files
-        logger.debug("Lifting over scoring files")
-        lifted: pd.DataFrame = to_liftover.apply(_convert_coordinates, axis=1, lo_dict=lo)
-        to_liftover = pd.concat([to_liftover, lifted], axis=1)
-        logger.debug("Liftover complete")
-
-        mapped: pd.DataFrame = (to_liftover[~to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)]
-                                .assign(liftover=True))
-        unmapped: pd.DataFrame = (to_liftover[to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] \
-                                  .assign(liftover=False))
-        _check_min_liftover(mapped, unmapped, min_lift)
-
-    return pd.concat([mapped, unmapped, no_liftover])
-
-
-def _check_min_liftover(mapped: pd.DataFrame, unmapped: pd.DataFrame, min_lift: float) -> None:
-    """ Check that liftover process met minimum parameters"""
-    df = pd.concat([mapped, unmapped])
-    n_variants: pd.DataFrame = (pd.DataFrame(df.groupby('accession')['liftover'].count())
-                                .reset_index()
-                                .rename({'liftover': 'n_var'}, axis=1))
-    lo_counts = (pd.DataFrame(df.groupby(['accession', 'liftover'])['liftover'].count()) \
-                 .rename_axis(['accession', 'liftover_status'])
-                 .reset_index())
-    summary: pd.DataFrame = lo_counts.merge(n_variants, on='accession')
-    summary['proportion'] = summary['liftover'] / summary['n_var']
-
-    for row in summary.query('liftover_status == True')[['accession', 'proportion']].itertuples():
-        if row.proportion < min_lift:
-            logger.error(f'Liftover failed for scorefile {row.accession}')
-            logger.error(f'{row.proportion} of variants lifted over, less than min_lift parameter ({min_lift})')
-            raise Exception
-        else:
-            logger.debug(f'Minimum liftover threshold passed for scorefile {row.accession}')
-
-
-def _convert_coordinates(df: pd.Series, lo_dict: dict[str, pyliftover.LiftOver]) -> pd.Series:
-    """ Convert genomic coordinates to different build """
-    converted: list[tuple[str, int, str, int]] | None
-
-    if df[['chr_name', 'chr_position']].isnull().values.any():
-        converted = None
-    else:
-        lo = lo_dict[df['chain_genome_build'] + df['chain_target_build']]  # extract lo object from dict
-        chrom: str = 'chr' + str(df['chr_name'])
-        pos: int = int(df['chr_position']) - 1  # liftOver is 0 indexed, VCF is 1 indexed
-        # converted example: [('chr22', 15460378, '+', 3320966530)] or None
-        converted = lo.convert_coordinate(chrom, pos)
-
-    if converted:
-        lifted_chrom: str = _parse_lifted_chrom(converted[0][0][3:])  # return first matching liftover
-        lifted_pos: int = int(converted[0][1]) + 1  # reverse 0 indexing
-        return pd.Series([lifted_chrom, lifted_pos], index=['lifted_chr', 'lifted_pos'])
-    else:
-        return pd.Series([None, None], index=['lifted_chr', 'lifted_pos'])
-
-
-def _parse_lifted_chrom(i: str) -> str:
-    """ Convert lifted chromosomes to tidy integers
-
-    liftover needs chr suffix for chromosome input (1 -> chr1), and it also
-    returns weird chromosomes sometimes (chr22 -> 22_KI270879v1_alt)
-    """
-    return i.split('_')[0]
-
-
-def _create_liftover(chain_dir: str) -> dict['str': pyliftover.LiftOver]:
-    """ Create LiftOver objects that can remap genomic coordinates """
-    builds: list[str] = ["hg19hg38", "hg38hg19"]
-    chains: list[str] = [os.path.join(chain_dir, x) for x in ["hg19ToHg38.over.chain.gz", "hg38ToHg19.over.chain.gz"]]
-    lo: list[pyliftover.LiftOver] = [pyliftover.LiftOver(x) for x in chains]
-    logger.debug("Chain files loaded for liftover")
-    return dict(zip(builds, lo))
diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index 68e511c..d40808f 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -1,92 +1,29 @@
 import logging
 
-import pandas as pd
-
 logger = logging.getLogger(__name__)
 
-
-def quality_control(df: pd.DataFrame, drop_missing: bool) -> pd.DataFrame:
-    """ Do quality control checks on a scorefile """
-    _check_shape(df)
-    _check_columns(df)
-    logger.debug("Quality control: checking for bad variants")
-    if drop_missing is True:
-        return (df.pipe(_drop_hla)
-                .pipe(_drop_missing_variants)
-                .pipe(_check_duplicate_identifiers)
-                .pipe(_drop_multiple_oa))
-    else:
-        return (df.pipe(_check_duplicate_identifiers)
-                .pipe(_drop_multiple_oa))
-
-
-def _drop_multiple_oa(df: pd.DataFrame) -> pd.DataFrame:
-    """ Set alleles to None in hm_inferOtherAllele if they contain multiple alleles
-
-    e.g. A / C / T -> None; A -> A; A / C -> None
-    """
-    if 'other_allele' in df:
-        if df['other_allele'].str.contains('/').any():
-            logger.debug("Multiple inferred other alleles detected, dropping other alleles for ambiguous variants")
-            df['other_allele'] = df['other_allele'].replace(regex='.+\\/.+', value=None)
-            return df
+def drop_hla(variants):  # generator: passes variants through, skipping 'P'/'N' effect alleles
+    logger.info("Checking for HLA alleles")
+    for variant in variants:
+        if variant['effect_allele'] not in ('P', 'N'):  # "!= 'P' or != 'N'" was always true
+            yield variant
         else:
-            logger.debug("Only single other alleles detected.")
-            return df
-    else:
-        logger.warning("No other allele data detected, skipping QC of other allele")
-        return df
-
-
-def _drop_missing_variants(df: pd.DataFrame) -> pd.DataFrame:
-    no_na: pd.DataFrame = df.dropna(subset=['chr_name', 'chr_position', 'effect_weight'])
-    n_dropped = df.shape[0] - no_na.shape[0]
-
-    if n_dropped > 0:
-        logger.warning(f"{n_dropped} variants with missing values detected and dropped from scoring file")
-
-    return no_na
-
-
-def _drop_hla(df: pd.DataFrame) -> pd.DataFrame:
-    """ Drop HLA effect alleles with present / absent encoding """
-
-    no_hla: pd.DataFrame = df.query('effect_allele != "P" | effect_allele != "N"')
-
-    if df.shape[0] > no_hla.shape[0]:
-        logger.debug("HLA alleles detected and dropped")
-
-    return no_hla
-
-
-def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame:
-    if 'other_allele' in df:
-        logger.debug("Other allele column detected, including other_allele in variant identifier")
-        group_cols = ['chr_name', 'chr_position', 'effect_allele', 'other_allele']
-    else:
-        logger.warning("Other allele column not detected, dropping other_allele from variant identifier.")
-        group_cols = ['chr_name', 'chr_position', 'effect_allele']
-
-    u_count: pd.Series = df.groupby(group_cols).size()
-
-    if all(u_count == 1):
-        return df.assign(is_duplicated=False)
-    else:
-        logger.warning("Duplicate variants in scoring file: {}".format(df['filename_prefix'].unique()))
-        u_count = u_count > 1
-        u_count.name = 'is_duplicated'
-        df = pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True)
-        df.loc[df.is_duplicated.isnull(), 'is_duplicated'] = False  # handles variants with null chr/pos
-        return df
+            logger.warning("HLA alleles detected and dropped")
 
 
-def _check_shape(df: pd.DataFrame) -> None:
-    assert len(df.columns) > 1, "ERROR: scorefile not formatted correctly (0 columns)"
-    assert df.shape[0] > 0, "ERROR: No variants detected in input file (0 rows)"
+def check_effect_weight(variants):
+    """Yield variants unchanged, raising ValueError when effect_weight is not float-parseable"""
+    for variant in variants:
+        try:
+            float(variant['effect_weight'])  # validation only: the converted value is discarded
+        except ValueError:
+            logger.critical(f"{variant} has bad effect weight")
+            raise ValueError
+        yield variant
 
 
-def _check_columns(df: pd.DataFrame) -> None:
-    assert {'chr_name', 'chr_position'}.issubset(df.columns), "Missing chromosomal positions. If you're " \
-                                                              "using PGS Catalog files with rsIDs you should request " \
-                                                              "harmonised data files (HmPOS) instead."
-    assert 'effect_allele' in df, "ERROR: Missing effect allele column"
+def assign_other_allele(variants):
+    """Yield each variant with an 'other_allele' key, set to None when it was absent"""
+    for record in variants:
+        record.setdefault('other_allele', None)  # an existing value is left untouched
+        yield record
diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py
deleted file mode 100644
index dbd559b..0000000
--- a/pgscatalog_utils/scorefile/read.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import gzip
-import io
-import logging
-import os
-
-import pandas as pd
-
-logger = logging.getLogger(__name__)
-
-
-def load_scorefile(path: str) -> tuple[dict, pd.DataFrame]:
-    logger.debug(f'Reading scorefile {path}')
-    df = pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False)
-    return (_read_header(path),
-            df.assign(filename_prefix=get_scorefile_basename(path), filename=path, row_nr=df.index))
-
-
-def _read_header(path: str) -> dict:
-    """Parses the header of a PGS Catalog format scorefle into a dictionary"""
-    f = io.TextIOWrapper(gzip.open(path, 'r'))
-    try:
-        f.readline()
-    except gzip.BadGzipFile:
-        f = open(path, 'r')
-
-    header = {}
-    lastline = '#'
-    while lastline.startswith('#'):
-        lastline = f.readline()
-        line = lastline.strip()
-        if line.startswith('#'):
-            if '=' in line:
-                line = line[1:].split('=')
-                field, val = [x.strip() for x in line]
-                if field in remap_header:
-                    header[remap_header[field]] = val
-                else:
-                    header[field] = val
-
-    if ('genome_build' in header) and (header['genome_build'] == 'NR'):
-        header['genome_build'] = None
-    f.close()
-    return header
-
-
-def _scorefile_dtypes() -> dict[str]:
-    """ Data types for columns that might be found in a scorefile """
-    return {'rsID': str, 'chr_name': str, 'chr_position': pd.UInt64Dtype(), 'effect_allele': 'str',
-            'effect_weight': float, 'locus_name': str, 'OR': float, 'hm_source': str, 'hm_rsID': str,
-            'hm_chr': str, 'hm_pos': pd.UInt64Dtype(), 'hm_inferOtherAllele': str}
-
-
-def get_scorefile_basename(path: str) -> str:
-    """ Return the basename of a scoring file without extension """
-    filename = os.path.basename(path)
-    if filename.endswith('.txt.gz'):
-        filename = filename.replace('.txt.gz', '')
-    elif filename.endswith('.txt'):
-        filename = filename.replace('.txt', '')
-    return filename
-
-
-remap_header = {
-    'PGS ID': 'pgs_id',
-    'PGS Name': 'pgs_name',
-    'Reported Trait': 'trait_reported',
-    'Original Genome Build': 'genome_build',
-    'Number of Variants': 'variants_number',
-    'PGP ID': 'pgp_id',
-    'Citation': 'citation',
-    'LICENSE': 'license',
-    # Harmonization related
-    'HmPOS Build': 'HmPOS_build',
-    'HmPOS Date': 'HmPOS_date',
-    'HmVCF Reference': 'HmVCF_ref',
-    'HmVCF Date': 'HmVCF_date',
-    'HmVCF N Matched Variants': 'HmVCF_n_matched',
-    'HmVCF N Unmapped Variants': 'HmVCF_n_unmapped'
-}  # Used to maintain reverse compatibility to old scoring files
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
new file mode 100644
index 0000000..7f9c864
--- /dev/null
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -0,0 +1,144 @@
+import csv
+import gzip
+import logging
+import os
+import pathlib
+import typing
+from dataclasses import dataclass
+
+from pgscatalog_utils.download.GenomeBuild import GenomeBuild
+from pgscatalog_utils.scorefile.header import ScoringFileHeader, auto_open
+from pgscatalog_utils.scorefile.qc import drop_hla, check_effect_weight, \
+    assign_other_allele
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ScoringFile:
+    path: pathlib.Path
+    name: str
+    header: typing.Union[ScoringFileHeader, None]
+    genome_build: typing.Union[GenomeBuild, None]
+    harmonised: bool
+    fields: list[str]
+    variants: typing.Generator
+
+    def __post_init__(self):
+        if self.header.HmPOS_build:
+            logger.info(
+                f"{self.path} harmonised data detected: {self.header.HmPOS_build}")
+            self.genome_build = self.header.HmPOS_build
+
+        mandatory_columns = {'chr_name', 'effect_allele', 'effect_weight'}
+        if not mandatory_columns.issubset(self.fields) not in self.fields:
+            err_msg = f"{self.path} missing fields"
+            raise Exception(err_msg)
+
+    @classmethod
+    def from_path(cls, path: pathlib.Path):
+        header = ScoringFileHeader.from_path(path)
+        if header:
+            name = header.pgs_id
+            if header.HmPOS_build:
+                harmonised = True
+                genome_build = header.HmPOS_build
+            else:
+                harmonised = False
+                genome_build = header.genome_build
+        else:
+            harmonised = False
+            genome_build = None
+            name = os.path.basename(path).split('.')[0]
+
+        start_line, cols = get_columns(path)
+
+        # generate variants (a list of dicts, one for each variants)
+        variants = ScoringFile.read_variants(path=path, start_line=start_line,
+                                             fields=cols, name=name)
+
+        # note: these generator expressions aren't doing a bunch of iterations
+        # it's just a data processing pipeline
+        variants = remap_harmonised(variants)
+
+        # quality control
+        variants = drop_hla(variants)
+        variants = check_effect_weight(variants)
+        variants = assign_other_allele(variants)
+
+        return cls(path=path, header=header, genome_build=genome_build,
+                   harmonised=harmonised,
+                   fields=cols,
+                   variants=variants,
+                   name=name)
+
+    @staticmethod
+    def read_variants(path, fields, start_line, name: str):
+        open_function = auto_open(path)
+        with open_function(path, 'rt') as f:
+            logger.info(f"Generating variants from {path}")
+            csv_reader = csv.reader(f, delimiter='\t')
+            for i, row in enumerate(csv_reader):
+                if i > start_line:
+                    variant = dict(zip(fields, row)) | {'name': name}
+                    keys = ["chr_name", "chr_position", "effect_allele", "other_allele",
+                            "effect_weight", "hm_chr", "hm_pos", "hm_inferOtherAllele",
+                            "name", "is_dominant", "is_recessive"]
+                    yield {k: variant[k] for k in keys if k in variant}
+
+    @staticmethod
+    def write_combined(scoring_files, out_path):
+        if out_path.endswith("gz"):
+            open_function = gzip.open
+        else:
+            open_function = open
+
+        with open_function(out_path, 'wt') as f:
+            fieldnames = ["name", "chr_name", "chr_position", "effect_allele",
+                          "other_allele",
+                          "effect_weight"]
+            writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
+            writer.writeheader()
+
+            # write out in chunks for compression efficiency and speed
+            chunk_size = 10000
+            chunk = []
+            for scoring_file in scoring_files:
+                logger.info(f"Writing variants from scoring file {scoring_file}")
+                for variant in scoring_file.variants:
+                    chunk.append(variant)
+                    if len(chunk) == chunk_size:
+                        writer.writerows(chunk)
+                        chunk = []
+                # handle last chunk
+                if chunk:
+                    writer.writerows(chunk)
+
+
+def remap_harmonised(variants):
+    logger.info("Using harmonised data if available")
+    for variant in variants:
+        # _always_ use harmonised information, even if missing
+        if 'hm_chr' in variant:
+            variant['chr_name'] = variant['hm_chr']
+
+        if 'hm_pos' in variant:
+            variant['chr_position'] = variant['hm_pos']
+
+        if 'hm_inferOtherAllele' in variant and variant.get('other_allele') is None:
+            logger.warning("Replacing missing other_allele with inferred other allele")
+            variant['other_allele'] = variant['hm_inferOtherAllele']
+
+        yield {k: v for k, v in variant.items() if not k.startswith("hm")}
+
+
+def get_columns(path) -> tuple[int, list[str]]:
+    open_function = auto_open(path)
+    with open_function(path, 'rt') as f:
+        for i, line in enumerate(f):
+            if line.startswith('#'):
+                continue
+            return i, line.strip().split('\t')
+
+
diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py
deleted file mode 100644
index 8a3233b..0000000
--- a/pgscatalog_utils/scorefile/write.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import logging
-import os
-
-import pandas as pd
-
-logger = logging.getLogger(__name__)
-
-
-def write_scorefile(df: pd.DataFrame, path: str) -> None:
-    cols: list[str] = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type',
-                       'is_duplicated', 'accession', 'row_nr']
-
-    if os.path.exists(path):
-        logger.debug("Output file exists: setting write mode to append")
-        write_mode = 'a'
-        header = False
-    else:
-        logger.debug("Output file doesn't exist: setting write mode to write (create new file)")
-        write_mode = 'w'
-        header = True
-
-    out_df: pd.DataFrame = (df.drop('accession', axis=1)
-                            .rename({'filename_prefix': 'accession'}, axis=1)
-                            .pipe(_filter_failed_liftover))
-
-    if 'other_allele' not in out_df:
-        logger.warning("No other allele information detected, writing out as missing data")
-        out_df['other_allele'] = None
-
-    if path.endswith('.gz'):
-        logger.debug("Writing out gzip-compressed combined scorefile")
-        out_df[cols].to_csv(path, index=False, sep="\t", compression='gzip', mode=write_mode, header=header)
-    else:
-        logger.debug("Writing out combined scorefile")
-        out_df[cols].to_csv(path, index=False, sep="\t", mode=write_mode, header=header)
-
-
-def _filter_failed_liftover(df: pd.DataFrame) -> pd.DataFrame:
-    if 'liftover' in df:
-        logger.debug("Filtering variants that failed liftover")
-        return df.query('liftover == True')
-    else:
-        return df

From 65b2b798e8187e5de7b478a91c4c28db591c4945 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Mon, 30 Oct 2023 17:16:14 +0000
Subject: [PATCH 02/40] set up effect types

---
 pgscatalog_utils/scorefile/qc.py          | 46 ++++++++++++++++++++++-
 pgscatalog_utils/scorefile/scoringfile.py | 31 +++------------
 2 files changed, 50 insertions(+), 27 deletions(-)

diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index d40808f..2e51008 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -1,5 +1,6 @@
 import logging
 
+
 logger = logging.getLogger(__name__)
 
 def drop_hla(variants):
@@ -12,10 +13,10 @@ def drop_hla(variants):
 
 
 def check_effect_weight(variants):
-    # don't actually use converted value
+    logger.info("Checking effect weights")
     for variant in variants:
         try:
-            float(variant['effect_weight'])
+            variant['effect_weight'] = float(variant['effect_weight'])
         except ValueError:
             logger.critical(f"{variant} has bad effect weight")
             raise ValueError
@@ -27,3 +28,44 @@ def assign_other_allele(variants):
         if 'other_allele' not in variant:
             variant['other_allele'] = None
         yield variant
+
+def assign_effect_type(variants):
+    logger.info("Assigning effect types")
+    for variant in variants:
+        if 'is_recessive' not in variant and 'is_dominant' not in variant:
+            variant['effect_type'] = 'additive'
+
+        if 'is_recessive' in variant or 'is_dominant' in variant:
+            logger.info("Recessive or dominant variant detected")
+            if variant['is_recessive']:
+                variant['effect_type'] = 'recessive'
+            elif variant['is_dominant']:
+                variant['effect_type'] = 'dominant'
+            elif variant['is_recessive'] and variant['is_dominant']:
+                logger.critical(f"Bad effect type setting: {variant}")
+                raise Exception
+
+            variant.pop('is_recessive')
+            variant.pop('is_dominant')
+
+        yield variant
+
+
+def remap_harmonised(variants, harmonised: bool):
+    if harmonised:
+        logger.info("Using harmonised data fields")
+    else:
+        logger.info("Harmonised data fields not available")
+
+    for variant in variants:
+        if harmonised:
+            variant['chr_name'] = variant['hm_chr']
+            variant['chr_position'] = variant['hm_pos']
+
+            if 'hm_inferOtherAllele' in variant and variant.get('other_allele') is None:
+                logger.debug("Replacing missing other_allele with inferred other allele")
+                variant['other_allele'] = variant['hm_inferOtherAllele']
+
+            yield {k: v for k, v in variant.items() if not k.startswith("hm")}
+        else:
+            yield variant
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index 7f9c864..db573e9 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -9,7 +9,7 @@
 from pgscatalog_utils.download.GenomeBuild import GenomeBuild
 from pgscatalog_utils.scorefile.header import ScoringFileHeader, auto_open
 from pgscatalog_utils.scorefile.qc import drop_hla, check_effect_weight, \
-    assign_other_allele
+    assign_other_allele, assign_effect_type, remap_harmonised
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -55,15 +55,15 @@ def from_path(cls, path: pathlib.Path):
         start_line, cols = get_columns(path)
 
         # generate variants (a list of dicts, one for each variants)
+        logger.info(f"Lazily reading variants from {path}")
         variants = ScoringFile.read_variants(path=path, start_line=start_line,
                                              fields=cols, name=name)
 
         # note: these generator expressions aren't doing a bunch of iterations
         # it's just a data processing pipeline
-        variants = remap_harmonised(variants)
-
-        # quality control
+        variants = remap_harmonised(variants, harmonised)
         variants = drop_hla(variants)
+        variants = assign_effect_type(variants)
         variants = check_effect_weight(variants)
         variants = assign_other_allele(variants)
 
@@ -77,7 +77,6 @@ def from_path(cls, path: pathlib.Path):
     def read_variants(path, fields, start_line, name: str):
         open_function = auto_open(path)
         with open_function(path, 'rt') as f:
-            logger.info(f"Generating variants from {path}")
             csv_reader = csv.reader(f, delimiter='\t')
             for i, row in enumerate(csv_reader):
                 if i > start_line:
@@ -96,8 +95,7 @@ def write_combined(scoring_files, out_path):
 
         with open_function(out_path, 'wt') as f:
             fieldnames = ["name", "chr_name", "chr_position", "effect_allele",
-                          "other_allele",
-                          "effect_weight"]
+                          "other_allele", "effect_weight", "effect_type"]
             writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
             writer.writeheader()
 
@@ -105,7 +103,7 @@ def write_combined(scoring_files, out_path):
             chunk_size = 10000
             chunk = []
             for scoring_file in scoring_files:
-                logger.info(f"Writing variants from scoring file {scoring_file}")
+                logger.info(f"Writing {scoring_file.name} variants")
                 for variant in scoring_file.variants:
                     chunk.append(variant)
                     if len(chunk) == chunk_size:
@@ -116,23 +114,6 @@ def write_combined(scoring_files, out_path):
                     writer.writerows(chunk)
 
 
-def remap_harmonised(variants):
-    logger.info("Using harmonised data if available")
-    for variant in variants:
-        # _always_ use harmonised information, even if missing
-        if 'hm_chr' in variant:
-            variant['chr_name'] = variant['hm_chr']
-
-        if 'hm_pos' in variant:
-            variant['chr_position'] = variant['hm_pos']
-
-        if 'hm_inferOtherAllele' in variant and variant.get('other_allele') is None:
-            logger.warning("Replacing missing other_allele with inferred other allele")
-            variant['other_allele'] = variant['hm_inferOtherAllele']
-
-        yield {k: v for k, v in variant.items() if not k.startswith("hm")}
-
-
 def get_columns(path) -> tuple[int, list[str]]:
     open_function = auto_open(path)
     with open_function(path, 'rt') as f:

From 227f32f31611b03f81596908ae01d669cbd1e17f Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Tue, 31 Oct 2023 14:14:42 +0000
Subject: [PATCH 03/40] profiling improvements

---
 .../scorefile/combine_scorefiles.py           | 10 +++-
 pgscatalog_utils/scorefile/config.py          |  4 +-
 pgscatalog_utils/scorefile/header.py          | 14 ++++-
 pgscatalog_utils/scorefile/qc.py              | 22 ++++----
 pgscatalog_utils/scorefile/scoringfile.py     | 56 ++++++-------------
 pgscatalog_utils/scorefile/write.py           | 42 ++++++++++++++
 6 files changed, 92 insertions(+), 56 deletions(-)
 create mode 100644 pgscatalog_utils/scorefile/write.py

diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py
index 7dd5fc8..82ed66e 100644
--- a/pgscatalog_utils/scorefile/combine_scorefiles.py
+++ b/pgscatalog_utils/scorefile/combine_scorefiles.py
@@ -6,7 +6,9 @@
 
 from pgscatalog_utils.config import set_logging_level
 from pgscatalog_utils.download.GenomeBuild import GenomeBuild
+from pgscatalog_utils.scorefile.config import Config
 from pgscatalog_utils.scorefile.scoringfile import ScoringFile
+from pgscatalog_utils.scorefile.write import write_combined
 
 
 def combine_scorefiles():
@@ -15,6 +17,9 @@ def combine_scorefiles():
     logger = logging.getLogger(__name__)
     set_logging_level(args.verbose)
 
+    Config.threads = args.threads
+    Config.batch_size = 20000
+
     paths: list[str] = list(set(args.scorefiles))  # unique paths only
     logger.debug(f"Input scorefiles: {paths}")
 
@@ -29,7 +34,8 @@ def combine_scorefiles():
     else:
         logger.info(f"All builds match target build {target_build}")
 
-    ScoringFile.write_combined(sfs, args.outfile)
+    write_combined(sfs, args.outfile)
+
     end_time = time.time()
     elapsed_time = end_time - start_time
     print(f"Elapsed time: {elapsed_time} seconds")
@@ -75,6 +81,8 @@ def _parse_args(args=None) -> argparse.Namespace:
     parser.add_argument('-m', '--min_lift', dest='min_lift',
                         help='<Optional> If liftover, minimum proportion of variants lifted over',
                         required="--liftover" in sys.argv, default=0.95, type=float)
+    parser.add_argument('--threads', dest='threads', required=False,
+                        default=1, type=int)
     parser.add_argument('--drop_missing', dest='drop_missing', action='store_true',
                         help='<Optional> Drop variants with missing information (chr/pos) and '
                              'non-standard alleles (e.g. HLA=P/N) from the output file.')
diff --git a/pgscatalog_utils/scorefile/config.py b/pgscatalog_utils/scorefile/config.py
index 8dee364..10bc5b3 100644
--- a/pgscatalog_utils/scorefile/config.py
+++ b/pgscatalog_utils/scorefile/config.py
@@ -3,7 +3,9 @@
 
 @dataclass
 class Config:
+    threads: int
     drop_missing: bool
     liftover: bool
     chain_dir: str
-    min_lift: float
\ No newline at end of file
+    min_lift: float
+    batch_size: int
\ No newline at end of file
diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py
index 7fc0e4e..bf9447b 100644
--- a/pgscatalog_utils/scorefile/header.py
+++ b/pgscatalog_utils/scorefile/header.py
@@ -2,6 +2,9 @@
 import pathlib
 from dataclasses import dataclass
 
+from pgscatalog_utils.scorefile.config import Config
+from pgzip import pgzip
+
 from pgscatalog_utils.download.GenomeBuild import GenomeBuild
 
 
@@ -75,6 +78,13 @@ def _gen_header_lines(f):
 def auto_open(filepath):
     with open(filepath, 'rb') as test_f:
         if test_f.read(2) == b'\x1f\x8b':
-            return gzip.open
+            gzipped = True
         else:
-            return open
+            gzipped = False
+
+    if gzipped and Config.threads > 1:
+        return gzip.open
+    elif gzipped:
+        return pgzip.open
+    elif not gzipped:
+        return open
\ No newline at end of file
diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index 2e51008..69eb660 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -3,8 +3,16 @@
 
 logger = logging.getLogger(__name__)
 
+def quality_control(variants, harmonised):
+    variants = remap_harmonised(variants, harmonised)
+    variants = drop_hla(variants)
+    variants = assign_effect_type(variants)
+    variants = check_effect_weight(variants)
+    variants = assign_other_allele(variants)
+    return variants
+
+
 def drop_hla(variants):
-    logger.info("Checking for HLA alleles")
     for variant in variants:
         if variant['effect_allele'] != 'P' or variant['effect_allele'] != 'N':
             yield variant
@@ -13,7 +21,6 @@ def drop_hla(variants):
 
 
 def check_effect_weight(variants):
-    logger.info("Checking effect weights")
     for variant in variants:
         try:
             variant['effect_weight'] = float(variant['effect_weight'])
@@ -30,7 +37,6 @@ def assign_other_allele(variants):
         yield variant
 
 def assign_effect_type(variants):
-    logger.info("Assigning effect types")
     for variant in variants:
         if 'is_recessive' not in variant and 'is_dominant' not in variant:
             variant['effect_type'] = 'additive'
@@ -45,18 +51,10 @@ def assign_effect_type(variants):
                 logger.critical(f"Bad effect type setting: {variant}")
                 raise Exception
 
-            variant.pop('is_recessive')
-            variant.pop('is_dominant')
-
         yield variant
 
 
 def remap_harmonised(variants, harmonised: bool):
-    if harmonised:
-        logger.info("Using harmonised data fields")
-    else:
-        logger.info("Harmonised data fields not available")
-
     for variant in variants:
         if harmonised:
             variant['chr_name'] = variant['hm_chr']
@@ -66,6 +64,6 @@ def remap_harmonised(variants, harmonised: bool):
                 logger.debug("Replacing missing other_allele with inferred other allele")
                 variant['other_allele'] = variant['hm_inferOtherAllele']
 
-            yield {k: v for k, v in variant.items() if not k.startswith("hm")}
+            yield variant
         else:
             yield variant
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index db573e9..54fed17 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -1,15 +1,16 @@
 import csv
-import gzip
 import logging
 import os
 import pathlib
 import typing
 from dataclasses import dataclass
+from itertools import islice
+
+from pgscatalog_utils.scorefile.config import Config
 
 from pgscatalog_utils.download.GenomeBuild import GenomeBuild
 from pgscatalog_utils.scorefile.header import ScoringFileHeader, auto_open
-from pgscatalog_utils.scorefile.qc import drop_hla, check_effect_weight, \
-    assign_other_allele, assign_effect_type, remap_harmonised
+from pgscatalog_utils.scorefile.qc import quality_control
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -61,11 +62,7 @@ def from_path(cls, path: pathlib.Path):
 
         # note: these generator expressions aren't doing a bunch of iterations
         # it's just a data processing pipeline
-        variants = remap_harmonised(variants, harmonised)
-        variants = drop_hla(variants)
-        variants = assign_effect_type(variants)
-        variants = check_effect_weight(variants)
-        variants = assign_other_allele(variants)
+        variants = quality_control(variants, harmonised)
 
         return cls(path=path, header=header, genome_build=genome_build,
                    harmonised=harmonised,
@@ -77,42 +74,23 @@ def from_path(cls, path: pathlib.Path):
     def read_variants(path, fields, start_line, name: str):
         open_function = auto_open(path)
         with open_function(path, 'rt') as f:
-            csv_reader = csv.reader(f, delimiter='\t')
-            for i, row in enumerate(csv_reader):
-                if i > start_line:
+            for _ in range(start_line + 1):
+                # skip header
+                next(f)
+
+            while True:
+                batch = list(islice(f, Config.batch_size))
+                if not batch:
+                    break
+
+                csv_reader = csv.reader(batch, delimiter='\t')
+                for i, row in enumerate(csv_reader):
                     variant = dict(zip(fields, row)) | {'name': name}
                     keys = ["chr_name", "chr_position", "effect_allele", "other_allele",
                             "effect_weight", "hm_chr", "hm_pos", "hm_inferOtherAllele",
                             "name", "is_dominant", "is_recessive"]
                     yield {k: variant[k] for k in keys if k in variant}
 
-    @staticmethod
-    def write_combined(scoring_files, out_path):
-        if out_path.endswith("gz"):
-            open_function = gzip.open
-        else:
-            open_function = open
-
-        with open_function(out_path, 'wt') as f:
-            fieldnames = ["name", "chr_name", "chr_position", "effect_allele",
-                          "other_allele", "effect_weight", "effect_type"]
-            writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
-            writer.writeheader()
-
-            # write out in chunks for compression efficiency and speed
-            chunk_size = 10000
-            chunk = []
-            for scoring_file in scoring_files:
-                logger.info(f"Writing {scoring_file.name} variants")
-                for variant in scoring_file.variants:
-                    chunk.append(variant)
-                    if len(chunk) == chunk_size:
-                        writer.writerows(chunk)
-                        chunk = []
-                # handle last chunk
-                if chunk:
-                    writer.writerows(chunk)
-
 
 def get_columns(path) -> tuple[int, list[str]]:
     open_function = auto_open(path)
@@ -121,5 +99,3 @@ def get_columns(path) -> tuple[int, list[str]]:
             if line.startswith('#'):
                 continue
             return i, line.strip().split('\t')
-
-
diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py
new file mode 100644
index 0000000..df17bdc
--- /dev/null
+++ b/pgscatalog_utils/scorefile/write.py
@@ -0,0 +1,42 @@
+import csv
+import functools
+import gzip
+import logging
+from itertools import islice
+
+import pgzip
+
+from pgscatalog_utils.scorefile.config import Config
+from pgscatalog_utils.scorefile.scoringfile import ScoringFile
+
+logger = logging.getLogger(__name__)
+
+
+def write_combined(scoring_files: list[ScoringFile], out_path: str):
+    # compresslevel can be really slow, default is 9
+    if out_path.endswith("gz") and Config.threads == 1:
+        logger.info("Writing with gzip (slow)")
+        open_function = functools.partial(gzip.open, compresslevel=6)
+    elif Config.threads > 1:
+        logger.info("Writing with pgzip (fast)")
+        open_function = functools.partial(pgzip.open, compresslevel=6,
+                                          thread=Config.threads, blocksize=2 * 10 ** 8)
+    else:
+        logger.info("Writing text file (fast)")
+        open_function = open
+
+    with open_function(out_path, mode='wt') as f:
+        fieldnames = ["name", "chr_name", "chr_position", "effect_allele",
+                      "other_allele", "effect_weight", "effect_type"]
+        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t',
+                                extrasaction='ignore')
+        writer.writeheader()
+
+        # write out in batches for compression efficiency and speed
+        for scoring_file in scoring_files:
+            logger.info(f"Writing {scoring_file.name} variants")
+            while True:
+                batch = list(islice(scoring_file.variants, Config.batch_size))
+                if not batch:
+                    break
+                writer.writerows(batch)

From 1349d0aff27fe430b5f99685201ba7fc70c43336 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Tue, 31 Oct 2023 14:58:45 +0000
Subject: [PATCH 04/40] fix output

---
 pgscatalog_utils/scorefile/combine_scorefiles.py |  2 +-
 pgscatalog_utils/scorefile/header.py             |  5 +++--
 pgscatalog_utils/scorefile/scoringfile.py        | 15 +++++++--------
 pgscatalog_utils/scorefile/write.py              |  6 +++---
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py
index 82ed66e..6650459 100644
--- a/pgscatalog_utils/scorefile/combine_scorefiles.py
+++ b/pgscatalog_utils/scorefile/combine_scorefiles.py
@@ -27,7 +27,7 @@ def combine_scorefiles():
     sfs = [ScoringFile.from_path(x) for x in paths]
 
     target_build = GenomeBuild.from_string(args.target_build)
-    bad_builds = [x.name for x in sfs if x.genome_build != target_build]
+    bad_builds = [x.accession for x in sfs if x.genome_build != target_build]
     for bad_file in bad_builds:
         logger.critical(f"{bad_file} doesn't match {target_build}, can't combine")
         raise Exception
diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py
index bf9447b..78259c6 100644
--- a/pgscatalog_utils/scorefile/header.py
+++ b/pgscatalog_utils/scorefile/header.py
@@ -1,3 +1,4 @@
+import functools
 import gzip
 import pathlib
 from dataclasses import dataclass
@@ -83,8 +84,8 @@ def auto_open(filepath):
             gzipped = False
 
     if gzipped and Config.threads > 1:
-        return gzip.open
+        return functools.partial(pgzip.open, thread=Config.threads)
     elif gzipped:
-        return pgzip.open
+        return gzip.open
     elif not gzipped:
         return open
\ No newline at end of file
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index 54fed17..4122022 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -19,7 +19,7 @@
 @dataclass
 class ScoringFile:
     path: pathlib.Path
-    name: str
+    accession: str
     header: typing.Union[ScoringFileHeader, None]
     genome_build: typing.Union[GenomeBuild, None]
     harmonised: bool
@@ -40,8 +40,8 @@ def __post_init__(self):
     @classmethod
     def from_path(cls, path: pathlib.Path):
         header = ScoringFileHeader.from_path(path)
+        name = os.path.basename(path).split('.')[0]
         if header:
-            name = header.pgs_id
             if header.HmPOS_build:
                 harmonised = True
                 genome_build = header.HmPOS_build
@@ -51,7 +51,6 @@ def from_path(cls, path: pathlib.Path):
         else:
             harmonised = False
             genome_build = None
-            name = os.path.basename(path).split('.')[0]
 
         start_line, cols = get_columns(path)
 
@@ -68,12 +67,12 @@ def from_path(cls, path: pathlib.Path):
                    harmonised=harmonised,
                    fields=cols,
                    variants=variants,
-                   name=name)
+                   accession=name)
 
     @staticmethod
     def read_variants(path, fields, start_line, name: str):
         open_function = auto_open(path)
-        with open_function(path, 'rt') as f:
+        with open_function(path, mode='rt') as f:
             for _ in range(start_line + 1):
                 # skip header
                 next(f)
@@ -85,16 +84,16 @@ def read_variants(path, fields, start_line, name: str):
 
                 csv_reader = csv.reader(batch, delimiter='\t')
                 for i, row in enumerate(csv_reader):
-                    variant = dict(zip(fields, row)) | {'name': name}
+                    variant = dict(zip(fields, row)) | {'accession': name, "row_nr": i }
                     keys = ["chr_name", "chr_position", "effect_allele", "other_allele",
                             "effect_weight", "hm_chr", "hm_pos", "hm_inferOtherAllele",
-                            "name", "is_dominant", "is_recessive"]
+                            "is_dominant", "is_recessive", "accession", "row_nr"]
                     yield {k: variant[k] for k in keys if k in variant}
 
 
 def get_columns(path) -> tuple[int, list[str]]:
     open_function = auto_open(path)
-    with open_function(path, 'rt') as f:
+    with open_function(path, mode='rt') as f:
         for i, line in enumerate(f):
             if line.startswith('#'):
                 continue
diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py
index df17bdc..9345a87 100644
--- a/pgscatalog_utils/scorefile/write.py
+++ b/pgscatalog_utils/scorefile/write.py
@@ -26,15 +26,15 @@ def write_combined(scoring_files: list[ScoringFile], out_path: str):
         open_function = open
 
     with open_function(out_path, mode='wt') as f:
-        fieldnames = ["name", "chr_name", "chr_position", "effect_allele",
-                      "other_allele", "effect_weight", "effect_type"]
+        fieldnames = ["chr_name", "chr_position", "effect_allele",
+                      "other_allele", "effect_weight", "effect_type", "accession", "row_nr"]
         writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t',
                                 extrasaction='ignore')
         writer.writeheader()
 
         # write out in batches for compression efficiency and speed
         for scoring_file in scoring_files:
-            logger.info(f"Writing {scoring_file.name} variants")
+            logger.info(f"Writing {scoring_file.accession} variants")
             while True:
                 batch = list(islice(scoring_file.variants, Config.batch_size))
                 if not batch:

From 669eb8d76bedaf4544cf21f6eead9463e5aeba6d Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Wed, 1 Nov 2023 11:10:18 +0000
Subject: [PATCH 05/40] check for duplicates

---
 pgscatalog_utils/scorefile/qc.py          | 47 +++++++++++++++++++++--
 pgscatalog_utils/scorefile/scoringfile.py |  6 ++-
 2 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index 69eb660..a146dcc 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -1,17 +1,45 @@
 import logging
 
-
 logger = logging.getLogger(__name__)
 
+
 def quality_control(variants, harmonised):
     variants = remap_harmonised(variants, harmonised)
     variants = drop_hla(variants)
     variants = assign_effect_type(variants)
     variants = check_effect_weight(variants)
     variants = assign_other_allele(variants)
+    variants = check_duplicates(variants)
     return variants
 
 
+def check_duplicates(variants):
+    seen_ids = set()
+    current_accession = None
+
+    for variant in variants:
+        accession = variant['accession']
+
+        if accession != current_accession:
+            seen_ids = set()
+            current_accession = accession
+
+        # None other allele -> empty string
+        id = ":".join([str(variant[k] or "") for k in
+                       ['chr_name', 'chr_position', 'effect_allele', 'other_allele']])
+
+        if id in seen_ids:
+            logger.warning(
+                f"Duplicate variant found: {variant['accession']}: {id} {variant['row_nr']}")
+            variant['is_duplicated'] = True
+        else:
+            variant['is_duplicated'] = False
+
+        seen_ids.add(id)
+
+        yield variant
+
+
 def drop_hla(variants):
     for variant in variants:
         if variant['effect_allele'] != 'P' or variant['effect_allele'] != 'N':
@@ -32,10 +60,16 @@ def check_effect_weight(variants):
 
 def assign_other_allele(variants):
     for variant in variants:
+        if 'other_allele' in variant:
+            if '/' in variant['other_allele']:
+                # drop multiple other alleles
+                variant['other_allele'] = None
+
         if 'other_allele' not in variant:
             variant['other_allele'] = None
         yield variant
 
+
 def assign_effect_type(variants):
     for variant in variants:
         if 'is_recessive' not in variant and 'is_dominant' not in variant:
@@ -57,11 +91,16 @@ def assign_effect_type(variants):
 def remap_harmonised(variants, harmonised: bool):
     for variant in variants:
         if harmonised:
-            variant['chr_name'] = variant['hm_chr']
-            variant['chr_position'] = variant['hm_pos']
+            # if harmonised data are available, always overwrite
+            if variant['hm_chr']:
+                variant['chr_name'] = variant['hm_chr']
+
+            if variant['hm_pos']:
+                variant['chr_position'] = variant['hm_pos']
 
             if 'hm_inferOtherAllele' in variant and variant.get('other_allele') is None:
-                logger.debug("Replacing missing other_allele with inferred other allele")
+                logger.debug(
+                    "Replacing missing other_allele with inferred other allele")
                 variant['other_allele'] = variant['hm_inferOtherAllele']
 
             yield variant
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index 4122022..0d20df2 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -73,6 +73,7 @@ def from_path(cls, path: pathlib.Path):
     def read_variants(path, fields, start_line, name: str):
         open_function = auto_open(path)
         with open_function(path, mode='rt') as f:
+            row_nr = 0 # row_nr
             for _ in range(start_line + 1):
                 # skip header
                 next(f)
@@ -83,12 +84,13 @@ def read_variants(path, fields, start_line, name: str):
                     break
 
                 csv_reader = csv.reader(batch, delimiter='\t')
-                for i, row in enumerate(csv_reader):
-                    variant = dict(zip(fields, row)) | {'accession': name, "row_nr": i }
+                for row in csv_reader:
+                    variant = dict(zip(fields, row)) | {'accession': name, "row_nr": row_nr }
                     keys = ["chr_name", "chr_position", "effect_allele", "other_allele",
                             "effect_weight", "hm_chr", "hm_pos", "hm_inferOtherAllele",
                             "is_dominant", "is_recessive", "accession", "row_nr"]
                     yield {k: variant[k] for k in keys if k in variant}
+                    row_nr += 1
 
 
 def get_columns(path) -> tuple[int, list[str]]:

From bbfffbce90770aac40583e42a80b3f79f67e7281 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Wed, 1 Nov 2023 16:12:22 +0000
Subject: [PATCH 06/40] add liftover

---
 .../scorefile/combine_scorefiles.py           | 131 +++++++++++-----
 pgscatalog_utils/scorefile/config.py          |   8 +-
 pgscatalog_utils/scorefile/header.py          |  20 +--
 pgscatalog_utils/scorefile/liftover.py        |  64 ++++++++
 pgscatalog_utils/scorefile/qc.py              | 145 ++++++++++++------
 pgscatalog_utils/scorefile/scoringfile.py     |  64 +++++---
 pgscatalog_utils/scorefile/write.py           |  24 ++-
 7 files changed, 333 insertions(+), 123 deletions(-)
 create mode 100644 pgscatalog_utils/scorefile/liftover.py

diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py
index 6650459..e3ce606 100644
--- a/pgscatalog_utils/scorefile/combine_scorefiles.py
+++ b/pgscatalog_utils/scorefile/combine_scorefiles.py
@@ -7,6 +7,7 @@
 from pgscatalog_utils.config import set_logging_level
 from pgscatalog_utils.download.GenomeBuild import GenomeBuild
 from pgscatalog_utils.scorefile.config import Config
+from pgscatalog_utils.scorefile.liftover import create_liftover
 from pgscatalog_utils.scorefile.scoringfile import ScoringFile
 from pgscatalog_utils.scorefile.write import write_combined
 
@@ -19,6 +20,14 @@ def combine_scorefiles():
 
     Config.threads = args.threads
     Config.batch_size = 20000
+    Config.drop_missing = args.drop_missing
+    Config.target_build = GenomeBuild.from_string(args.target_build)
+    Config.liftover = args.liftover
+    Config.min_lift = args.min_lift
+
+    if args.chain_dir:
+        Config.chain_dir = args.chain_dir
+        Config.lo = create_liftover()
 
     paths: list[str] = list(set(args.scorefiles))  # unique paths only
     logger.debug(f"Input scorefiles: {paths}")
@@ -28,6 +37,7 @@ def combine_scorefiles():
 
     target_build = GenomeBuild.from_string(args.target_build)
     bad_builds = [x.accession for x in sfs if x.genome_build != target_build]
+
     for bad_file in bad_builds:
         logger.critical(f"{bad_file} doesn't match {target_build}, can't combine")
         raise Exception
@@ -42,7 +52,8 @@ def combine_scorefiles():
 
 
 def _description_text() -> str:
-    return textwrap.dedent('''\
+    return textwrap.dedent(
+        """\
     Combine multiple scoring files in PGS Catalog format (see https://www.pgscatalog.org/downloads/ 
     for details) to a 'long' table of columns needed for variant matching and subsequent calculation. 
 
@@ -51,50 +62,96 @@ def _description_text() -> str:
     unharmonised and harmonised PGS Catalog data. By default all variants are output (including 
     positions with duplicated data [often caused by rsID/liftover collions across builds]) and 
     variants with missing positions. 
-    ''')
+    """
+    )
 
 
 def _epilog_text() -> str:
-    return textwrap.dedent('''\
+    return textwrap.dedent(
+        """\
     The long table is used to simplify intersecting variants in target genotyping datasets 
     and the scoring files with the match_variants program.
-    ''')
+    """
+    )
 
 
 def _parse_args(args=None) -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description=_description_text(),
-                                     epilog=_epilog_text(),
-                                     formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument('-s', '--scorefiles', dest='scorefiles', nargs='+',
-                        help='<Required> Scorefile path (wildcard * is OK)',
-                        required=True)
-    parser.add_argument('--liftover', dest='liftover',
-                        help='<Optional> Convert scoring file variants to target genome build?',
-                        action='store_true')
-    parser.add_argument('-t', '--target_build', dest='target_build',
-                        choices=['GRCh37', 'GRCh38'],
-                        help='<Required> Build of target genome',
-                        required=True)
-    parser.add_argument('-c', '--chain_dir', dest='chain_dir',
-                        help='Path to directory containing chain files',
-                        required="--liftover" in sys.argv)
-    parser.add_argument('-m', '--min_lift', dest='min_lift',
-                        help='<Optional> If liftover, minimum proportion of variants lifted over',
-                        required="--liftover" in sys.argv, default=0.95, type=float)
-    parser.add_argument('--threads', dest='threads', required=False,
-                        default=1, type=int)
-    parser.add_argument('--drop_missing', dest='drop_missing', action='store_true',
-                        help='<Optional> Drop variants with missing information (chr/pos) and '
-                             'non-standard alleles (e.g. HLA=P/N) from the output file.')
-    parser.add_argument('-o', '--outfile', dest='outfile', required=True,
-                        default='combined.txt',
-                        help='<Required> Output path to combined long scorefile '
-                             '[ will compress output if filename ends with .gz ]')
-    parser.add_argument('-l', '--logfile', dest='logfile', default='log_combined.json',
-                        help='<Required> Name for the log file (score metadata) for combined scores.'
-                             '[ will write to identical directory as combined scorefile]')
-    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
-                        help='<Optional> Extra logging information')
+    parser = argparse.ArgumentParser(
+        description=_description_text(),
+        epilog=_epilog_text(),
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "-s",
+        "--scorefiles",
+        dest="scorefiles",
+        nargs="+",
+        help="<Required> Scorefile path (wildcard * is OK)",
+        required=True,
+    )
+    parser.add_argument(
+        "--liftover",
+        dest="liftover",
+        help="<Optional> Convert scoring file variants to target genome build?",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-t",
+        "--target_build",
+        dest="target_build",
+        choices=["GRCh37", "GRCh38"],
+        help="<Required> Build of target genome",
+        required=True,
+    )
+    parser.add_argument(
+        "-c",
+        "--chain_dir",
+        dest="chain_dir",
+        help="Path to directory containing chain files",
+        required="--liftover" in sys.argv,
+    )
+    parser.add_argument(
+        "-m",
+        "--min_lift",
+        dest="min_lift",
+        help="<Optional> If liftover, minimum proportion of variants lifted over",
+        default=0.95,
+        type=float,
+    )
+    parser.add_argument(
+        "--threads", dest="threads", required=False, default=1, type=int
+    )
+    parser.add_argument(
+        "--drop_missing",
+        dest="drop_missing",
+        action="store_true",
+        help="<Optional> Drop variants with missing information (chr/pos) and "
+        "non-standard alleles (e.g. HLA=P/N) from the output file.",
+    )
+    parser.add_argument(
+        "-o",
+        "--outfile",
+        dest="outfile",
+        required=True,
+        default="combined.txt",
+        help="<Required> Output path to combined long scorefile "
+        "[ will compress output if filename ends with .gz ]",
+    )
+    parser.add_argument(
+        "-l",
+        "--logfile",
+        dest="logfile",
+        default="log_combined.json",
+        help="<Required> Name for the log file (score metadata) for combined scores."
+        "[ will write to identical directory as combined scorefile]",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        dest="verbose",
+        action="store_true",
+        help="<Optional> Extra logging information",
+    )
     return parser.parse_args(args)
 
 
diff --git a/pgscatalog_utils/scorefile/config.py b/pgscatalog_utils/scorefile/config.py
index 10bc5b3..2725110 100644
--- a/pgscatalog_utils/scorefile/config.py
+++ b/pgscatalog_utils/scorefile/config.py
@@ -1,11 +1,17 @@
 from dataclasses import dataclass
 
+import pyliftover
+
+from pgscatalog_utils.download.GenomeBuild import GenomeBuild
+
 
 @dataclass
 class Config:
     threads: int
     drop_missing: bool
     liftover: bool
+    lo: pyliftover.liftover
     chain_dir: str
     min_lift: float
-    batch_size: int
\ No newline at end of file
+    batch_size: int
+    target_build: GenomeBuild
diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py
index 78259c6..3a5e889 100644
--- a/pgscatalog_utils/scorefile/header.py
+++ b/pgscatalog_utils/scorefile/header.py
@@ -29,7 +29,7 @@ def __post_init__(self):
         if self.HmPOS_build:
             self.HmPOS_build = GenomeBuild.from_string(self.HmPOS_build)
 
-        if self.format_version != '2.0':
+        if self.format_version != "2.0":
             raise Exception("Only support v2 format")
 
     @classmethod
@@ -40,9 +40,9 @@ def from_path(cls, path: pathlib.Path):
         header_dict = {k: raw_header[k] for k in raw_header.keys() & keep_keys}
         # ... so we can unpack the dict into a dataclass
 
-        if len(header_dict) > 1 and 'HmPOS_build' not in header_dict:
+        if len(header_dict) > 1 and "HmPOS_build" not in header_dict:
             # working with pgs catalog formatted header but unharmonised data
-            header_dict['HmPOS_build'] = None
+            header_dict["HmPOS_build"] = None
 
         if header_dict:
             return ScoringFileHeader(**header_dict)
@@ -54,7 +54,7 @@ def from_path(cls, path: pathlib.Path):
 def raw_header_to_dict(header):
     d = {}
     for item in header:
-        key, value = item.split('=')
+        key, value = item.split("=")
         d[key[1:]] = value  # drop # character from key
     return d
 
@@ -62,14 +62,14 @@ def raw_header_to_dict(header):
 def read_header(path: pathlib.Path):
     """Parses the header of a PGS Catalog format scorefile into a dictionary"""
     open_function = auto_open(path)
-    with open_function(path, 'rt') as f:
+    with open_function(path, "rt") as f:
         yield from _gen_header_lines(f)
 
 
 def _gen_header_lines(f):
     for line in f:
-        if line.startswith('#'):
-            if '=' in line:
+        if line.startswith("#"):
+            if "=" in line:
                 yield line.strip()
         else:
             # stop reading lines
@@ -77,8 +77,8 @@ def _gen_header_lines(f):
 
 
 def auto_open(filepath):
-    with open(filepath, 'rb') as test_f:
-        if test_f.read(2) == b'\x1f\x8b':
+    with open(filepath, "rb") as test_f:
+        if test_f.read(2) == b"\x1f\x8b":
             gzipped = True
         else:
             gzipped = False
@@ -88,4 +88,4 @@ def auto_open(filepath):
     elif gzipped:
         return gzip.open
     elif not gzipped:
-        return open
\ No newline at end of file
+        return open
diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py
new file mode 100644
index 0000000..9924916
--- /dev/null
+++ b/pgscatalog_utils/scorefile/liftover.py
@@ -0,0 +1,64 @@
+import logging
+import os
+
+import pyliftover
+
+from pgscatalog_utils.download.GenomeBuild import GenomeBuild
+from pgscatalog_utils.scorefile.config import Config
+
+logger = logging.getLogger(__name__)
+
+
+def liftover(
+    variants, harmonised: bool, current_build: GenomeBuild, target_build: GenomeBuild
+):
+    if harmonised:
+        skip_lo = True
+    elif target_build == current_build:
+        skip_lo = True
+    else:
+        skip_lo = False
+
+    if skip_lo:
+        for variant in variants:
+            yield variant
+    else:
+        if current_build == GenomeBuild.GRCh37 and target_build == GenomeBuild.GRCh38:
+            lo: pyliftover.LiftOver = Config.lo["hg19hg38"]
+        elif current_build == GenomeBuild.GRCh38 and target_build == GenomeBuild.GRCh37:
+            lo: pyliftover.LiftOver = Config.lo["hg19hg38"]
+        else:
+            raise Exception("Can't get pyliftover object")
+
+        n_lifted = 0
+        n = 0
+
+        for variant in variants:
+            chrom = "chr" + variant["chr_name"]
+            pos = int(variant["chr_position"]) - 1  # VCF -> 1 based, UCSC -> 0 based
+            lifted = lo.convert_coordinate(chrom, pos)
+            if lifted:
+                variant["chr_name"] = lifted[0][0][3:].split("_")[0]
+                variant["chr_position"] = lifted[0][1] + 1  # reverse 0 indexing
+                n_lifted += 1
+            yield variant
+            n += 1
+
+        if (n_lifted / n) < Config.min_lift:
+            logger.error(f"Liftover failed")
+            raise Exception
+        else:
+            logger.info("Liftover successful")
+
+
+def create_liftover() -> dict["str" : pyliftover.LiftOver]:
+    """Create LiftOver objects that can remap genomic coordinates"""
+    chain_dir: str = Config.chain_dir
+    builds: list[str] = ["hg19hg38", "hg38hg19"]
+    chains: list[str] = [
+        os.path.join(chain_dir, x)
+        for x in ["hg19ToHg38.over.chain.gz", "hg38ToHg19.over.chain.gz"]
+    ]
+    lo: list[pyliftover.LiftOver] = [pyliftover.LiftOver(x) for x in chains]
+    logger.debug("Chain files loaded for liftover")
+    return dict(zip(builds, lo))
diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index a146dcc..44d907f 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -1,57 +1,87 @@
 import logging
+import typing
+
+from pgscatalog_utils.scorefile.config import Config
+from pgscatalog_utils.scorefile.header import ScoringFileHeader
+from pgscatalog_utils.scorefile.liftover import liftover
 
 logger = logging.getLogger(__name__)
 
 
-def quality_control(variants, harmonised):
+def quality_control(variants, header: ScoringFileHeader, harmonised: bool):
     variants = remap_harmonised(variants, harmonised)
-    variants = drop_hla(variants)
+
+    if Config.drop_missing:
+        variants = drop_hla(variants)
+
     variants = assign_effect_type(variants)
     variants = check_effect_weight(variants)
     variants = assign_other_allele(variants)
     variants = check_duplicates(variants)
+
+    if Config.liftover:
+        variants = liftover(
+            variants,
+            harmonised=harmonised,
+            current_build=header.genome_build,
+            target_build=Config.target_build,
+        )
+
     return variants
 
 
 def check_duplicates(variants):
-    seen_ids = set()
-    current_accession = None
-
+    seen_ids: dict = {}
+    current_accession: typing.Union[str, None] = None
+    n_duplicates: int = 0
+    n_variants: int = 0
     for variant in variants:
-        accession = variant['accession']
+        accession: str = variant["accession"]
 
         if accession != current_accession:
-            seen_ids = set()
+            seen_ids = {}
             current_accession = accession
 
         # None other allele -> empty string
-        id = ":".join([str(variant[k] or "") for k in
-                       ['chr_name', 'chr_position', 'effect_allele', 'other_allele']])
+        id: str = ":".join(
+            [
+                str(variant[k] or "")
+                for k in ["chr_name", "chr_position", "effect_allele", "other_allele"]
+            ]
+        )
 
         if id in seen_ids:
-            logger.warning(
-                f"Duplicate variant found: {variant['accession']}: {id} {variant['row_nr']}")
-            variant['is_duplicated'] = True
+            variant["is_duplicated"] = True
+            n_duplicates += 1
         else:
-            variant['is_duplicated'] = False
+            variant["is_duplicated"] = False
 
-        seen_ids.add(id)
+        seen_ids[id] = True
 
         yield variant
+        n_variants += 1
+
+    if n_duplicates > 0:
+        logger.warning(
+            f"{n_duplicates} of {n_variants} variants are duplicated in: {current_accession}"
+        )
 
 
 def drop_hla(variants):
+    n_dropped = 0
     for variant in variants:
-        if variant['effect_allele'] != 'P' or variant['effect_allele'] != 'N':
+        if variant["effect_allele"] != "P" or variant["effect_allele"] != "N":
             yield variant
         else:
-            logger.warning("HLA alleles detected and dropped")
+            n_dropped += 1
+
+    logger.warning(f"{n_dropped} HLA alleles detected and dropped")
 
 
 def check_effect_weight(variants):
     for variant in variants:
         try:
-            variant['effect_weight'] = float(variant['effect_weight'])
+            variant["effect_weight"] = float(variant["effect_weight"])
         except ValueError:
             logger.critical(f"{variant} has bad effect weight")
             raise ValueError
@@ -59,29 +89,35 @@ def check_effect_weight(variants):
 
 
 def assign_other_allele(variants):
+    n_dropped = 0
     for variant in variants:
-        if 'other_allele' in variant:
-            if '/' in variant['other_allele']:
+        if "other_allele" in variant:
+            if "/" in variant["other_allele"]:
                 # drop multiple other alleles
-                variant['other_allele'] = None
+                n_dropped += 1
+                variant["other_allele"] = None
+        else:
+            variant["other_allele"] = None
 
-        if 'other_allele' not in variant:
-            variant['other_allele'] = None
         yield variant
 
+    if n_dropped > 0:
+        logger.warning(f"Multiple other_alleles detected in {n_dropped} variants")
+        logger.warning("Other allele for these variants is set to missing")
+
 
 def assign_effect_type(variants):
     for variant in variants:
-        if 'is_recessive' not in variant and 'is_dominant' not in variant:
-            variant['effect_type'] = 'additive'
+        if "is_recessive" not in variant and "is_dominant" not in variant:
+            variant["effect_type"] = "additive"
 
-        if 'is_recessive' in variant or 'is_dominant' in variant:
+        if "is_recessive" in variant or "is_dominant" in variant:
             logger.info("Recessive or dominant variant detected")
-            if variant['is_recessive']:
-                variant['effect_type'] = 'recessive'
-            elif variant['is_dominant']:
-                variant['effect_type'] = 'dominant'
-            elif variant['is_recessive'] and variant['is_dominant']:
+            if variant["is_recessive"]:
+                variant["effect_type"] = "recessive"
+            elif variant["is_dominant"]:
+                variant["effect_type"] = "dominant"
+            elif variant["is_recessive"] and variant["is_dominant"]:
                 logger.critical(f"Bad effect type setting: {variant}")
                 raise Exception
 
@@ -89,20 +125,37 @@ def assign_effect_type(variants):
 
 
 def remap_harmonised(variants, harmonised: bool):
-    for variant in variants:
-        if harmonised:
-            # if harmonised data are available, always overwrite
-            if variant['hm_chr']:
-                variant['chr_name'] = variant['hm_chr']
-
-            if variant['hm_pos']:
-                variant['chr_position'] = variant['hm_pos']
-
-            if 'hm_inferOtherAllele' in variant and variant.get('other_allele') is None:
-                logger.debug(
-                    "Replacing missing other_allele with inferred other allele")
-                variant['other_allele'] = variant['hm_inferOtherAllele']
-
-            yield variant
-        else:
+    n_bad = 0
+    if harmonised:
+        for variant in variants:
+            if variant["hm_chr"]:
+                variant["chr_name"] = variant["hm_chr"]
+
+            if variant["hm_pos"]:
+                variant["chr_position"] = variant["hm_pos"]
+
+            if "hm_inferOtherAllele" in variant and variant.get("other_allele") is None:
+                variant["other_allele"] = variant["hm_inferOtherAllele"]
+
+            if (
+                "chr_name" in variant
+                and "chr_position" in variant
+                and "effect_weight" in variant
+            ):
+                yield variant
+            elif Config.drop_missing:
+                continue
+                # (don't yield anything, filtering out missing variants)
+            else:
+                # assume a bad harmonisation with no genomic coordinates
+                # these will get labelled as duplicates eventually (probably)
+                variant["chr_name"] = None
+                variant["chr_position"] = None
+                yield variant
+                n_bad += 1
+    else:
+        for variant in variants:
             yield variant
+
+    if n_bad > 1:
+        logger.warning(f"{n_bad} variants failed harmonisation")
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index 0d20df2..e1aea8f 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -29,10 +29,11 @@ class ScoringFile:
     def __post_init__(self):
         if self.header.HmPOS_build:
             logger.info(
-                f"{self.path} harmonised data detected: {self.header.HmPOS_build}")
+                f"{self.path} harmonised data detected: {self.header.HmPOS_build}"
+            )
             self.genome_build = self.header.HmPOS_build
 
-        mandatory_columns = {'chr_name', 'effect_allele', 'effect_weight'}
+        mandatory_columns = {"chr_name", "effect_allele", "effect_weight"}
         if not mandatory_columns.issubset(self.fields) not in self.fields:
             err_msg = f"{self.path} missing fields"
             raise Exception(err_msg)
@@ -40,7 +41,7 @@ def __post_init__(self):
     @classmethod
     def from_path(cls, path: pathlib.Path):
         header = ScoringFileHeader.from_path(path)
-        name = os.path.basename(path).split('.')[0]
+        name = os.path.basename(path).split(".")[0]
         if header:
             if header.HmPOS_build:
                 harmonised = True
@@ -56,24 +57,29 @@ def from_path(cls, path: pathlib.Path):
 
         # generate variants (a list of dicts, one for each variants)
         logger.info(f"Lazily reading variants from {path}")
-        variants = ScoringFile.read_variants(path=path, start_line=start_line,
-                                             fields=cols, name=name)
+        variants = ScoringFile.read_variants(
+            path=path, start_line=start_line, fields=cols, name=name
+        )
 
         # note: these generator expressions aren't doing a bunch of iterations
         # it's just a data processing pipeline
-        variants = quality_control(variants, harmonised)
-
-        return cls(path=path, header=header, genome_build=genome_build,
-                   harmonised=harmonised,
-                   fields=cols,
-                   variants=variants,
-                   accession=name)
+        variants = quality_control(variants, header=header, harmonised=harmonised)
+
+        return cls(
+            path=path,
+            header=header,
+            genome_build=genome_build,
+            harmonised=harmonised,
+            fields=cols,
+            variants=variants,
+            accession=name,
+        )
 
     @staticmethod
     def read_variants(path, fields, start_line, name: str):
         open_function = auto_open(path)
-        with open_function(path, mode='rt') as f:
-            row_nr = 0 # row_nr
+        with open_function(path, mode="rt") as f:
+            row_nr = 0  # row_nr
             for _ in range(start_line + 1):
                 # skip header
                 next(f)
@@ -83,20 +89,34 @@ def read_variants(path, fields, start_line, name: str):
                 if not batch:
                     break
 
-                csv_reader = csv.reader(batch, delimiter='\t')
+                csv_reader = csv.reader(batch, delimiter="\t")
                 for row in csv_reader:
-                    variant = dict(zip(fields, row)) | {'accession': name, "row_nr": row_nr }
-                    keys = ["chr_name", "chr_position", "effect_allele", "other_allele",
-                            "effect_weight", "hm_chr", "hm_pos", "hm_inferOtherAllele",
-                            "is_dominant", "is_recessive", "accession", "row_nr"]
+                    variant = dict(zip(fields, row)) | {
+                        "accession": name,
+                        "row_nr": row_nr,
+                    }
+                    keys = [
+                        "chr_name",
+                        "chr_position",
+                        "effect_allele",
+                        "other_allele",
+                        "effect_weight",
+                        "hm_chr",
+                        "hm_pos",
+                        "hm_inferOtherAllele",
+                        "is_dominant",
+                        "is_recessive",
+                        "accession",
+                        "row_nr",
+                    ]
                     yield {k: variant[k] for k in keys if k in variant}
                     row_nr += 1
 
 
 def get_columns(path) -> tuple[int, list[str]]:
     open_function = auto_open(path)
-    with open_function(path, mode='rt') as f:
+    with open_function(path, mode="rt") as f:
         for i, line in enumerate(f):
-            if line.startswith('#'):
+            if line.startswith("#"):
                 continue
-            return i, line.strip().split('\t')
+            return i, line.strip().split("\t")
diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py
index 9345a87..8a31fb6 100644
--- a/pgscatalog_utils/scorefile/write.py
+++ b/pgscatalog_utils/scorefile/write.py
@@ -19,17 +19,27 @@ def write_combined(scoring_files: list[ScoringFile], out_path: str):
         open_function = functools.partial(gzip.open, compresslevel=6)
     elif Config.threads > 1:
         logger.info("Writing with pgzip (fast)")
-        open_function = functools.partial(pgzip.open, compresslevel=6,
-                                          thread=Config.threads, blocksize=2 * 10 ** 8)
+        open_function = functools.partial(
+            pgzip.open, compresslevel=6, thread=Config.threads, blocksize=2 * 10**8
+        )
     else:
         logger.info("Writing text file (fast)")
         open_function = open
 
-    with open_function(out_path, mode='wt') as f:
-        fieldnames = ["chr_name", "chr_position", "effect_allele",
-                      "other_allele", "effect_weight", "effect_type", "accession", "row_nr"]
-        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t',
-                                extrasaction='ignore')
+    with open_function(out_path, mode="wt") as f:
+        fieldnames = [
+            "chr_name",
+            "chr_position",
+            "effect_allele",
+            "other_allele",
+            "effect_weight",
+            "effect_type",
+            "accession",
+            "row_nr",
+        ]
+        writer = csv.DictWriter(
+            f, fieldnames=fieldnames, delimiter="\t", extrasaction="ignore"
+        )
         writer.writeheader()
 
         # write out in batches for compression efficiency and speed

From 43046c7f1141bd63a52ef37aae75d1cf2eda7f26 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Wed, 1 Nov 2023 16:20:27 +0000
Subject: [PATCH 07/40] update dependencies and set up pre-commit

---
 .pre-commit-config.yaml |   6 +++
 poetry.lock             | 110 ++++++++++++++++++++++++++++++++++++++--
 pyproject.toml          |   1 +
 3 files changed, 114 insertions(+), 3 deletions(-)
 create mode 100644 .pre-commit-config.yaml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..6e4ae2c
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,6 @@
+repos:
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  # Ruff version.
+  rev: v0.1.3
+  hooks:
+    - id: ruff-format
\ No newline at end of file
diff --git a/poetry.lock b/poetry.lock
index dac6b3d..4c1fda4 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
 
 [[package]]
 name = "anyio"
@@ -297,6 +297,17 @@ files = [
 [package.dependencies]
 pycparser = "*"
 
+[[package]]
+name = "cfgv"
+version = "3.4.0"
+description = "Validate configuration and produce human readable error messages."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
+    {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
+]
+
 [[package]]
 name = "charset-normalizer"
 version = "3.2.0"
@@ -671,6 +682,17 @@ files = [
     {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
 ]
 
+[[package]]
+name = "distlib"
+version = "0.3.7"
+description = "Distribution utilities"
+optional = false
+python-versions = "*"
+files = [
+    {file = "distlib-0.3.7-py2.py3-none-any.whl", hash = "sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057"},
+    {file = "distlib-0.3.7.tar.gz", hash = "sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8"},
+]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.1.3"
@@ -713,6 +735,22 @@ files = [
 [package.extras]
 devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
 
+[[package]]
+name = "filelock"
+version = "3.13.1"
+description = "A platform independent file lock."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "filelock-3.13.1-py3-none-any.whl", hash = "sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c"},
+    {file = "filelock-3.13.1.tar.gz", hash = "sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e"},
+]
+
+[package.extras]
+docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.24)"]
+testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"]
+typing = ["typing-extensions (>=4.8)"]
+
 [[package]]
 name = "fonttools"
 version = "4.42.1"
@@ -781,6 +819,20 @@ files = [
     {file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
 ]
 
+[[package]]
+name = "identify"
+version = "2.5.31"
+description = "File identification library for Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "identify-2.5.31-py2.py3-none-any.whl", hash = "sha256:90199cb9e7bd3c5407a9b7e81b4abec4bb9d249991c79439ec8af740afc6293d"},
+    {file = "identify-2.5.31.tar.gz", hash = "sha256:7736b3c7a28233637e3c36550646fc6389bedd74ae84cb788200cc8e2dd60b75"},
+]
+
+[package.extras]
+license = ["ukkonen"]
+
 [[package]]
 name = "idna"
 version = "3.4"
@@ -1737,6 +1789,20 @@ files = [
     {file = "nest_asyncio-1.5.8.tar.gz", hash = "sha256:25aa2ca0d2a5b5531956b9e273b45cf664cae2b145101d73b86b199978d48fdb"},
 ]
 
+[[package]]
+name = "nodeenv"
+version = "1.8.0"
+description = "Node.js virtual environment builder"
+optional = false
+python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*"
+files = [
+    {file = "nodeenv-1.8.0-py2.py3-none-any.whl", hash = "sha256:df865724bb3c3adc86b3876fa209771517b0cfe596beff01a92700e0e8be4cec"},
+    {file = "nodeenv-1.8.0.tar.gz", hash = "sha256:d51e0c37e64fbf47d017feac3145cdbb58836d7eee8c6f6d3b6880c5456227d2"},
+]
+
+[package.dependencies]
+setuptools = "*"
+
 [[package]]
 name = "notebook"
 version = "7.0.4"
@@ -1871,8 +1937,8 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.21.0", markers = "python_version >= \"3.10\""},
     {version = ">=1.23.2", markers = "python_version >= \"3.11\""},
+    {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""},
 ]
 python-dateutil = ">=2.8.1"
 pytz = ">=2020.1"
@@ -2082,6 +2148,24 @@ pyarrow = ["pyarrow (>=4.0.0)"]
 timezone = ["backports.zoneinfo", "tzdata"]
 xlsx2csv = ["xlsx2csv (>=0.8.0)"]
 
+[[package]]
+name = "pre-commit"
+version = "3.5.0"
+description = "A framework for managing and maintaining multi-language pre-commit hooks."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pre_commit-3.5.0-py2.py3-none-any.whl", hash = "sha256:841dc9aef25daba9a0238cd27984041fa0467b4199fc4852e27950664919f660"},
+    {file = "pre_commit-3.5.0.tar.gz", hash = "sha256:5804465c675b659b0862f07907f96295d490822a450c4c40e747d0b1c6ebcb32"},
+]
+
+[package.dependencies]
+cfgv = ">=2.0.0"
+identify = ">=1.0.0"
+nodeenv = ">=0.11.1"
+pyyaml = ">=5.1"
+virtualenv = ">=20.10.0"
+
 [[package]]
 name = "prometheus-client"
 version = "0.17.1"
@@ -3031,6 +3115,26 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.
 socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
 zstd = ["zstandard (>=0.18.0)"]
 
+[[package]]
+name = "virtualenv"
+version = "20.24.6"
+description = "Virtual Python Environment builder"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "virtualenv-20.24.6-py3-none-any.whl", hash = "sha256:520d056652454c5098a00c0f073611ccbea4c79089331f60bf9d7ba247bb7381"},
+    {file = "virtualenv-20.24.6.tar.gz", hash = "sha256:02ece4f56fbf939dbbc33c0715159951d6bf14aaf5457b092e4548e1382455af"},
+]
+
+[package.dependencies]
+distlib = ">=0.3.7,<1"
+filelock = ">=3.12.2,<4"
+platformdirs = ">=3.9.1,<4"
+
+[package.extras]
+docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
+test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
+
 [[package]]
 name = "wcwidth"
 version = "0.2.6"
@@ -3157,4 +3261,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "2859497817dfd52518f4fa2ba527c716a5bb5e4354175f791b314e80a033edf2"
+content-hash = "b9985d182b0c350a39e12aeae274f2e809d1454f47b58b2d2a5fe8b8264418b7"
diff --git a/pyproject.toml b/pyproject.toml
index 9d8a99c..15a3b9b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,7 @@ polars = "^0.15.0"
 zstandard = "^0.18.0"
 pgzip = "^0.3.2"
 scikit-learn = "^1.2.1"
+pre-commit = "^3.5.0"
 
 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"

From a947a39a87f801eb116bd33b20ea5a96261052f5 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Wed, 1 Nov 2023 16:27:20 +0000
Subject: [PATCH 08/40] complain when linting fails

---
 .pre-commit-config.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6e4ae2c..98d8851 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,4 +3,6 @@ repos:
   # Ruff version.
   rev: v0.1.3
   hooks:
-    - id: ruff-format
\ No newline at end of file
+    - id: ruff-format
+    - id: ruff
+      args: [--fix, --exit-non-zero-on-fix]    
\ No newline at end of file

From 32ef39c3befb3b3cb0b9986024ee7c5467b88a37 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Wed, 1 Nov 2023 16:30:03 +0000
Subject: [PATCH 09/40] fix linting

---
 .pre-commit-config.yaml                | 4 ++--
 pgscatalog_utils/scorefile/liftover.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 98d8851..f7d0c74 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,6 +3,6 @@ repos:
   # Ruff version.
   rev: v0.1.3
   hooks:
-    - id: ruff-format
     - id: ruff
-      args: [--fix, --exit-non-zero-on-fix]    
\ No newline at end of file
+      args: [--fix, --exit-non-zero-on-fix]  
+    - id: ruff-format
diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py
index 9924916..7e35b23 100644
--- a/pgscatalog_utils/scorefile/liftover.py
+++ b/pgscatalog_utils/scorefile/liftover.py
@@ -45,7 +45,7 @@ def liftover(
             n += 1
 
         if (n_lifted / n) < Config.min_lift:
-            logger.error(f"Liftover failed")
+            logger.error("Liftover failed")
             raise Exception
         else:
             logger.info("Liftover successful")

From eb9d362b788b7095d251c2ee668a93f642d453e7 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Thu, 2 Nov 2023 12:30:17 +0000
Subject: [PATCH 10/40] support wide files

---
 pgscatalog_utils/scorefile/header.py      | 17 ++---
 pgscatalog_utils/scorefile/qc.py          |  9 ++-
 pgscatalog_utils/scorefile/scoringfile.py | 93 ++++++++++++++++-------
 3 files changed, 82 insertions(+), 37 deletions(-)

diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py
index 3a5e889..0397754 100644
--- a/pgscatalog_utils/scorefile/header.py
+++ b/pgscatalog_utils/scorefile/header.py
@@ -24,31 +24,30 @@ class ScoringFileHeader:
     citation: str
 
     def __post_init__(self):
-        self.variants_number = int(self.variants_number)
+        if self.variants_number:
+            self.variants_number = int(self.variants_number)
+
         self.genome_build = GenomeBuild.from_string(self.genome_build)
         if self.HmPOS_build:
             self.HmPOS_build = GenomeBuild.from_string(self.HmPOS_build)
 
-        if self.format_version != "2.0":
-            raise Exception("Only support v2 format")
-
     @classmethod
     def from_path(cls, path: pathlib.Path):
         raw_header: dict = raw_header_to_dict(read_header(path))
-        # only keep keys needed by class (intersect)
+        # only keep keys needed by class but support partial headers with None values
         keep_keys = ScoringFileHeader.__annotations__.keys()
-        header_dict = {k: raw_header[k] for k in raw_header.keys() & keep_keys}
+        header_dict = {k: raw_header.get(k) for k in keep_keys}
         # ... so we can unpack the dict into a dataclass
 
-        if len(header_dict) > 1 and "HmPOS_build" not in header_dict:
+        if "HmPOS_build" not in header_dict:
             # working with pgs catalog formatted header but unharmonised data
             header_dict["HmPOS_build"] = None
 
-        if header_dict:
+        if not all([v is None for _, v in header_dict.items()]):
             return ScoringFileHeader(**header_dict)
         else:
             # no header available
-            return None
+            raise Exception("No header detected in scoring file")
 
 
 def raw_header_to_dict(header):
diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index 44d907f..3513903 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -8,7 +8,7 @@
 logger = logging.getLogger(__name__)
 
 
-def quality_control(variants, header: ScoringFileHeader, harmonised: bool):
+def quality_control(variants, header: ScoringFileHeader, harmonised: bool, wide: bool):
     variants = remap_harmonised(variants, harmonised)
 
     if Config.drop_missing:
@@ -17,6 +17,13 @@ def quality_control(variants, header: ScoringFileHeader, harmonised: bool):
     variants = assign_effect_type(variants)
     variants = check_effect_weight(variants)
     variants = assign_other_allele(variants)
+
+    if wide:
+        # wide data must be sorted because:
+        # - check_duplicates requires sorted input
+        # - output would be unsorted, which looks a little bit messy
+        variants = (x for x in sorted(variants, key=lambda x: x["accession"]))
+
     variants = check_duplicates(variants)
 
     if Config.liftover:
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index e1aea8f..0908576 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -54,16 +54,18 @@ def from_path(cls, path: pathlib.Path):
             genome_build = None
 
         start_line, cols = get_columns(path)
+        is_wide = detect_wide(cols)
 
-        # generate variants (a list of dicts, one for each variants)
         logger.info(f"Lazily reading variants from {path}")
         variants = ScoringFile.read_variants(
-            path=path, start_line=start_line, fields=cols, name=name
+            path=path, start_line=start_line, fields=cols, name=name, is_wide=is_wide
         )
 
         # note: these generator expressions aren't doing a bunch of iterations
         # it's just a data processing pipeline
-        variants = quality_control(variants, header=header, harmonised=harmonised)
+        variants = quality_control(
+            variants, header=header, harmonised=harmonised, wide=is_wide
+        )
 
         return cls(
             path=path,
@@ -76,10 +78,10 @@ def from_path(cls, path: pathlib.Path):
         )
 
     @staticmethod
-    def read_variants(path, fields, start_line, name: str):
+    def read_variants(path, fields, start_line, name: str, is_wide: bool):
         open_function = auto_open(path)
+        row_nr = 0
         with open_function(path, mode="rt") as f:
-            row_nr = 0  # row_nr
             for _ in range(start_line + 1):
                 # skip header
                 next(f)
@@ -90,27 +92,46 @@ def read_variants(path, fields, start_line, name: str):
                     break
 
                 csv_reader = csv.reader(batch, delimiter="\t")
-                for row in csv_reader:
-                    variant = dict(zip(fields, row)) | {
-                        "accession": name,
-                        "row_nr": row_nr,
-                    }
-                    keys = [
-                        "chr_name",
-                        "chr_position",
-                        "effect_allele",
-                        "other_allele",
-                        "effect_weight",
-                        "hm_chr",
-                        "hm_pos",
-                        "hm_inferOtherAllele",
-                        "is_dominant",
-                        "is_recessive",
-                        "accession",
-                        "row_nr",
-                    ]
-                    yield {k: variant[k] for k in keys if k in variant}
-                    row_nr += 1
+                yield from read_rows(csv_reader, fields, name, row_nr, is_wide)
+
+
+def read_rows(csv_reader, fields: list[str], name: str, row_nr: int, wide: bool):
+    for row in csv_reader:
+        variant = dict(zip(fields, row))
+
+        if wide:
+            ew_col_idxs: list[int] = [
+                i for i, x in enumerate(["effect_weight_" in x for x in fields]) if x
+            ]
+            for i, weight_name in zip(ew_col_idxs, [fields[i] for i in ew_col_idxs]):
+                keys = ["chr_name", "chr_position", "effect_allele", "other_allele"]
+                yield {k: variant[k] for k in keys if k in variant} | {
+                    "accession": weight_name,
+                    "row_nr": row_nr,
+                    "effect_weight": variant[weight_name],
+                }
+        else:
+            keys = [
+                "chr_name",
+                "chr_position",
+                "effect_allele",
+                "other_allele",
+                "effect_weight",
+                "hm_chr",
+                "hm_pos",
+                "hm_inferOtherAllele",
+                "is_dominant",
+                "is_recessive",
+                "accession",
+                "row_nr",
+            ]
+
+            yield {k: variant[k] for k in keys if k in variant} | {
+                "accession": name,
+                "row_nr": row_nr,
+            }
+
+        row_nr += 1
 
 
 def get_columns(path) -> tuple[int, list[str]]:
@@ -119,4 +140,22 @@ def get_columns(path) -> tuple[int, list[str]]:
         for i, line in enumerate(f):
             if line.startswith("#"):
                 continue
-            return i, line.strip().split("\t")
+            line_no, cols = i, line.strip().split("\t")
+            if len(set(cols)) != len(cols):
+                logger.critical(f"Duplicated column names: {cols}")
+                raise ValueError
+
+            return line_no, cols
+
+
+def detect_wide(cols: list[str]) -> bool:
+    """
+    Check columns to see if multiple effect weights are present. Multiple effect weights must be present in the form:
+    effect_weight_suffix1
+    effect_weight_suffix2
+    """
+    if any(["effect_weight_" in x for x in cols]):
+        logger.info("Wide scoring file detected with multiple effect weights")
+        return True
+    else:
+        return False

From 1774e01c575cf135270ec991ea1677e12f113a71 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Thu, 2 Nov 2023 15:55:08 +0000
Subject: [PATCH 11/40] add log

---
 pgscatalog_utils/download/GenomeBuild.py      | 17 ++++++-----
 .../scorefile/combine_scorefiles.py           | 11 ++++++-
 pgscatalog_utils/scorefile/header.py          | 12 ++++----
 pgscatalog_utils/scorefile/scoringfile.py     | 29 ++++++++++++++++++-
 pgscatalog_utils/scorefile/write.py           |  5 ++++
 5 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/pgscatalog_utils/download/GenomeBuild.py b/pgscatalog_utils/download/GenomeBuild.py
index 893bf97..23c8984 100644
--- a/pgscatalog_utils/download/GenomeBuild.py
+++ b/pgscatalog_utils/download/GenomeBuild.py
@@ -1,18 +1,21 @@
-from enum import Enum, auto
+from enum import Enum
 
 
 class GenomeBuild(Enum):
-    GRCh37 = auto()
-    GRCh38 = auto()
+    GRCh37 = "GRCh37"
+    GRCh38 = "GRCh38"
+
+    def __str__(self):
+        return str(self.value)
 
     @classmethod
     def from_string(cls, build):
         match build:
-            case 'GRCh37' | 'hg18':
+            case "GRCh37" | "hg18":
                 return cls(GenomeBuild.GRCh37)
-            case 'GRCh38' | 'hg19':
+            case "GRCh38" | "hg19":
                 return cls(GenomeBuild.GRCh38)
-            case 'NR':
+            case "NR":
                 return None
             case _:
-                raise Exception
\ No newline at end of file
+                raise Exception
diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py
index e3ce606..8e3bea8 100644
--- a/pgscatalog_utils/scorefile/combine_scorefiles.py
+++ b/pgscatalog_utils/scorefile/combine_scorefiles.py
@@ -1,4 +1,5 @@
 import argparse
+import json
 import logging
 import sys
 import textwrap
@@ -44,7 +45,15 @@ def combine_scorefiles():
     else:
         logger.info(f"All builds match target build {target_build}")
 
-    write_combined(sfs, args.outfile)
+    line_counts: dict[str, int] = write_combined(sfs, args.outfile)
+    # provide line counts when making the scoring files
+    log = []
+    for (k, v), sf in zip(line_counts.items(), sfs):
+        log.append(sf.generate_log(v))
+
+    with open(args.logfile, "w") as f:
+        logger.info(f"Writing log to {f.name}")
+        json.dump(log, f, indent=4)
 
     end_time = time.time()
     elapsed_time = end_time - start_time
diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py
index 0397754..e9a03e4 100644
--- a/pgscatalog_utils/scorefile/header.py
+++ b/pgscatalog_utils/scorefile/header.py
@@ -13,15 +13,17 @@
 class ScoringFileHeader:
     pgs_id: str
     pgp_id: str
-    trait_efo: str
-    trait_reported: str
-    trait_mapped: str
     pgs_name: str
     genome_build: GenomeBuild
-    HmPOS_build: GenomeBuild
     variants_number: int
-    format_version: str
+    trait_reported: str
+    trait_efo: str
+    trait_mapped: str
+    weight_type: str
     citation: str
+    HmPOS_build: GenomeBuild
+    HmPOS_date: str
+    format_version: str
 
     def __post_init__(self):
         if self.variants_number:
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index 0908576..c879b85 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -77,10 +77,36 @@ def from_path(cls, path: pathlib.Path):
             accession=name,
         )
 
+    def generate_log(self, line_count: int):
+        log = {
+            key: str(value) if value is not None else None
+            for key, value in self.header.__dict__.items()
+        }
+
+        if log["variants_number"] is None:
+            # custom scoring files might not have this information
+            log["variants_number"] = line_count
+
+        # multiple terms may be separated with a pipe
+        if log["trait_mapped"]:
+            log["trait_mapped"] = log["trait_mapped"].split("|")
+
+        if log["trait_efo"]:
+            log["trait_efo"] = log["trait_efo"].split("|")
+
+        log["columns"] = self.fields
+        log["use_liftover"] = Config.liftover
+        log["use_harmonised"] = self.harmonised
+
+        return {self.accession: log}
+
     @staticmethod
     def read_variants(path, fields, start_line, name: str, is_wide: bool):
         open_function = auto_open(path)
-        row_nr = 0
+        # row_nr and cum_batch are equivalent but
+        row_nr = 0  # important to increment in sub-generator for each line
+        cum_batch = 0  # sums batches in this function
+
         with open_function(path, mode="rt") as f:
             for _ in range(start_line + 1):
                 # skip header
@@ -88,6 +114,7 @@ def read_variants(path, fields, start_line, name: str, is_wide: bool):
 
             while True:
                 batch = list(islice(f, Config.batch_size))
+                cum_batch += len(batch)
                 if not batch:
                     break
 
diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py
index 8a31fb6..d0a32bb 100644
--- a/pgscatalog_utils/scorefile/write.py
+++ b/pgscatalog_utils/scorefile/write.py
@@ -42,6 +42,7 @@ def write_combined(scoring_files: list[ScoringFile], out_path: str):
         )
         writer.writeheader()
 
+        line_counts = {}
         # write out in batches for compression efficiency and speed
         for scoring_file in scoring_files:
             logger.info(f"Writing {scoring_file.accession} variants")
@@ -49,4 +50,8 @@ def write_combined(scoring_files: list[ScoringFile], out_path: str):
                 batch = list(islice(scoring_file.variants, Config.batch_size))
                 if not batch:
                     break
+                # calculate max row_nr now because it's when we finally generate variants
+                line_counts[scoring_file.accession] = max(x["row_nr"] for x in batch)
                 writer.writerows(batch)
+
+        return line_counts

From 7cf956291f0f76b7ebf1339a42593e89791d374d Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Fri, 3 Nov 2023 12:25:34 +0000
Subject: [PATCH 12/40] fix tests and liftover

---
 conftest.py                                   | 180 ++--
 .../scorefile/combine_scorefiles.py           |   4 +-
 pgscatalog_utils/scorefile/liftover.py        |  12 +-
 pgscatalog_utils/scorefile/qc.py              |  10 +-
 pgscatalog_utils/scorefile/scoringfile.py     |   2 +-
 tests/data/combine/PGS001229_22.txt           | 850 ++++++++++++++++++
 tests/data/combine/scorefile.txt              | 838 +++++++++++++++++
 .../scorefile_dominant_and_recessive.txt      | 838 +++++++++++++++++
 tests/test_combine.py                         | 144 ++-
 tests/test_liftover.py                        |  42 +-
 10 files changed, 2783 insertions(+), 137 deletions(-)
 create mode 100644 tests/data/combine/PGS001229_22.txt
 create mode 100644 tests/data/combine/scorefile.txt
 create mode 100644 tests/data/combine/scorefile_dominant_and_recessive.txt

diff --git a/conftest.py b/conftest.py
index a4a55c6..ba3e065 100644
--- a/conftest.py
+++ b/conftest.py
@@ -1,11 +1,9 @@
 import glob
 import importlib.resources
 import os
-import pathlib
 import shutil
 from unittest.mock import patch
 
-import pandas as pd
 import polars as pl
 import pytest
 import requests as req
@@ -14,20 +12,51 @@
 from pgscatalog_utils.match.preprocess import complement_valid_alleles
 from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles
 
+from tests.data import combine
+
 pl.toggle_string_cache(True)
 
 
 @pytest.fixture(scope="session")
 def pgs_accessions():
-    return ['PGS001229', 'PGS000922']
+    return ["PGS001229", "PGS000922"]
+
+
+@pytest.fixture(scope="session")
+def mini_score_path(tmp_path_factory):
+    path = importlib.resources.files(combine) / "PGS001229_22.txt"
+    return path
+
+
+@pytest.fixture(scope="session")
+def mini_scorefile(mini_score_path, tmp_path_factory):
+    # The mini scorefile overlaps well with cineca synthetic subset
+    out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt"
+    args: list[str] = (
+        ["combine_scorefiles", "-t", "GRCh37", "-s"]
+        + [mini_score_path]
+        + ["-o", str(out_path.resolve())]
+    )
+
+    with patch("sys.argv", args):
+        combine_scorefiles()
+
+    return str(out_path.resolve())
 
 
 @pytest.fixture(scope="session")
 def scorefiles(tmp_path_factory, pgs_accessions):
     fn = tmp_path_factory.mktemp("scorefiles")
-    args: list[str] = ['download_scorefiles', '-b', 'GRCh37', '-o', str(fn.resolve()), '-i'] + pgs_accessions
-
-    with patch('sys.argv', args):
+    args: list[str] = [
+        "download_scorefiles",
+        "-b",
+        "GRCh37",
+        "-o",
+        str(fn.resolve()),
+        "-i",
+    ] + pgs_accessions
+
+    with patch("sys.argv", args):
         download_scorefile()
 
     return glob.glob(os.path.join(fn.resolve(), "*.txt.gz"))
@@ -37,8 +66,9 @@ def scorefiles(tmp_path_factory, pgs_accessions):
 def target_path(tmp_path_factory):
     try:
         bim = req.get(
-            'https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim',
-            timeout=5)
+            "https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim",
+            timeout=5,
+        )
     except (req.exceptions.ConnectionError, req.Timeout):
         bim = []
 
@@ -46,129 +76,85 @@ def target_path(tmp_path_factory):
         pytest.skip("Couldn't get test data from network")
     else:
         fn = tmp_path_factory.mktemp("target") / "data.bim"
-        with open(fn, 'wb') as f:
+        with open(fn, "wb") as f:
             f.write(bim.content)
 
         return str(fn.resolve())
 
 
-@pytest.fixture(scope="session")
-def mini_score_path(tmp_path_factory):
-    try:
-        score = req.get('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/PGS001229_22.txt',
-                        timeout=5)
-    except (req.exceptions.ConnectionError, req.Timeout):
-        score = []
-
-    if not score:
-        pytest.skip("Couldn't get test data from network")
-    else:
-        fn = tmp_path_factory.mktemp("score") / "PGS001229_22.txt"
-        with open(fn, 'wb') as f:
-            f.write(score.content)
-
-        return str(fn.resolve())
-
-
-@pytest.fixture(scope="session")
-def mini_scorefile(mini_score_path, tmp_path_factory):
-    # The mini scorefile overlaps well with cineca synthetic subset
-    out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt"
-    args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())]
-
-    with patch('sys.argv', args):
-        combine_scorefiles()
-
-    return str(out_path.resolve())
-
-
-@pytest.fixture(scope="session")
-def combined_scorefile(scorefiles, tmp_path_factory):
-    # The combined scorefile overlaps poorly with cineca synthetic subset
-    out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
-    args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + scorefiles + ['-o', str(out_path.resolve())]
-
-    with patch('sys.argv', args):
-        combine_scorefiles()
-
-    return str(out_path.resolve())
-
-
 @pytest.fixture(scope="session")
 def chain_files(tmp_path_factory):
-    chain_dir = tmp_path_factory.mktemp('chain_dir')
+    chain_dir = tmp_path_factory.mktemp("chain_dir")
 
     shutil.copy2("tests/data/hg19ToHg38.over.chain.gz", chain_dir)
     shutil.copy2("tests/data/hg38ToHg19.over.chain.gz", chain_dir)
-    
-    return str(chain_dir.resolve())
-
-
-@pytest.fixture(scope="session")
-def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory):
-    out_path = tmp_path_factory.mktemp("scores") / "lifted.txt"
-    args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t',
-                                                                          'GRCh38',
-                                                                          '-m', '0.8'] + ['-o', str(out_path.resolve())]
-
-    with patch('sys.argv', args):
-        combine_scorefiles()
 
-    return str(out_path.resolve())
+    return str(chain_dir.resolve())
 
 
 @pytest.fixture(scope="session")
 def hg38_coords():
-    d = {'rsid': ['rs11903757', 'rs6061231'], 'chr_name': ['2', '20'], 'chr_position': [191722478, 62381861]}
-    df = pd.DataFrame(d)
-    df['accession'] = 'dummy'
-    df['genome_build'] = 'GRCh38'
-    return df
+    rs11903757 = {"rsid": "rs11903757", "chr_name": "2", "chr_position": 191722478}
+    rs6061231 = {"rsid": "rs6061231", "chr_name": "20", "chr_position": 62381861}
+    return [rs11903757, rs6061231]
 
 
 @pytest.fixture(scope="session")
-def hg19_coords(hg38_coords):
+def hg19_coords():
     # hg38_coords in GRCh37, from dbSNP
-    d = {'lifted_chr': ['2', '20'], 'lifted_pos': [192587204, 60956917], 'liftover': [True, True]}
-    return pd.DataFrame(d)
+    rs11903757 = {"rsid": "rs11903757", "chr_name": "2", "chr_position": 192587204}
+    rs6061231 = {"rsid": "rs6061231", "chr_name": "20", "chr_position": 60956917}
+    return [rs11903757, rs6061231]
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def small_flipped_scorefile(small_scorefile):
     # simulate a scorefile on the wrong strand
-    return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele'])
-            .drop(['effect_allele', 'other_allele'])
-            .rename({'effect_allele_FLIP': 'effect_allele', 'other_allele_FLIP': 'other_allele'})
-            .pipe(complement_valid_alleles, ['effect_allele', 'other_allele']))
+    return (
+        complement_valid_alleles(small_scorefile, ["effect_allele", "other_allele"])
+        .drop(["effect_allele", "other_allele"])
+        .rename(
+            {"effect_allele_FLIP": "effect_allele", "other_allele_FLIP": "other_allele"}
+        )
+        .pipe(complement_valid_alleles, ["effect_allele", "other_allele"])
+    )
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def small_target():
-    return pl.DataFrame({"#CHROM": [1, 2, 3],
-                         "POS": [1, 2, 3],
-                         "REF": ["A", "T", "T"],
-                         "ALT": ["C", "A", "G"],
-                         "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"],
-                         "is_multiallelic": [False, False, False]})
+    return pl.DataFrame(
+        {
+            "#CHROM": [1, 2, 3],
+            "POS": [1, 2, 3],
+            "REF": ["A", "T", "T"],
+            "ALT": ["C", "A", "G"],
+            "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"],
+            "is_multiallelic": [False, False, False],
+        }
+    )
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def small_scorefile():
-    df = pl.DataFrame({"accession": ["test", "test", "test"],
-                       "row_nr": [1, 2, 3],
-                       "chr_name": [1, 2, 3],
-                       "chr_position": [1, 2, 3],
-                       "effect_allele": ["A", "A", "G"],
-                       "other_allele": ["C", "T", "T"],
-                       "effect_weight": [1, 2, 3],
-                       "effect_type": ["additive", "additive", "additive"]})
+    df = pl.DataFrame(
+        {
+            "accession": ["test", "test", "test"],
+            "row_nr": [1, 2, 3],
+            "chr_name": [1, 2, 3],
+            "chr_position": [1, 2, 3],
+            "effect_allele": ["A", "A", "G"],
+            "other_allele": ["C", "T", "T"],
+            "effect_weight": [1, 2, 3],
+            "effect_type": ["additive", "additive", "additive"],
+        }
+    )
 
     return complement_valid_alleles(df, ["effect_allele", "other_allele"])
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def small_scorefile_no_oa(small_scorefile):
-    return small_scorefile.with_column(pl.lit(None).alias('other_allele'))
+    return small_scorefile.with_column(pl.lit(None).alias("other_allele"))
 
 
 def _get_timeout(url):
diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py
index 8e3bea8..fbd3082 100644
--- a/pgscatalog_utils/scorefile/combine_scorefiles.py
+++ b/pgscatalog_utils/scorefile/combine_scorefiles.py
@@ -1,6 +1,7 @@
 import argparse
 import json
 import logging
+import pathlib
 import sys
 import textwrap
 import time
@@ -51,7 +52,8 @@ def combine_scorefiles():
     for (k, v), sf in zip(line_counts.items(), sfs):
         log.append(sf.generate_log(v))
 
-    with open(args.logfile, "w") as f:
+    log_out_path = pathlib.Path(args.outfile).parent / args.logfile
+    with open(log_out_path, "w") as f:
         logger.info(f"Writing log to {f.name}")
         json.dump(log, f, indent=4)
 
diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py
index 7e35b23..8097b70 100644
--- a/pgscatalog_utils/scorefile/liftover.py
+++ b/pgscatalog_utils/scorefile/liftover.py
@@ -20,13 +20,15 @@ def liftover(
         skip_lo = False
 
     if skip_lo:
+        logger.info("Skipping liftover")
         for variant in variants:
             yield variant
     else:
+        logger.info("Starting liftover")
         if current_build == GenomeBuild.GRCh37 and target_build == GenomeBuild.GRCh38:
             lo: pyliftover.LiftOver = Config.lo["hg19hg38"]
         elif current_build == GenomeBuild.GRCh38 and target_build == GenomeBuild.GRCh37:
-            lo: pyliftover.LiftOver = Config.lo["hg19hg38"]
+            lo: pyliftover.LiftOver = Config.lo["hg38hg19"]
         else:
             raise Exception("Can't get pyliftover object")
 
@@ -40,12 +42,16 @@ def liftover(
             if lifted:
                 variant["chr_name"] = lifted[0][0][3:].split("_")[0]
                 variant["chr_position"] = lifted[0][1] + 1  # reverse 0 indexing
+                variant["lifted"] = True
+                yield variant
                 n_lifted += 1
-            yield variant
+            else:
+                variant["lifted"] = False
+                yield variant
             n += 1
 
         if (n_lifted / n) < Config.min_lift:
-            logger.error("Liftover failed")
+            logger.error(f"Liftover failed: only {n_lifted}/{n} variants were lifted")
             raise Exception
         else:
             logger.info("Liftover successful")
diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index 3513903..8454fe4 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -117,14 +117,16 @@ def assign_effect_type(variants):
     for variant in variants:
         if "is_recessive" not in variant and "is_dominant" not in variant:
             variant["effect_type"] = "additive"
-
-        if "is_recessive" in variant or "is_dominant" in variant:
-            logger.info("Recessive or dominant variant detected")
-            if variant["is_recessive"]:
-                variant["effect_type"] = "recessive"
-            elif variant["is_dominant"]:
-                variant["effect_type"] = "dominant"
-            elif variant["is_recessive"] and variant["is_dominant"]:
+        else:
+            # flags are compared as the text "TRUE"; an absent flag counts as not set
+            recessive = variant.get("is_recessive") == "TRUE"
+            dominant = variant.get("is_dominant") == "TRUE"
+            # check the conflicting case first, otherwise it can never be reached
+            if recessive and dominant:
                 logger.critical(f"Bad effect type setting: {variant}")
                 raise Exception
+            elif recessive:
+                variant["effect_type"] = "recessive"
+            elif dominant:
+                variant["effect_type"] = "dominant"
 
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index c879b85..1b44c6c 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -85,7 +85,7 @@ def generate_log(self, line_count: int):
 
         if log["variants_number"] is None:
             # custom scoring files might not have this information
-            log["variants_number"] = line_count
+            log["variants_number"] = line_count + 1  # (0 indexed)
 
         # multiple terms may be separated with a pipe
         if log["trait_mapped"]:
diff --git a/tests/data/combine/PGS001229_22.txt b/tests/data/combine/PGS001229_22.txt
new file mode 100644
index 0000000..5f791f4
--- /dev/null
+++ b/tests/data/combine/PGS001229_22.txt
@@ -0,0 +1,850 @@
+###PGS CATALOG SCORING FILE - see https://https://www.pgscatalog.org/downloads/#dl_ftp_scoring_scoring for additional information
+#format_version=2.0
+##POLYGENIC SCORE (PGS) INFORMATION
+#pgs_id=PGS001229
+#pgs_name=GBE_INI50
+#trait_reported=Standing height
+#trait_mapped=body height
+#trait_efo=EFO_0004339
+#weight_type=NR
+#genome_build=GRCh37
+#variants_number=51209
+##SOURCE INFORMATION
+#pgp_id=PGP000244
+#citation=Tanigawa Y et al. medRxiv (2021). doi:10.1101/2021.09.02.21262942
+rsID	chr_name	chr_position	effect_allele	other_allele	effect_weight	is_haplotype	imputation_method	locus_name	variant_description
+rs5746679	22	17080378	G	A	1.045457e-02	False			
+rs2192430	22	17300230	A	G	1.411475e-04	False			
+rs165636	22	17318864	A	C	8.166266e-03	False			
+rs165808	22	17327595	T	C	7.791641e-03	False			
+rs5748743	22	17409813	A	G	3.108784e-04	False			
+rs11703655	22	17450952	G	A	-3.033983e-02	False			
+rs2192155	22	17492533	G	A	3.889990e-03	False			
+rs2845402	22	17542810	C	T	8.036290e-03	False			
+rs4819958	22	17565013	G	A	2.135621e-02	False			
+rs879577	22	17589209	T	C	3.026491e-03	False			
+rs5994165	22	17600977	A	G	1.581277e-02	False			
+rs35665085	22	17625915	A	G	-1.172964e-01	False			
+rs1034859	22	17630486	A	C	1.012909e-02	False			
+rs738032	22	17633785	C	T	2.325500e-03	False			
+rs5994185	22	17643689	A	G	3.361814e-03	False			
+rs2231495	22	17669306	C	T	2.145060e-02	False			
+rs5747018	22	17677699	T	C	-7.031384e-04	False			
+rs17807317	22	17680519	C	A	1.079236e-03	False			
+rs9606655	22	17701234	G	A	4.477145e-03	False			
+rs78718739	22	17703119	A	T	7.771872e-04	False			
+rs73153427	22	17718699	C	A	-1.320632e-02	False			
+rs4266110	22	17721595	C	T	9.480363e-03	False			
+rs4819982	22	17727648	T	C	7.811685e-03	False			
+rs5749000	22	17738177	G	A	-4.719812e-03	False			
+rs5749002	22	17749096	A	G	-5.244795e-03	False			
+rs11704699	22	17770181	G	T	-3.101703e-02	False			
+rs5749032	22	17793969	G	A	1.774444e-02	False			
+rs5994272	22	17815696	G	C	-5.516090e-03	False			
+rs4820001	22	17827684	G	A	-5.944752e-03	False			
+rs2040692	22	17831813	T	C	1.061587e-02	False			
+rs9606701	22	17844929	T	G	1.717643e-03	False			
+rs73391753	22	17850661	T	C	-2.805489e-02	False			
+rs4819581	22	17887534	A	G	7.723542e-04	False			
+rs2522310	22	17887725	A	G	7.472703e-03	False			
+rs5747199	22	17958221	C	A	-2.098647e-02	False			
+rs174346	22	18036253	G	A	-1.772981e-02	False			
+rs174351	22	18038786	A	G	-2.119071e-03	False			
+rs9605406	22	18262301	A	T	-5.065485e-03	False			
+rs389496	22	18289204	A	G	5.306345e-03	False			
+rs399757	22	18295575	C	T	2.300129e-02	False			
+rs1550663	22	18296238	G	A	-5.665446e-03	False			
+rs439231	22	18319179	T	C	3.440642e-02	False			
+rs2401424	22	18393534	A	C	1.764269e-02	False			
+rs1076540	22	18439958	T	C	-2.261707e-03	False			
+rs4819654	22	18483388	G	A	3.318724e-02	False			
+rs9617650	22	18488883	C	G	-1.919180e-02	False			
+rs397709	22	18489048	C	A	1.233198e-02	False			
+rs452579	22	18495470	A	G	-5.804926e-03	False			
+rs1992576	22	18537145	G	A	-4.930116e-03	False			
+rs464385	22	18571008	A	G	-8.844726e-05	False			
+rs3827281	22	18584433	C	T	-1.169893e-03	False			
+rs9618216	22	18631365	T	C	-1.551714e-03	False			
+rs3180408	22	18650682	T	C	-1.313784e-02	False			
+rs2010694	22	18890037	A	G	5.968921e-02	False			
+rs454534	22	18891398	G	A	6.891943e-03	False			
+rs2080346	22	18892575	A	G	-2.244470e-03	False			
+rs2016108	22	18915963	A	G	3.719756e-03	False			
+rs2518810	22	18959581	T	C	6.464581e-03	False			
+rs2019061	22	18963340	A	G	-1.397565e-02	False			
+rs11089247	22	18970915	T	C	-1.507131e-03	False			
+rs2073776	22	19024651	T	C	-3.505750e-03	False			
+rs712965	22	19121872	A	G	1.644046e-02	False			
+rs2275901	22	19135603	A	G	-2.970077e-02	False			
+rs11089259	22	19190143	T	C	3.268027e-03	False			
+rs361787	22	19263698	T	C	2.057255e-02	False			
+rs8135222	22	19292446	G	T	1.153989e-02	False			
+rs34292276	22	19371052	T	C	1.055134e-02	False			
+rs1128399	22	19420109	C	T	-8.628228e-03	False			
+rs5748218	22	19451186	A	C	2.141029e-02	False			
+rs5748260	22	19518079	C	T	5.372247e-03	False			
+rs5993713	22	19581331	T	C	1.686942e-02	False			
+rs9606090	22	19593854	C	A	6.544249e-04	False			
+rs8135254	22	19606703	G	A	2.070121e-02	False			
+rs9617823	22	19649005	A	G	2.868601e-03	False			
+rs6518580	22	19735854	C	T	6.262962e-03	False			
+rs1005133	22	19738355	T	C	4.973840e-05	False			
+rs9680615	22	19770886	A	G	-1.013929e-02	False			
+rs2871043	22	19781823	T	C	2.481609e-02	False			
+rs2073750	22	19873357	T	C	1.163020e-02	False			
+rs5748469	22	19907099	A	C	-2.676450e-02	False			
+rs9618723	22	19968597	T	C	-2.203945e-02	False			
+rs5748515	22	20046344	G	A	-9.801428e-03	False			
+rs59528277	22	20084821	C	T	-2.232886e-02	False			
+rs625704	22	20185457	A	G	6.892171e-03	False			
+rs672570	22	20189077	T	C	1.738215e-02	False			
+rs7293032	22	20219648	A	G	9.307625e-03	False			
+rs855050	22	20248391	A	G	-5.405845e-03	False			
+rs855061	22	20267213	A	G	6.713242e-03	False			
+rs741413	22	20286099	G	T	1.574758e-02	False			
+rs35012563	22	20749042	G	A	6.603339e-03	False			
+rs361860	22	20754039	A	G	-1.181141e-02	False			
+rs1771145	22	20775167	T	C	1.160113e-02	False			
+rs9680797	22	20780296	A	G	6.735311e-02	False			
+rs1005640	22	20789074	C	T	2.844307e-02	False			
+rs12628193	22	20791438	A	C	4.734740e-02	False			
+rs1035239	22	20793914	C	T	7.009781e-03	False			
+rs75179603	22	20839810	T	G	3.947346e-03	False			
+rs738092	22	20860931	T	C	5.613511e-04	False			
+rs10427922	22	20979980	G	A	3.231665e-03	False			
+rs2080195	22	20991771	G	A	4.226765e-03	False			
+rs5751800	22	21075537	C	A	-2.096453e-03	False			
+rs361979	22	21154393	G	T	-4.297086e-03	False			
+rs756878	22	21323357	C	T	-6.041745e-03	False			
+rs178275	22	21331918	G	C	-2.280912e-03	False			
+rs105034	22	21334924	C	G	-2.031369e-02	False			
+rs28372939	22	21356824	A	G	1.476577e-02	False			
+rs2072550	22	21386019	A	G	1.435557e-02	False			
+rs431319	22	21449028	G	A	-1.537701e-02	False			
+rs2845419	22	21463515	A	G	-1.335614e-02	False			
+rs2298428	22	21982892	T	C	-6.373335e-02	False			
+rs62235077	22	22001704	T	G	2.809584e-02	False			
+rs76940365	22	22062480	T	C	5.291130e-02	False			
+rs10427813	22	22080735	G	A	-1.394260e-02	False			
+rs78907487	22	22151939	C	A	-8.287849e-03	False			
+rs9607287	22	22163425	G	A	5.518983e-02	False			
+rs412050	22	22307519	C	G	-3.486191e-03	False			
+rs79165737	22	22351283	G	A	-7.483763e-04	False			
+rs5844480	22	22394291	AG	A	4.320583e-03	False			
+rs2213141	22	22395754	T	C	2.587971e-03	False			
+rs6519111	22	22424302	A	C	1.140800e-03	False			
+rs77010661	22	22473905	C	A	1.226009e-02	False			
+rs2073447	22	22550450	G	C	1.773244e-02	False			
+rs5757417	22	22561610	C	T	-6.207024e-03	False			
+rs6001482	22	22581369	G	A	-6.272413e-03	False			
+rs5757569	22	22584678	A	G	-2.176470e-03	False			
+rs736898	22	22711786	T	C	7.779875e-03	False			
+rs738881	22	22726372	T	C	3.496320e-03	False			
+rs2051490	22	22762771	C	T	1.252501e-02	False			
+rs433766	22	22769923	G	A	-1.103632e-02	False			
+rs361959	22	22869742	A	C	-2.412657e-03	False			
+rs362168	22	22871922	A	G	-2.769974e-03	False			
+rs4462880	22	22929268	T	C	-7.035723e-03	False			
+rs456455	22	23001481	A	G	7.524178e-03	False			
+rs11703025	22	23022520	T	C	2.175257e-03	False			
+rs10854762	22	23064982	A	C	-1.255076e-02	False			
+rs2856876	22	23249440	A	C	2.085816e-02	False			
+rs58555503	22	23268677	A	G	1.337349e-02	False			
+rs17514179	22	23279456	C	G	-1.371401e-02	False			
+rs468884	22	23282286	C	T	4.994329e-03	False			
+rs9623992	22	23325722	C	T	8.506657e-04	False			
+rs3788338	22	23412058	A	G	-9.545553e-03	False			
+rs140504	22	23627369	G	A	-1.900175e-02	False			
+rs12168342	22	23644425	G	A	-9.106953e-04	False			
+rs131693	22	23649242	G	T	1.061643e-03	False			
+rs3827368	22	23794844	G	A	-1.198736e-02	False			
+rs11090252	22	23804670	G	T	-1.119846e-03	False			
+rs2330498	22	23819697	T	G	-1.028722e-02	False			
+rs5759884	22	23873076	T	C	9.509027e-03	False			
+rs179303	22	23892145	T	C	1.351280e-02	False			
+rs131429	22	23925779	C	T	-4.127647e-03	False			
+rs6003815	22	23960187	T	C	-8.475905e-03	False			
+rs2070446	22	24035970	T	C	-1.334318e-03	False			
+rs5759985	22	24086107	G	A	-1.652957e-02	False			
+rs73396542	22	24105789	A	G	1.813091e-02	False			
+rs2298375	22	24106448	A	G	1.834095e-03	False			
+rs6003915	22	24186809	C	T	-1.426541e-02	False			
+rs4822446	22	24235360	G	A	3.168635e-04	False			
+rs4822455	22	24255296	T	C	1.624252e-02	False			
+rs144128236	22	24300540	T	C	-3.225760e-03	False			
+rs144686326	22	24376584	A	G	-6.223068e-03	False			
+rs422674	22	24406778	A	C	3.046540e-03	False			
+rs5996675	22	24618331	G	A	-6.506681e-04	False			
+rs5751862	22	24802564	A	G	-6.695797e-03	False			
+rs6004171	22	24912232	T	C	-1.536303e-02	False			
+rs762283	22	24943582	A	G	-1.687764e-03	False			
+rs2006092	22	24995668	G	A	-3.537331e-02	False			
+rs5760609	22	25123505	C	T	-1.600990e-02	False			
+rs5760620	22	25145094	T	C	-5.584047e-03	False			
+rs1892723	22	25145453	T	C	-1.388536e-03	False			
+rs5760661	22	25185823	A	G	-9.228375e-03	False			
+rs11703103	22	25265972	A	G	1.088906e-02	False			
+rs139766	22	25309448	A	G	-2.238693e-03	False			
+rs5752027	22	25363411	A	G	4.035775e-03	False			
+rs34259162	22	25410895	G	A	9.720734e-04	False			
+rs16979472	22	25442369	C	T	1.660527e-02	False			
+rs9612844	22	25454658	C	A	1.200285e-02	False			
+rs6004418	22	25465065	C	T	1.320801e-02	False			
+rs4627697	22	25524916	C	T	1.147501e-02	False			
+rs13055430	22	25603008	T	C	-1.262741e-02	False			
+rs7286982	22	25619025	G	T	-1.212511e-02	False			
+rs5752084	22	25621591	T	C	1.051851e-02	False			
+rs11703955	22	25643483	T	G	1.373474e-02	False			
+rs9612921	22	25661725	A	G	-5.936431e-03	False			
+rs6004519	22	25667883	G	A	1.547775e-02	False			
+rs5996879	22	25668730	A	C	2.616493e-02	False			
+rs67839603	22	25678577	T	C	3.040180e-02	False			
+rs79854676	22	25761309	T	C	-1.760112e-03	False			
+rs713847	22	25761936	T	C	-5.171998e-03	False			
+rs571663	22	25938977	T	C	1.966116e-02	False			
+rs1008673	22	25994013	A	G	6.268228e-04	False			
+rs718163	22	26081873	T	C	5.232603e-02	False			
+rs10212011	22	26132612	A	G	-6.457239e-03	False			
+rs133847	22	26133775	T	C	-1.181527e-03	False			
+rs133885	22	26159289	A	G	-8.399401e-03	False			
+rs3859870	22	26181767	C	T	1.044769e-02	False			
+rs5761201	22	26190915	G	A	4.287533e-03	False			
+rs5761256	22	26218164	G	A	-2.803502e-03	False			
+rs17704912	22	26231312	C	G	6.105629e-03	False			
+rs2269632	22	26237826	C	T	4.981479e-03	False			
+rs5761268	22	26239850	A	C	4.144037e-03	False			
+rs4822668	22	26273893	C	G	5.616213e-03	False			
+rs695809	22	26278128	G	T	-3.965338e-03	False			
+rs2157538	22	26280462	T	C	-8.324497e-04	False			
+rs6004814	22	26290588	T	C	-1.307320e-02	False			
+rs973523	22	26292659	G	A	4.294309e-05	False			
+rs2072006	22	26343593	G	A	7.813758e-03	False			
+rs9306419	22	26369358	T	C	-4.836650e-03	False			
+rs2331198	22	26390964	A	G	-7.849451e-03	False			
+rs5752254	22	26415475	T	C	-1.219281e-03	False			
+rs5752262	22	26456367	G	A	-1.285326e-02	False			
+rs56116806	22	26460519	T	C	-8.695338e-03	False			
+rs78711257	22	26528054	A	G	1.973023e-02	False			
+rs5752282	22	26617260	T	A	-1.384025e-02	False			
+rs4438594	22	26638906	G	T	1.229772e-02	False			
+rs5761484	22	26735648	A	G	7.879673e-04	False			
+rs5752316	22	26782251	G	A	5.096459e-04	False			
+rs7289238	22	26812632	C	T	-1.850814e-02	False			
+rs732933	22	26939781	C	T	-9.222796e-04	False			
+rs2267091	22	26960648	A	C	-5.679255e-03	False			
+rs5752371	22	27038865	T	G	-1.487706e-04	False			
+rs5752372	22	27042828	A	G	2.957737e-02	False			
+rs1476035	22	27161060	A	G	2.844558e-03	False			
+rs56278657	22	27191643	T	C	8.953731e-03	False			
+rs739226	22	27216426	G	A	9.120990e-03	False			
+rs4822804	22	27217018	A	G	1.510616e-02	False			
+rs136511	22	27240025	T	G	-2.971740e-02	False			
+rs136516	22	27242642	G	A	-9.822927e-03	False			
+rs136535	22	27246070	C	T	-1.554199e-03	False			
+rs9306427	22	27252454	C	T	-6.560251e-03	False			
+rs5761797	22	27264880	G	T	-1.323094e-02	False			
+rs4822824	22	27337886	A	G	-9.600014e-03	False			
+rs739257	22	27339284	T	C	-9.944488e-03	False			
+rs5761864	22	27353810	T	C	-2.171555e-03	False			
+rs5761885	22	27370273	T	C	-9.798478e-03	False			
+rs7288253	22	27378884	A	G	5.145072e-02	False			
+rs7287426	22	27398749	C	T	1.012263e-03	False			
+rs9613339	22	27403571	C	T	-1.745865e-02	False			
+rs2516086	22	27405012	T	C	-5.425419e-03	False			
+rs17343637	22	27415255	C	T	-1.499362e-02	False			
+rs60259956	22	27426628	G	C	2.289460e-02	False			
+rs9620654	22	27430724	A	G	-7.068064e-03	False			
+rs760526	22	27435577	C	T	-8.632412e-03	False			
+rs4822847	22	27487580	G	A	3.691502e-03	False			
+rs5761976	22	27498426	A	G	-6.801544e-03	False			
+rs11704703	22	27526095	G	A	-8.086267e-04	False			
+rs9625170	22	27563274	C	A	1.369650e-02	False			
+rs9306437	22	27584680	A	G	-2.139188e-03	False			
+rs546339	22	27628151	C	G	2.130389e-02	False			
+rs134786	22	27652290	T	G	4.815735e-03	False			
+rs760593	22	27660675	A	G	4.899654e-03	False			
+rs134810	22	27674832	G	T	1.248065e-04	False			
+rs736950	22	27718775	A	G	2.292384e-02	False			
+rs568561	22	27729742	G	A	4.951261e-03	False			
+rs6519705	22	27762155	C	T	4.856660e-03	False			
+rs6005412	22	27781736	A	C	-8.336242e-03	False			
+rs5752545	22	27829565	G	A	2.854090e-03	False			
+rs16984654	22	27832985	G	C	-1.668955e-02	False			
+rs4822878	22	27836311	G	A	-7.756250e-03	False			
+rs7288006	22	27839704	T	C	-2.492106e-02	False			
+rs5762173	22	27864471	A	C	2.189950e-03	False			
+rs10439912	22	27873024	G	A	2.721729e-03	False			
+rs5762194	22	27883265	G	A	2.961735e-02	False			
+rs6005471	22	27890684	A	G	-8.057355e-03	False			
+rs761596	22	27927298	T	C	2.054268e-02	False			
+rs5997265	22	27934290	G	A	4.751755e-03	False			
+rs5762249	22	27951176	A	G	-4.329547e-04	False			
+rs762064	22	27974819	C	A	1.439093e-02	False			
+rs4822917	22	27975451	G	A	-3.648208e-02	False			
+rs6005524	22	28007741	C	T	-1.635917e-02	False			
+rs1885362	22	28016883	C	A	8.564085e-03	False			
+rs8135014	22	28046561	T	C	1.535905e-02	False			
+rs9608638	22	28060034	A	G	3.097228e-02	False			
+rs134110	22	28076058	C	T	2.848654e-02	False			
+rs1885364	22	28094845	G	A	-2.659077e-02	False			
+rs7291248	22	28130130	C	T	-1.640387e-02	False			
+rs2079095	22	28136977	A	C	-3.962775e-03	False			
+rs4822935	22	28150109	G	A	6.071392e-04	False			
+rs2283844	22	28150815	A	G	1.604724e-02	False			
+rs2267106	22	28151825	A	G	-5.390282e-03	False			
+rs2267113	22	28155404	T	C	5.030388e-03	False			
+rs4822939	22	28172577	G	T	5.704168e-03	False			
+rs12166473	22	28185452	G	T	-6.896853e-03	False			
+rs5752639	22	28200176	G	A	-6.474674e-03	False			
+rs11705555	22	28206912	C	A	-6.175542e-03	False			
+rs5997320	22	28270372	G	T	-6.768204e-04	False			
+rs742547	22	28412908	G	T	1.763639e-02	False			
+rs77885044	22	28501414	T	C	-2.304747e-01	False			
+rs1884816	22	29106733	C	T	-1.074749e-02	False			
+rs132549	22	29318724	T	C	1.743333e-03	False			
+rs17518058	22	29378610	C	T	6.690876e-04	False			
+rs134620	22	29478760	C	T	-3.029428e-02	False			
+rs34920087	22	29533572	G	C	-1.269604e-02	False			
+rs111625211	22	29626515	A	G	-1.171130e-02	False			
+rs3950176	22	29630337	A	G	2.658049e-02	False			
+rs4820803	22	29669648	C	G	-8.550535e-03	False			
+rs131190	22	29692497	T	G	1.234896e-03	False			
+rs3804076	22	29837537	C	T	1.321112e-02	False			
+rs467768	22	29961986	T	G	1.878853e-03	False			
+rs140130	22	30151687	C	T	3.418302e-03	False			
+rs76013375	22	30163526	G	A	1.576261e-02	False			
+rs2412971	22	30494371	A	G	7.959801e-03	False			
+rs713875	22	30592487	G	C	-1.047403e-01	False			
+rs76168543	22	30621613	A	C	-1.382104e-02	False			
+rs55816744	22	30658082	C	T	-3.794014e-02	False			
+rs4823086	22	30688659	T	C	2.257140e-02	False			
+rs740223	22	30762140	A	G	2.079806e-02	False			
+rs757660	22	30793137	A	G	-4.609306e-03	False			
+rs9608956	22	30901592	C	T	-8.334040e-03	False			
+rs5749118	22	30927975	T	C	3.226189e-03	False			
+rs2267161	22	30953295	T	C	-7.685790e-03	False			
+rs4820875	22	30992651	G	A	-2.565800e-02	False			
+rs1131603	22	31018975	C	T	4.241226e-02	False			
+rs5997714	22	31032920	G	A	-2.311985e-02	False			
+rs34597012	22	31063804	G	GT	-2.081808e-04	False			
+rs136382	22	31114086	G	T	2.825476e-02	False			
+rs5753303	22	31139653	A	G	2.640129e-06	False			
+rs136230	22	31214382	G	A	1.137657e-02	False			
+rs57527354	22	31216506	C	T	5.531311e-03	False			
+rs67441859	22	31272930	T	C	-1.056118e-03	False			
+rs3747151	22	31333631	C	T	-1.235089e-02	False			
+rs4820921	22	31378447	A	G	1.020507e-02	False			
+rs715297	22	31442308	A	G	-2.479126e-03	False			
+rs11089487	22	31477361	C	G	-1.263667e-02	False			
+rs5753465	22	31514348	G	A	5.803240e-03	False			
+rs2240432	22	31521404	A	G	1.097391e-02	False			
+rs5749244	22	31659495	C	T	2.663412e-02	False			
+rs7289941	22	31884405	C	T	-3.950834e-04	False			
+rs41311139	22	32200849	T	C	1.585735e-02	False			
+rs7290696	22	32341684	T	C	-2.960328e-02	False			
+rs8139657	22	32559835	G	A	-2.170436e-02	False			
+rs7291990	22	32569263	C	T	-1.296006e-03	False			
+rs5998321	22	32624139	C	T	5.619574e-03	False			
+rs5753956	22	32702816	A	G	-1.534023e-02	False			
+rs201161881	22	32756652	G	A	2.512177e-02	False			
+rs2076050	22	32831540	T	C	1.868495e-03	False			
+rs2076054	22	32832874	T	C	6.028815e-05	False			
+rs9609559	22	32853660	G	A	1.382210e-02	False			
+rs62241183	22	32854391	C	A	1.960825e-04	False			
+rs11107	22	32875190	A	G	-6.426637e-03	False			
+rs11341975	22	32934713	C	CT	-9.057754e-03	False			
+rs2157189	22	32952012	A	C	-3.802480e-03	False			
+rs2710386	22	32954443	G	A	2.210369e-03	False			
+rs62232741	22	32993032	C	T	-2.429979e-03	False			
+rs966964	22	32997766	T	C	-8.424246e-03	False			
+rs62234573	22	33045573	T	C	-3.107145e-02	False			
+rs762899	22	33046110	G	C	-6.954732e-02	False			
+rs80186738	22	33048039	T	C	1.138346e-02	False			
+rs4821083	22	33056341	C	T	-6.477198e-02	False			
+rs997120	22	33108536	T	C	-3.426392e-02	False			
+rs7286819	22	33108981	T	C	-7.404035e-02	False			
+rs743743	22	33116435	T	C	6.542471e-02	False			
+rs2157133	22	33143528	G	A	2.195059e-02	False			
+rs58039541	22	33146363	A	G	8.105390e-04	False			
+rs5749529	22	33259625	C	T	2.309793e-02	False			
+rs137560	22	33336039	T	G	-2.554387e-02	False			
+rs9609680	22	33408519	T	C	-7.556300e-03	False			
+rs4821137	22	33660345	C	G	2.190743e-03	False			
+rs117531661	22	33804893	C	T	6.680774e-03	False			
+rs5754555	22	33844303	C	T	8.923314e-03	False			
+rs9609802	22	33846914	T	C	6.295378e-03	False			
+rs62225321	22	33898906	A	C	1.958759e-05	False			
+rs86487	22	34022284	A	G	-2.579330e-03	False			
+rs239333	22	34137784	G	A	4.460828e-03	False			
+rs5999111	22	34208570	T	C	-3.365869e-03	False			
+rs9941961	22	34217757	T	C	9.289431e-03	False			
+rs10854640	22	34256923	A	C	1.439384e-02	False			
+rs79442817	22	34265402	G	A	-1.636610e-02	False			
+rs5754747	22	34284173	G	A	-2.315559e-02	False			
+rs2157153	22	34296093	C	A	-4.688326e-03	False			
+rs12169215	22	34378012	A	G	2.276664e-03	False			
+rs242898	22	34436795	C	T	1.337033e-04	False			
+rs2413215	22	34488452	A	G	-4.288310e-04	False			
+rs135198	22	34501541	A	G	2.763614e-03	False			
+rs243001	22	34514810	C	A	3.976601e-03	False			
+rs130668	22	34526428	C	T	1.088864e-02	False			
+rs5999246	22	34583078	A	G	1.802495e-03	False			
+rs753600	22	34620754	T	C	1.466546e-02	False			
+rs2097307	22	34691035	A	G	-2.082615e-04	False			
+rs411451	22	34758540	T	C	5.165532e-03	False			
+rs2609850	22	34851377	A	C	1.371180e-02	False			
+rs737821	22	35371707	T	C	-4.985554e-04	False			
+rs35433006	22	35382268	A	C	-4.931336e-03	False			
+rs7292124	22	35419122	C	T	-1.077953e-02	False			
+rs8140287	22	35478529	A	G	1.760523e-04	False			
+rs61735502	22	35481493	T	C	1.056439e-02	False			
+rs80730	22	35526281	G	A	-2.766891e-03	False			
+rs61134707	22	35603836	A	G	-1.783939e-04	False			
+rs1053593	22	35660875	T	G	3.988231e-02	False			
+rs6518950	22	35745196	G	T	1.750545e-04	False			
+rs17793276	22	35750980	A	G	-7.651136e-03	False			
+rs2071749	22	35783413	G	A	1.649791e-03	False			
+rs35806646	22	35918270	C	T	6.918713e-03	False			
+rs5750115	22	35959242	A	G	1.697538e-02	False			
+rs738368	22	35962060	G	A	5.181476e-03	False			
+rs926338	22	35964158	G	C	2.769931e-03	False			
+rs5995124	22	35984385	A	G	-1.280623e-02	False			
+rs4820205	22	36001258	C	T	1.342405e-02	False			
+rs4327313	22	36072262	T	C	4.895490e-03	False			
+rs6000004	22	36180535	G	A	-3.250252e-02	False			
+rs78188544	22	36517307	C	T	1.366076e-02	False			
+rs6000142	22	36519596	A	C	-3.499560e-03	False			
+rs9610403	22	36532058	A	G	-1.214487e-02	False			
+rs3788518	22	36543489	C	G	7.838149e-03	False			
+rs6000190	22	36600841	G	A	2.644389e-02	False			
+rs2010659	22	36629633	C	A	-6.871468e-03	False			
+rs136145	22	36635967	G	A	-2.634742e-02	False			
+rs78188930	22	36655735	A	G	-5.385142e-03	False			
+rs136176	22	36661646	A	G	-1.560741e-02	False			
+rs2269529	22	36684354	C	T	-5.170111e-03	False			
+rs75138027	22	36705622	A	G	1.713234e-02	False			
+rs3842715	22	36708049	C	CTCCTGTGA	-5.187051e-02	False			
+rs11089788	22	36751101	A	C	-2.440650e-02	False			
+rs16996704	22	36764788	G	A	2.784116e-02	False			
+rs5756223	22	36897427	C	T	2.603792e-02	False			
+rs760718	22	36900806	G	A	7.366207e-03	False			
+rs6000293	22	36923144	T	C	-1.875563e-03	False			
+rs5995298	22	36924714	G	A	-3.632594e-03	False			
+rs140020	22	36946643	T	G	1.333137e-02	False			
+rs4821501	22	36954939	T	C	1.105894e-02	False			
+rs5756255	22	36998907	T	C	-6.084687e-04	False			
+rs9622429	22	37001495	G	T	-1.224147e-02	False			
+rs2267348	22	37013167	G	C	1.866849e-02	False			
+rs6000386	22	37077364	C	T	7.294257e-03	False			
+rs738514	22	37080738	C	G	-4.873355e-03	False			
+rs2746971	22	37101890	C	T	3.991764e-02	False			
+rs933229	22	37118535	A	G	-1.713909e-03	False			
+rs62230508	22	37184521	G	A	6.515894e-03	False			
+rs4820254	22	37206341	G	T	2.566936e-04	False			
+rs11089806	22	37256262	A	G	1.152626e-03	False			
+rs4821544	22	37258503	C	T	-9.761102e-03	False			
+rs909486	22	37323988	T	C	-7.318200e-03	False			
+rs1534882	22	37329545	G	A	5.775806e-03	False			
+rs131843	22	37337409	T	C	-2.534399e-02	False			
+rs2093380	22	37343000	A	C	-4.011777e-04	False			
+rs743749	22	37398195	T	C	-1.001198e-02	False			
+rs2413447	22	37401532	A	G	-3.244795e-03	False			
+rs11554714	22	37407109	C	G	4.335972e-02	False			
+rs2543523	22	37477732	T	C	3.669548e-04	False			
+rs1861947	22	37507019	A	G	-9.259451e-04	False			
+rs28450477	22	37513316	A	G	1.153887e-03	False			
+rs3218297	22	37532441	A	G	1.802306e-02	False			
+rs2543529	22	37571497	G	A	-5.785311e-03	False			
+rs7290488	22	37581383	T	C	3.172492e-02	False			
+rs9798725	22	37621269	C	A	4.460405e-03	False			
+rs10212068	22	37644621	T	C	-8.386907e-03	False			
+rs730422	22	37671896	A	G	2.303688e-02	False			
+rs1041895	22	37679763	G	A	-2.658396e-03	False			
+rs1008184	22	37720268	G	A	2.120184e-02	False			
+rs2069221	22	37753256	C	T	8.984539e-03	False			
+rs4821645	22	37757099	G	A	-1.560347e-02	False			
+rs9610727	22	37780522	C	G	-1.496708e-02	False			
+rs9607459	22	37800175	T	C	-5.510833e-03	False			
+rs6000739	22	37846448	G	A	1.152963e-02	False			
+rs742152	22	37896749	C	T	5.447068e-03	False			
+rs6000756	22	37908435	C	T	1.909131e-03	False			
+rs12167061	22	37977481	T	C	1.465308e-02	False			
+rs75937893	22	37992699	G	A	8.339179e-04	False			
+rs36120988	22	38032762	G	GA	1.693041e-02	False			
+rs9622677	22	38054262	C	A	4.354146e-02	False			
+rs4820295	22	38083101	C	T	-2.092117e-02	False			
+rs12628603	22	38119213	A	G	3.948165e-02	False			
+rs5756795	22	38122122	C	T	4.377277e-02	False			
+rs79849571	22	38204089	T	C	2.977743e-02	False			
+rs117267625	22	38435786	T	G	-7.684278e-03	False			
+rs2284063	22	38544298	G	A	5.090446e-02	False			
+rs4608623	22	38597378	T	G	-1.997927e-02	False			
+rs4444637	22	38606780	G	A	-9.182016e-03	False			
+rs5995554	22	38630272	C	T	7.393137e-03	False			
+rs135720	22	38663819	G	A	-6.392021e-03	False			
+rs135730	22	38673234	A	G	-1.106705e-02	False			
+rs35336050	22	38685131	C	T	-4.493352e-03	False			
+rs5750581	22	38695406	T	C	-1.155972e-02	False			
+rs56182369	22	38708506	A	G	1.701713e-02	False			
+rs5757057	22	38744184	C	T	-2.112956e-02	False			
+rs743942	22	38819613	A	G	-5.625806e-03	False			
+rs12004	22	38877461	G	T	1.108728e-03	False			
+rs5750616	22	38918894	G	T	-8.094286e-03	False			
+rs112010490	22	38928269	G	T	-2.114917e-02	False			
+rs35069730	22	39027286	C	CAG	3.840735e-03	False			
+rs3747172	22	39067524	G	A	1.200232e-02	False			
+rs5757275	22	39159201	C	T	3.096214e-03	False			
+rs760482	22	39178701	G	A	2.148449e-03	False			
+rs735306	22	39260032	T	C	3.574634e-02	False			
+rs760481	22	39268785	T	G	9.377414e-03	False			
+rs5750691	22	39281774	G	T	3.816951e-02	False			
+rs5757355	22	39300265	C	T	3.540156e-02	False			
+rs1014971	22	39332623	T	C	-4.449842e-03	False			
+rs5757424	22	39415780	G	A	1.479946e-02	False			
+rs35860424	22	39448465	A	G	3.065974e-03	False			
+rs2011869	22	39480697	G	A	-4.005617e-02	False			
+rs139272	22	39487665	G	A	-1.218988e-04	False			
+rs55989856	22	39493294	C	T	-3.115929e-02	False			
+rs738469	22	39510995	G	A	-2.069106e-02	False			
+rs877529	22	39542292	A	G	9.653575e-03	False			
+rs73884827	22	39543000	T	C	-4.069841e-03	False			
+rs7287160	22	39573724	A	C	2.683694e-02	False			
+rs5750761	22	39575692	A	C	1.451305e-02	False			
+rs738470	22	39581277	A	C	1.766406e-02	False			
+rs13053714	22	39626572	A	G	-2.901981e-02	False			
+rs5757580	22	39658626	C	T	4.177065e-03	False			
+rs1569497	22	39665395	G	A	1.264611e-02	False			
+rs54211	22	39687484	G	A	5.418141e-03	False			
+rs6519183	22	39708279	A	G	-4.281532e-02	False			
+rs5757611	22	39708357	T	C	8.605574e-03	False			
+rs5750811	22	39793066	G	T	3.658209e-02	False			
+rs34026806	22	39798127	G	A	2.302129e-03	False			
+rs5757678	22	39843409	T	C	1.065699e-02	False			
+rs6001601	22	39865475	G	A	1.588501e-03	False			
+rs5757703	22	39932516	A	G	-1.179841e-02	False			
+rs62228477	22	39963426	G	A	-1.503908e-02	False			
+rs11704409	22	40023636	C	T	6.443146e-03	False			
+rs136829	22	40046176	C	T	-7.416552e-04	False			
+rs5757764	22	40067818	T	C	4.559360e-03	False			
+rs5757777	22	40092864	G	A	2.400297e-02	False			
+rs5757783	22	40127293	T	C	-8.870038e-04	False			
+rs7285609	22	40358148	T	C	-1.079902e-02	False			
+rs8139715	22	40420786	G	C	-8.092115e-03	False			
+rs7291691	22	40454069	G	T	7.898880e-03	False			
+rs732384	22	40541981	G	A	1.742640e-02	False			
+rs12484776	22	40652873	G	A	5.853057e-03	False			
+rs28360630	22	40676672	G	T	-1.894274e-03	False			
+rs470113	22	40729614	G	A	1.959940e-02	False			
+rs5757949	22	40820151	C	T	-1.628066e-02	False			
+rs35898643	22	40986372	G	C	-1.983507e-02	False			
+rs12165625	22	41494925	A	G	-2.918069e-02	False			
+rs11703267	22	41646738	G	A	3.521847e-04	False			
+rs8139705	22	41680898	T	C	1.402732e-02	False			
+rs34011394	22	41704872	T	C	6.681484e-05	False			
+rs2073167	22	41791536	C	T	-5.572333e-05	False			
+rs2076196	22	41895409	A	G	-4.407217e-02	False			
+rs2076198	22	41929175	G	T	-3.186844e-02	False			
+rs739134	22	42089623	C	T	5.322340e-03	False			
+rs147348682	22	42095658	G	T	3.846131e-02	False			
+rs139568	22	42210985	C	T	-3.139710e-03	False			
+rs13055841	22	42279653	G	A	-6.596336e-03	False			
+rs7293091	22	42341308	G	A	-6.862491e-04	False			
+rs35742686	22	42524243	C	CT	-1.181191e-02	False			
+rs762995	22	42672124	G	A	-5.278171e-03	False			
+rs1548304	22	42691238	T	C	-1.642396e-02	False			
+rs8139063	22	42813753	C	T	-3.867750e-03	False			
+rs5758742	22	42867898	G	A	-1.352327e-03	False			
+rs11553441	22	42912097	T	C	-7.295657e-04	False			
+rs4822160	22	42932317	A	G	-5.768556e-02	False			
+rs28627172	22	43010817	A	G	1.722077e-02	False			
+rs130370	22	43080028	T	C	-5.527551e-04	False			
+rs6002910	22	43096507	T	C	-5.556102e-03	False			
+rs738526	22	43112475	T	C	-1.350273e-02	False			
+rs8138149	22	43114824	G	A	-1.963192e-02	False			
+rs5758896	22	43115576	C	T	-1.880097e-02	False			
+rs9623692	22	43154299	G	A	-1.621113e-03	False			
+rs9611885	22	43159948	T	C	-7.980584e-03	False			
+rs1018448	22	43206950	C	A	-5.783037e-03	False			
+rs9607957	22	43218397	C	T	-3.976636e-03	False			
+rs2267463	22	43283255	C	A	-1.426668e-02	False			
+rs4822220	22	43290583	C	T	-3.955775e-02	False			
+rs8140884	22	43333156	A	G	-3.127845e-02	False			
+rs6003002	22	43426262	G	A	-3.668040e-03	False			
+rs8141749	22	43483242	T	C	-2.540203e-02	False			
+rs4988388	22	43515108	C	T	-1.570749e-02	False			
+rs13815	22	43529314	C	G	1.738127e-02	False			
+rs5759199	22	43551513	G	A	2.565386e-02	False			
+rs6972	22	43558972	A	G	-1.962819e-02	False			
+rs4822262	22	43577214	T	C	-2.270478e-02	False			
+rs13058467	22	43579049	C	T	-1.193909e-03	False			
+rs138993	22	43610207	G	A	-7.621661e-03	False			
+rs129415	22	43623395	G	C	-4.852519e-02	False			
+rs11703272	22	43640512	C	T	-5.533207e-03	False			
+rs139027	22	43649701	C	T	7.724845e-02	False			
+rs5751462	22	43661080	T	C	-4.251741e-02	False			
+rs739306	22	43683088	A	G	-3.582388e-03	False			
+rs4820518	22	43707996	A	G	-2.547044e-02	False			
+rs6519367	22	43711080	C	G	-5.784446e-03	False			
+rs6003156	22	43721519	C	A	3.658850e-04	False			
+rs1894717	22	43729401	C	T	8.557013e-03	False			
+rs4820525	22	43763757	T	G	-1.789810e-02	False			
+rs28673361	22	43836198	G	T	2.427697e-03	False			
+rs9614382	22	43976396	A	G	-1.277457e-02	False			
+rs137731	22	44031042	C	T	3.593107e-03	False			
+rs9614187	22	44193626	C	A	-6.865434e-03	False			
+rs138057	22	44221247	G	A	1.833991e-02	False			
+rs4823156	22	44296372	T	C	6.169212e-03	False			
+rs6006453	22	44298838	A	G	7.441756e-03	False			
+rs2294918	22	44342116	G	A	2.810328e-02	False			
+rs3761472	22	44368122	G	A	1.299680e-02	False			
+rs8418	22	44379838	G	A	1.648422e-03	False			
+rs6006598	22	44380033	C	T	-2.136788e-03	False			
+rs1007863	22	44395451	C	T	-6.698507e-03	False			
+rs7285340	22	44419871	C	T	1.816130e-02	False			
+rs6006622	22	44424108	T	C	1.036733e-02	False			
+rs130313	22	44467899	C	T	-2.592364e-03	False			
+rs9614325	22	44498134	T	C	7.281423e-03	False			
+rs1535009	22	44522312	C	T	-2.636447e-04	False			
+rs4823194	22	44526130	G	A	-3.882980e-03	False			
+rs2267613	22	44530286	A	G	2.528159e-02	False			
+rs2267614	22	44530420	C	T	-1.233654e-02	False			
+rs10483222	22	44548944	G	A	-3.947209e-03	False			
+rs77120395	22	44551755	G	A	1.262458e-02	False			
+rs9614359	22	44566434	A	G	-4.290306e-03	False			
+rs139131	22	44581046	T	C	-1.479950e-02	False			
+rs9626137	22	44643161	C	T	1.439493e-02	False			
+rs135400	22	44677081	C	T	-1.030513e-02	False			
+rs135388	22	44681612	G	A	-1.269762e-03	False			
+rs3935378	22	44695088	T	C	6.324859e-03	False			
+rs6519840	22	44707716	G	T	2.288939e-03	False			
+rs62228577	22	44725343	G	A	3.534678e-03	False			
+rs6519897	22	44738406	G	A	2.320049e-02	False			
+rs7289501	22	44746729	A	G	-1.754216e-02	False			
+rs5764718	22	44751158	G	A	-6.539695e-03	False			
+rs9614538	22	44757439	A	G	2.480295e-02	False			
+rs9614823	22	44759519	G	A	2.111274e-03	False			
+rs5765809	22	44761797	A	T	-5.311720e-03	False			
+rs5764921	22	44763352	C	G	1.452737e-02	False			
+rs19985	22	44783779	G	A	9.142699e-03	False			
+rs2071820	22	44791807	C	T	-2.371876e-02	False			
+rs2746583	22	44818986	C	T	-6.740622e-03	False			
+rs5765690	22	44894913	G	A	-5.179871e-05	False			
+rs4508	22	45058431	C	T	1.098259e-02	False			
+rs6006845	22	45066035	A	G	-1.484374e-02	False			
+rs9614870	22	45069410	T	C	1.530441e-02	False			
+rs41515447	22	45081330	G	A	1.350120e-03	False			
+rs28460735	22	45082168	C	A	3.663354e-03	False			
+rs4823364	22	45090008	G	A	2.811861e-03	False			
+rs6006857	22	45116664	C	T	1.247728e-02	False			
+rs2269543	22	45244930	T	C	-1.450041e-02	False			
+rs8881	22	45258457	G	A	-3.500519e-03	False			
+rs9614987	22	45323989	T	C	1.111338e-03	False			
+rs140556	22	45415987	A	G	-1.398184e-02	False			
+rs132067	22	45451355	G	A	-5.566982e-03	False			
+rs5765155	22	45471607	C	T	1.148978e-02	False			
+rs5765167	22	45497738	C	T	-5.029327e-03	False			
+rs7292035	22	45502829	C	T	-3.893521e-02	False			
+rs2018928	22	45519040	T	G	2.377071e-03	False			
+rs6006941	22	45523391	A	G	1.318997e-02	False			
+rs17548742	22	45573450	C	A	4.385600e-03	False			
+rs1125398	22	45589490	G	A	-8.350439e-03	False			
+rs58667	22	45668012	T	C	1.286879e-02	False			
+rs5765242	22	45671343	G	A	-2.940682e-06	False			
+rs2742648	22	45672574	T	C	5.743608e-03	False			
+rs5765250	22	45693923	A	G	-2.675069e-03	False			
+rs7290139	22	45718743	G	A	-2.092804e-02	False			
+rs11556482	22	45723807	C	G	1.670159e-03	False			
+rs6007594	22	45728370	A	G	1.879231e-04	False			
+rs56343022	22	45741537	G	T	1.420045e-02	False			
+rs5764698	22	45749983	T	G	-4.591012e-02	False			
+rs2272804	22	45809624	A	C	2.185772e-03	False			
+rs2142662	22	45821935	A	G	2.250782e-02	False			
+rs6007041	22	45837410	G	A	-2.756449e-03	False			
+rs11090631	22	45846371	T	C	7.910102e-02	False			
+rs713975	22	45864934	T	C	8.535181e-03	False			
+rs10483228	22	45871507	G	C	-7.764056e-03	False			
+rs5765426	22	45892656	G	T	-3.885653e-03	False			
+rs3810631	22	45897997	C	T	3.935204e-04	False			
+rs105199	22	45929577	C	T	-2.532217e-02	False			
+rs136755	22	45936350	A	G	-8.001698e-03	False			
+rs5765463	22	45942726	T	G	-1.415551e-02	False			
+rs13268	22	45996298	G	A	5.643525e-02	False			
+rs17564843	22	46009063	G	A	6.464843e-03	False			
+rs5765546	22	46022070	G	A	2.246740e-02	False			
+rs2239398	22	46155548	G	C	-3.247470e-02	False			
+rs136018	22	46207955	C	T	-1.354554e-03	False			
+rs136029	22	46236425	A	G	8.398423e-02	False			
+rs57514815	22	46275529	T	C	2.264300e-03	False			
+rs75427302	22	46287720	A	G	-2.237482e-02	False			
+rs28473346	22	46289699	T	C	1.872124e-02	False			
+rs9697736	22	46303347	T	C	-1.283734e-02	False			
+rs28663466	22	46316057	A	G	2.312579e-02	False			
+rs9286453	22	46337043	G	C	1.701173e-02	False			
+rs75862558	22	46347519	C	T	1.574289e-02	False			
+rs9330813	22	46364161	A	G	-4.466341e-02	False			
+rs62228062	22	46381234	G	A	4.730559e-02	False			
+rs28628653	22	46396925	G	A	1.783944e-03	False			
+rs28698504	22	46403715	A	G	-2.132589e-02	False			
+rs78358349	22	46406782	A	C	8.439466e-02	False			
+rs9627368	22	46445002	G	C	-7.613496e-02	False			
+rs7292297	22	46458123	G	T	3.328073e-02	False			
+rs9626891	22	46482948	C	T	4.241879e-02	False			
+rs12160757	22	46486508	C	T	-9.684390e-03	False			
+rs3747243	22	46493852	T	C	-6.758580e-03	False			
+rs9616125	22	46499120	C	G	-9.873118e-03	False			
+rs12170325	22	46502870	T	C	-1.792140e-02	False			
+rs76755807	22	46561713	G	A	2.604703e-02	False			
+rs4253701	22	46586110	A	G	-1.256735e-03	False			
+rs59842914	22	46592168	C	T	1.417055e-02	False			
+rs1800206	22	46614274	G	C	-5.854014e-02	False			
+rs4253772	22	46627603	T	C	8.004024e-02	False			
+rs35364389	22	46760086	T	C	3.229515e-03	False			
+rs34267201	22	46782382	T	C	-2.470821e-02	False			
+rs9627450	22	46807234	C	T	2.324176e-03	False			
+rs9306514	22	46837114	G	A	9.440730e-04	False			
+rs5768830	22	46888399	T	C	9.911095e-03	False			
+rs9615374	22	46907779	G	A	6.531440e-03	False			
+rs4823838	22	46909355	T	G	-4.780494e-03	False			
+rs12484501	22	46914277	A	C	9.689535e-03	False			
+rs3810636	22	46943687	G	A	-1.303660e-02	False			
+rs9627514	22	46985917	A	G	1.893397e-02	False			
+rs9615396	22	47021226	G	A	-1.322949e-02	False			
+rs13057352	22	47095235	A	C	-1.156013e-01	False			
+rs13054785	22	47109621	C	T	4.322858e-04	False			
+rs34301321	22	47125474	G	A	-1.746025e-02	False			
+rs17221476	22	47147117	T	C	-2.418349e-02	False			
+rs5769136	22	47156703	C	T	2.628970e-02	False			
+rs6008990	22	47245836	A	G	1.880575e-03	False			
+rs140535	22	47271747	C	T	1.055264e-03	False			
+rs5767397	22	47301822	C	T	3.032158e-03	False			
+rs9616173	22	47345487	T	C	-2.945945e-03	False			
+rs470059	22	47372368	T	C	2.067644e-02	False			
+rs136120	22	47380606	C	T	4.041426e-02	False			
+rs5769300	22	47437808	C	T	1.683027e-03	False			
+rs131924	22	47450911	A	G	1.624479e-02	False			
+rs910541	22	47511864	A	C	-4.226735e-03	False			
+rs2295246	22	47519476	T	C	-3.954111e-03	False			
+rs13055207	22	47529458	A	G	-3.602848e-04	False			
+rs4823597	22	47531320	T	C	-6.899703e-03	False			
+rs738669	22	47548321	T	C	4.925401e-03	False			
+rs2337244	22	47568291	C	T	7.726693e-03	False			
+rs15646	22	47571203	A	G	-9.744751e-03	False			
+rs135368	22	47574009	C	T	-5.327010e-03	False			
+rs136618	22	47642100	T	C	6.976251e-03	False			
+rs136636	22	47657635	T	C	1.798943e-03	False			
+rs6008118	22	47683805	C	T	-3.475544e-02	False			
+rs36008375	22	47720973	T	C	-7.868172e-03	False			
+rs17763944	22	47821952	G	A	-8.854280e-04	False			
+rs2301382	22	47893053	A	G	-2.449056e-02	False			
+rs5767784	22	47935365	C	T	-1.599879e-03	False			
+rs2285093	22	47961708	G	T	-3.593525e-03	False			
+rs131114	22	47986332	T	C	-3.976592e-03	False			
+rs9615626	22	48154645	C	T	7.608639e-03	False			
+rs5845816	22	48165452	C	CT	2.039503e-03	False			
+rs16994709	22	48207318	T	C	-9.725168e-03	False			
+rs4823698	22	48213904	G	C	-1.220367e-02	False			
+rs9615649	22	48215904	A	G	-2.488244e-05	False			
+rs738739	22	48220460	T	C	-2.702163e-03	False			
+rs738743	22	48230941	C	A	-1.129522e-03	False			
+rs4823717	22	48271961	A	G	-5.053446e-03	False			
+rs2338258	22	48284025	T	C	-3.344182e-03	False			
+rs5768135	22	48297953	C	T	-1.046958e-02	False			
+rs1028528	22	48362290	G	A	-2.367254e-02	False			
+rs28537386	22	48362914	C	A	-3.167719e-03	False			
+rs5768244	22	48387670	A	G	-8.243989e-03	False			
+rs7289071	22	48415446	C	T	2.130715e-03	False			
+rs135271	22	48460730	T	C	2.682476e-03	False			
+rs5768344	22	48491160	T	C	1.257794e-03	False			
+rs4823512	22	48519794	C	T	3.680757e-03	False			
+rs6007807	22	48537775	G	A	2.134692e-03	False			
+rs106953	22	48543566	T	C	7.314089e-03	False			
+rs133534	22	48593037	C	T	9.084708e-03	False			
+rs34776844	22	48687509	C	T	-2.771960e-02	False			
+rs5768510	22	48692033	T	C	-2.126264e-02	False			
+rs62223851	22	48699617	T	C	5.093107e-04	False			
+rs34080684	22	48717568	T	C	-8.190281e-04	False			
+rs1475987	22	48811946	C	T	7.916515e-03	False			
+rs7293013	22	48823357	G	A	1.464317e-02	False			
+rs2071750	22	48840428	A	C	3.711229e-03	False			
+rs9615896	22	48851612	T	C	-5.887765e-03	False			
+rs13056230	22	48874310	T	C	-1.106607e-02	False			
+rs761793	22	48968070	C	T	1.280691e-02	False			
+rs28658383	22	48991385	T	C	-1.234119e-02	False			
+rs34694572	22	49004050	G	A	2.290755e-02	False			
+rs28406241	22	49014565	A	G	1.555565e-03	False			
+rs7288241	22	49086481	T	C	-6.196369e-03	False			
+rs4989008	22	49107173	T	C	1.277272e-02	False			
+rs131032	22	49180915	A	G	6.346977e-03	False			
+rs4076042	22	49262579	A	G	2.657134e-02	False			
+rs28726380	22	49270317	C	T	1.447665e-03	False			
+rs2024695	22	49313196	A	G	-7.055532e-03	False			
+rs1467436	22	49335230	T	C	-6.548281e-03	False			
+rs4824067	22	49366123	T	C	1.136486e-02	False			
+rs738596	22	49372356	G	C	-2.420841e-02	False			
+rs17178683	22	49443666	T	C	1.581736e-02	False			
+rs55898343	22	49496835	G	A	-1.355414e-02	False			
+rs1981477	22	49524428	A	G	-4.228482e-03	False			
+rs135257	22	49530553	G	C	8.197389e-03	False			
+rs9627875	22	49537845	T	C	1.112550e-02	False			
+rs5769975	22	49557457	G	A	9.401926e-03	False			
+rs5769981	22	49562666	C	A	1.271701e-02	False			
+rs2318943	22	49574509	C	T	4.703177e-04	False			
+rs5769446	22	49579141	A	G	2.448619e-02	False			
+rs7288983	22	49650863	T	C	6.739571e-03	False			
+rs5770154	22	49662549	T	G	-5.769464e-03	False			
+rs1880009	22	49665841	T	C	-7.037069e-04	False			
+rs62220604	22	49677464	A	G	-2.177735e-02	False			
+rs6009594	22	49696067	C	T	-3.309682e-03	False			
+rs5770223	22	49700272	T	G	-2.541948e-03	False			
+rs1124544	22	49706433	T	C	-1.719402e-02	False			
+rs73173197	22	49713835	G	A	-1.370754e-02	False			
+rs848761	22	49719264	A	C	-1.067852e-02	False			
+rs848721	22	49743627	G	A	-5.970581e-04	False			
+rs9628005	22	49800265	C	T	3.098582e-02	False			
+rs6009666	22	49806863	A	G	3.940447e-03	False			
+rs136795	22	49830851	C	T	-2.742706e-03	False			
+rs11705513	22	49834624	G	A	-2.820163e-03	False			
+rs6009703	22	49843235	G	C	-4.458281e-04	False			
+rs9616311	22	49847501	T	G	2.235016e-03	False			
+rs4823938	22	49861033	C	T	1.721243e-02	False			
+rs5770489	22	49881321	A	G	-5.127800e-04	False			
+rs9628037	22	49908804	G	A	-9.455892e-03	False			
+rs134474	22	49911222	G	T	-1.389666e-02	False			
+rs17779492	22	49925268	A	G	1.679984e-02	False			
+rs134447	22	49927332	T	C	3.929800e-04	False			
+rs111392589	22	50109212	T	C	1.610819e-02	False			
+rs6009846	22	50118149	G	C	7.024666e-03	False			
+rs138844	22	50184484	G	T	1.222581e-02	False			
+rs117613664	22	50219447	T	C	5.091891e-02	False			
+rs910799	22	50278568	G	A	-2.340672e-02	False			
+rs78676969	22	50319170	G	A	1.669806e-02	False			
+rs28372448	22	50350971	A	G	2.640160e-02	False			
+rs4077129	22	50356693	C	T	3.851499e-03	False			
+rs5771069	22	50435480	G	A	1.663630e-02	False			
+rs9617098	22	50439626	A	G	-2.722154e-03	False			
+rs137890	22	50466542	C	T	-2.560094e-03	False			
+rs11101958	22	50470516	T	C	-1.621986e-02	False			
+rs5771133	22	50491150	G	A	1.828674e-02	False			
+rs6010164	22	50515270	C	T	1.439904e-02	False			
+rs56144269	22	50529850	C	T	2.054628e-02	False			
+rs75570992	22	50570755	C	G	7.077514e-03	False			
+rs2272837	22	50582626	G	A	-3.588854e-03	False			
+rs17836662	22	50672154	A	G	7.660848e-03	False			
+rs11547731	22	50722134	C	T	-1.747164e-02	False			
+rs79966207	22	50722408	C	T	-1.063465e-03	False			
+rs28379706	22	50728062	C	T	2.159223e-02	False			
+rs11553142	22	50750481	T	C	1.877272e-02	False			
+rs62241237	22	50758873	T	C	4.001731e-03	False			
+rs9628184	22	50835040	A	G	-6.374259e-03	False			
+rs9616997	22	50859049	C	T	3.480749e-04	False			
+rs1053744	22	50885775	G	A	-1.358311e-02	False			
+rs2232883	22	50926768	T	C	1.798498e-03	False			
+rs2232885	22	50928026	A	G	4.775504e-03	False			
+rs140522	22	50971266	C	T	2.160893e-02	False			
+rs41281529	22	50989197	T	C	-1.328884e-02	False			
+rs131778	22	50989326	G	A	1.037054e-02	False			
+rs5770892	22	50999681	G	A	-1.226224e-02	False			
+rs35826039	22	51046163	T	C	-2.754002e-02	False			
+rs9616915	22	51117580	C	T	3.573542e-02	False			
+rs2301584	22	51171497	A	G	-1.951606e-02	False			
+rs73174435	22	51174939	T	C	-6.178519e-03	False			
diff --git a/tests/data/combine/scorefile.txt b/tests/data/combine/scorefile.txt
new file mode 100644
index 0000000..1043a68
--- /dev/null
+++ b/tests/data/combine/scorefile.txt
@@ -0,0 +1,838 @@
+#pgs_name=PGS001229_22
+#genome_build=GRCh37
+chr_name	chr_position	effect_allele	other_allele	effect_weight
+22	17080378	G	A	0.01045457
+22	17300230	A	G	0.0001411475
+22	17318864	A	C	0.008166266
+22	17327595	T	C	0.007791641
+22	17409813	A	G	0.0003108784
+22	17450952	G	A	-0.03033983
+22	17492533	G	A	0.00388999
+22	17542810	C	T	0.00803629
+22	17565013	G	A	0.02135621
+22	17589209	T	C	0.003026491
+22	17600977	A	G	0.01581277
+22	17625915	A	G	-0.1172964
+22	17630486	A	C	0.01012909
+22	17633785	C	T	0.0023255
+22	17643689	A	G	0.003361814
+22	17669306	C	T	0.0214506
+22	17677699	T	C	-0.0007031384
+22	17680519	C	A	0.001079236
+22	17701234	G	A	0.004477145
+22	17703119	A	T	0.0007771872
+22	17718699	C	A	-0.01320632
+22	17721595	C	T	0.009480363
+22	17727648	T	C	0.007811685
+22	17738177	G	A	-0.004719812
+22	17749096	A	G	-0.005244795
+22	17770181	G	T	-0.03101703
+22	17793969	G	A	0.01774444
+22	17815696	G	C	-0.00551609
+22	17827684	G	A	-0.005944752
+22	17831813	T	C	0.01061587
+22	17844929	T	G	0.001717643
+22	17850661	T	C	-0.02805489
+22	17887534	A	G	0.0007723542
+22	17887725	A	G	0.007472703
+22	17958221	C	A	-0.02098647
+22	18036253	G	A	-0.01772981
+22	18038786	A	G	-0.002119071
+22	18262301	A	T	-0.005065485
+22	18289204	A	G	0.005306345
+22	18295575	C	T	0.02300129
+22	18296238	G	A	-0.005665446
+22	18319179	T	C	0.03440642
+22	18393534	A	C	0.01764269
+22	18439958	T	C	-0.002261707
+22	18483388	G	A	0.03318724
+22	18488883	C	G	-0.0191918
+22	18489048	C	A	0.01233198
+22	18495470	A	G	-0.005804926
+22	18537145	G	A	-0.004930116
+22	18571008	A	G	-8.844726E-05
+22	18584433	C	T	-0.001169893
+22	18631365	T	C	-0.001551714
+22	18650682	T	C	-0.01313784
+22	18890037	A	G	0.05968921
+22	18891398	G	A	0.006891943
+22	18892575	A	G	-0.00224447
+22	18915963	A	G	0.003719756
+22	18959581	T	C	0.006464581
+22	18963340	A	G	-0.01397565
+22	18970915	T	C	-0.001507131
+22	19024651	T	C	-0.00350575
+22	19121872	A	G	0.01644046
+22	19135603	A	G	-0.02970077
+22	19190143	T	C	0.003268027
+22	19263698	T	C	0.02057255
+22	19292446	G	T	0.01153989
+22	19371052	T	C	0.01055134
+22	19420109	C	T	-0.008628228
+22	19451186	A	C	0.02141029
+22	19518079	C	T	0.005372247
+22	19581331	T	C	0.01686942
+22	19593854	C	A	0.0006544249
+22	19606703	G	A	0.02070121
+22	19649005	A	G	0.002868601
+22	19735854	C	T	0.006262962
+22	19738355	T	C	4.97384E-05
+22	19770886	A	G	-0.01013929
+22	19781823	T	C	0.02481609
+22	19873357	T	C	0.0116302
+22	19907099	A	C	-0.0267645
+22	19968597	T	C	-0.02203945
+22	20046344	G	A	-0.009801428
+22	20084821	C	T	-0.02232886
+22	20185457	A	G	0.006892171
+22	20189077	T	C	0.01738215
+22	20219648	A	G	0.009307625
+22	20248391	A	G	-0.005405845
+22	20267213	A	G	0.006713242
+22	20286099	G	T	0.01574758
+22	20749042	G	A	0.006603339
+22	20754039	A	G	-0.01181141
+22	20775167	T	C	0.01160113
+22	20780296	A	G	0.06735311
+22	20789074	C	T	0.02844307
+22	20791438	A	C	0.0473474
+22	20793914	C	T	0.007009781
+22	20839810	T	G	0.003947346
+22	20860931	T	C	0.0005613511
+22	20979980	G	A	0.003231665
+22	20991771	G	A	0.004226765
+22	21075537	C	A	-0.002096453
+22	21154393	G	T	-0.004297086
+22	21323357	C	T	-0.006041745
+22	21331918	G	C	-0.002280912
+22	21334924	C	G	-0.02031369
+22	21356824	A	G	0.01476577
+22	21386019	A	G	0.01435557
+22	21449028	G	A	-0.01537701
+22	21463515	A	G	-0.01335614
+22	21982892	T	C	-0.06373335
+22	22001704	T	G	0.02809584
+22	22062480	T	C	0.0529113
+22	22080735	G	A	-0.0139426
+22	22151939	C	A	-0.008287849
+22	22163425	G	A	0.05518983
+22	22307519	C	G	-0.003486191
+22	22351283	G	A	-0.0007483763
+22	22394291	AG	A	0.004320583
+22	22395754	T	C	0.002587971
+22	22424302	A	C	0.0011408
+22	22473905	C	A	0.01226009
+22	22550450	G	C	0.01773244
+22	22561610	C	T	-0.006207024
+22	22581369	G	A	-0.006272413
+22	22584678	A	G	-0.00217647
+22	22711786	T	C	0.007779875
+22	22726372	T	C	0.00349632
+22	22762771	C	T	0.01252501
+22	22769923	G	A	-0.01103632
+22	22869742	A	C	-0.002412657
+22	22871922	A	G	-0.002769974
+22	22929268	T	C	-0.007035723
+22	23001481	A	G	0.007524178
+22	23022520	T	C	0.002175257
+22	23064982	A	C	-0.01255076
+22	23249440	A	C	0.02085816
+22	23268677	A	G	0.01337349
+22	23279456	C	G	-0.01371401
+22	23282286	C	T	0.004994329
+22	23325722	C	T	0.0008506657
+22	23412058	A	G	-0.009545553
+22	23627369	G	A	-0.01900175
+22	23644425	G	A	-0.0009106953
+22	23649242	G	T	0.001061643
+22	23794844	G	A	-0.01198736
+22	23804670	G	T	-0.001119846
+22	23819697	T	G	-0.01028722
+22	23873076	T	C	0.009509027
+22	23892145	T	C	0.0135128
+22	23925779	C	T	-0.004127647
+22	23960187	T	C	-0.008475905
+22	24035970	T	C	-0.001334318
+22	24086107	G	A	-0.01652957
+22	24105789	A	G	0.01813091
+22	24106448	A	G	0.001834095
+22	24186809	C	T	-0.01426541
+22	24235360	G	A	0.0003168635
+22	24255296	T	C	0.01624252
+22	24300540	T	C	-0.00322576
+22	24376584	A	G	-0.006223068
+22	24406778	A	C	0.00304654
+22	24618331	G	A	-0.0006506681
+22	24802564	A	G	-0.006695797
+22	24912232	T	C	-0.01536303
+22	24943582	A	G	-0.001687764
+22	24995668	G	A	-0.03537331
+22	25123505	C	T	-0.0160099
+22	25145094	T	C	-0.005584047
+22	25145453	T	C	-0.001388536
+22	25185823	A	G	-0.009228375
+22	25265972	A	G	0.01088906
+22	25309448	A	G	-0.002238693
+22	25363411	A	G	0.004035775
+22	25410895	G	A	0.0009720734
+22	25442369	C	T	0.01660527
+22	25454658	C	A	0.01200285
+22	25465065	C	T	0.01320801
+22	25524916	C	T	0.01147501
+22	25603008	T	C	-0.01262741
+22	25619025	G	T	-0.01212511
+22	25621591	T	C	0.01051851
+22	25643483	T	G	0.01373474
+22	25661725	A	G	-0.005936431
+22	25667883	G	A	0.01547775
+22	25668730	A	C	0.02616493
+22	25678577	T	C	0.0304018
+22	25761309	T	C	-0.001760112
+22	25761936	T	C	-0.005171998
+22	25938977	T	C	0.01966116
+22	25994013	A	G	0.0006268228
+22	26081873	T	C	0.05232603
+22	26132612	A	G	-0.006457239
+22	26133775	T	C	-0.001181527
+22	26159289	A	G	-0.008399401
+22	26181767	C	T	0.01044769
+22	26190915	G	A	0.004287533
+22	26218164	G	A	-0.002803502
+22	26231312	C	G	0.006105629
+22	26237826	C	T	0.004981479
+22	26239850	A	C	0.004144037
+22	26273893	C	G	0.005616213
+22	26278128	G	T	-0.003965338
+22	26280462	T	C	-0.0008324497
+22	26290588	T	C	-0.0130732
+22	26292659	G	A	4.294309E-05
+22	26343593	G	A	0.007813758
+22	26369358	T	C	-0.00483665
+22	26390964	A	G	-0.007849451
+22	26415475	T	C	-0.001219281
+22	26456367	G	A	-0.01285326
+22	26460519	T	C	-0.008695338
+22	26528054	A	G	0.01973023
+22	26617260	T	A	-0.01384025
+22	26638906	G	T	0.01229772
+22	26735648	A	G	0.0007879673
+22	26782251	G	A	0.0005096459
+22	26812632	C	T	-0.01850814
+22	26939781	C	T	-0.0009222796
+22	26960648	A	C	-0.005679255
+22	27038865	T	G	-0.0001487706
+22	27042828	A	G	0.02957737
+22	27161060	A	G	0.002844558
+22	27191643	T	C	0.008953731
+22	27216426	G	A	0.00912099
+22	27217018	A	G	0.01510616
+22	27240025	T	G	-0.0297174
+22	27242642	G	A	-0.009822927
+22	27246070	C	T	-0.001554199
+22	27252454	C	T	-0.006560251
+22	27264880	G	T	-0.01323094
+22	27337886	A	G	-0.009600014
+22	27339284	T	C	-0.009944488
+22	27353810	T	C	-0.002171555
+22	27370273	T	C	-0.009798478
+22	27378884	A	G	0.05145072
+22	27398749	C	T	0.001012263
+22	27403571	C	T	-0.01745865
+22	27405012	T	C	-0.005425419
+22	27415255	C	T	-0.01499362
+22	27426628	G	C	0.0228946
+22	27430724	A	G	-0.007068064
+22	27435577	C	T	-0.008632412
+22	27487580	G	A	0.003691502
+22	27498426	A	G	-0.006801544
+22	27526095	G	A	-0.0008086267
+22	27563274	C	A	0.0136965
+22	27584680	A	G	-0.002139188
+22	27628151	C	G	0.02130389
+22	27652290	T	G	0.004815735
+22	27660675	A	G	0.004899654
+22	27674832	G	T	0.0001248065
+22	27718775	A	G	0.02292384
+22	27729742	G	A	0.004951261
+22	27762155	C	T	0.00485666
+22	27781736	A	C	-0.008336242
+22	27829565	G	A	0.00285409
+22	27832985	G	C	-0.01668955
+22	27836311	G	A	-0.00775625
+22	27839704	T	C	-0.02492106
+22	27864471	A	C	0.00218995
+22	27873024	G	A	0.002721729
+22	27883265	G	A	0.02961735
+22	27890684	A	G	-0.008057355
+22	27927298	T	C	0.02054268
+22	27934290	G	A	0.004751755
+22	27951176	A	G	-0.0004329547
+22	27974819	C	A	0.01439093
+22	27975451	G	A	-0.03648208
+22	28007741	C	T	-0.01635917
+22	28016883	C	A	0.008564085
+22	28046561	T	C	0.01535905
+22	28060034	A	G	0.03097228
+22	28076058	C	T	0.02848654
+22	28094845	G	A	-0.02659077
+22	28130130	C	T	-0.01640387
+22	28136977	A	C	-0.003962775
+22	28150109	G	A	0.0006071392
+22	28150815	A	G	0.01604724
+22	28151825	A	G	-0.005390282
+22	28155404	T	C	0.005030388
+22	28172577	G	T	0.005704168
+22	28185452	G	T	-0.006896853
+22	28200176	G	A	-0.006474674
+22	28206912	C	A	-0.006175542
+22	28270372	G	T	-0.0006768204
+22	28412908	G	T	0.01763639
+22	28501414	T	C	-0.2304747
+22	29106733	C	T	-0.01074749
+22	29318724	T	C	0.001743333
+22	29378610	C	T	0.0006690876
+22	29478760	C	T	-0.03029428
+22	29533572	G	C	-0.01269604
+22	29626515	A	G	-0.0117113
+22	29630337	A	G	0.02658049
+22	29669648	C	G	-0.008550535
+22	29692497	T	G	0.001234896
+22	29837537	C	T	0.01321112
+22	29961986	T	G	0.001878853
+22	30151687	C	T	0.003418302
+22	30163526	G	A	0.01576261
+22	30494371	A	G	0.007959801
+22	30592487	G	C	-0.1047403
+22	30621613	A	C	-0.01382104
+22	30658082	C	T	-0.03794014
+22	30688659	T	C	0.0225714
+22	30762140	A	G	0.02079806
+22	30793137	A	G	-0.004609306
+22	30901592	C	T	-0.00833404
+22	30927975	T	C	0.003226189
+22	30953295	T	C	-0.00768579
+22	30992651	G	A	-0.025658
+22	31018975	C	T	0.04241226
+22	31032920	G	A	-0.02311985
+22	31063804	G	GT	-0.0002081808
+22	31114086	G	T	0.02825476
+22	31139653	A	G	2.640129E-06
+22	31214382	G	A	0.01137657
+22	31216506	C	T	0.005531311
+22	31272930	T	C	-0.001056118
+22	31333631	C	T	-0.01235089
+22	31378447	A	G	0.01020507
+22	31442308	A	G	-0.002479126
+22	31477361	C	G	-0.01263667
+22	31514348	G	A	0.00580324
+22	31521404	A	G	0.01097391
+22	31659495	C	T	0.02663412
+22	31884405	C	T	-0.0003950834
+22	32200849	T	C	0.01585735
+22	32341684	T	C	-0.02960328
+22	32559835	G	A	-0.02170436
+22	32569263	C	T	-0.001296006
+22	32624139	C	T	0.005619574
+22	32702816	A	G	-0.01534023
+22	32756652	G	A	0.02512177
+22	32831540	T	C	0.001868495
+22	32832874	T	C	6.028815E-05
+22	32853660	G	A	0.0138221
+22	32854391	C	A	0.0001960825
+22	32875190	A	G	-0.006426637
+22	32934713	C	CT	-0.009057754
+22	32952012	A	C	-0.00380248
+22	32954443	G	A	0.002210369
+22	32993032	C	T	-0.002429979
+22	32997766	T	C	-0.008424246
+22	33045573	T	C	-0.03107145
+22	33046110	G	C	-0.06954732
+22	33048039	T	C	0.01138346
+22	33056341	C	T	-0.06477198
+22	33108536	T	C	-0.03426392
+22	33108981	T	C	-0.07404035
+22	33116435	T	C	0.06542471
+22	33143528	G	A	0.02195059
+22	33146363	A	G	0.000810539
+22	33259625	C	T	0.02309793
+22	33336039	T	G	-0.02554387
+22	33408519	T	C	-0.0075563
+22	33660345	C	G	0.002190743
+22	33804893	C	T	0.006680774
+22	33844303	C	T	0.008923314
+22	33846914	T	C	0.006295378
+22	33898906	A	C	1.958759E-05
+22	34022284	A	G	-0.00257933
+22	34137784	G	A	0.004460828
+22	34208570	T	C	-0.003365869
+22	34217757	T	C	0.009289431
+22	34256923	A	C	0.01439384
+22	34265402	G	A	-0.0163661
+22	34284173	G	A	-0.02315559
+22	34296093	C	A	-0.004688326
+22	34378012	A	G	0.002276664
+22	34436795	C	T	0.0001337033
+22	34488452	A	G	-0.000428831
+22	34501541	A	G	0.002763614
+22	34514810	C	A	0.003976601
+22	34526428	C	T	0.01088864
+22	34583078	A	G	0.001802495
+22	34620754	T	C	0.01466546
+22	34691035	A	G	-0.0002082615
+22	34758540	T	C	0.005165532
+22	34851377	A	C	0.0137118
+22	35371707	T	C	-0.0004985554
+22	35382268	A	C	-0.004931336
+22	35419122	C	T	-0.01077953
+22	35478529	A	G	0.0001760523
+22	35481493	T	C	0.01056439
+22	35526281	G	A	-0.002766891
+22	35603836	A	G	-0.0001783939
+22	35660875	T	G	0.03988231
+22	35745196	G	T	0.0001750545
+22	35750980	A	G	-0.007651136
+22	35783413	G	A	0.001649791
+22	35918270	C	T	0.006918713
+22	35959242	A	G	0.01697538
+22	35962060	G	A	0.005181476
+22	35964158	G	C	0.002769931
+22	35984385	A	G	-0.01280623
+22	36001258	C	T	0.01342405
+22	36072262	T	C	0.00489549
+22	36180535	G	A	-0.03250252
+22	36517307	C	T	0.01366076
+22	36519596	A	C	-0.00349956
+22	36532058	A	G	-0.01214487
+22	36543489	C	G	0.007838149
+22	36600841	G	A	0.02644389
+22	36629633	C	A	-0.006871468
+22	36635967	G	A	-0.02634742
+22	36655735	A	G	-0.005385142
+22	36661646	A	G	-0.01560741
+22	36684354	C	T	-0.005170111
+22	36705622	A	G	0.01713234
+22	36708049	C	CTCCTGTGA	-0.05187051
+22	36751101	A	C	-0.0244065
+22	36764788	G	A	0.02784116
+22	36897427	C	T	0.02603792
+22	36900806	G	A	0.007366207
+22	36923144	T	C	-0.001875563
+22	36924714	G	A	-0.003632594
+22	36946643	T	G	0.01333137
+22	36954939	T	C	0.01105894
+22	36998907	T	C	-0.0006084687
+22	37001495	G	T	-0.01224147
+22	37013167	G	C	0.01866849
+22	37077364	C	T	0.007294257
+22	37080738	C	G	-0.004873355
+22	37101890	C	T	0.03991764
+22	37118535	A	G	-0.001713909
+22	37184521	G	A	0.006515894
+22	37206341	G	T	0.0002566936
+22	37256262	A	G	0.001152626
+22	37258503	C	T	-0.009761102
+22	37323988	T	C	-0.0073182
+22	37329545	G	A	0.005775806
+22	37337409	T	C	-0.02534399
+22	37343000	A	C	-0.0004011777
+22	37398195	T	C	-0.01001198
+22	37401532	A	G	-0.003244795
+22	37407109	C	G	0.04335972
+22	37477732	T	C	0.0003669548
+22	37507019	A	G	-0.0009259451
+22	37513316	A	G	0.001153887
+22	37532441	A	G	0.01802306
+22	37571497	G	A	-0.005785311
+22	37581383	T	C	0.03172492
+22	37621269	C	A	0.004460405
+22	37644621	T	C	-0.008386907
+22	37671896	A	G	0.02303688
+22	37679763	G	A	-0.002658396
+22	37720268	G	A	0.02120184
+22	37753256	C	T	0.008984539
+22	37757099	G	A	-0.01560347
+22	37780522	C	G	-0.01496708
+22	37800175	T	C	-0.005510833
+22	37846448	G	A	0.01152963
+22	37896749	C	T	0.005447068
+22	37908435	C	T	0.001909131
+22	37977481	T	C	0.01465308
+22	37992699	G	A	0.0008339179
+22	38032762	G	GA	0.01693041
+22	38054262	C	A	0.04354146
+22	38083101	C	T	-0.02092117
+22	38119213	A	G	0.03948165
+22	38122122	C	T	0.04377277
+22	38204089	T	C	0.02977743
+22	38435786	T	G	-0.007684278
+22	38544298	G	A	0.05090446
+22	38597378	T	G	-0.01997927
+22	38606780	G	A	-0.009182016
+22	38630272	C	T	0.007393137
+22	38663819	G	A	-0.006392021
+22	38673234	A	G	-0.01106705
+22	38685131	C	T	-0.004493352
+22	38695406	T	C	-0.01155972
+22	38708506	A	G	0.01701713
+22	38744184	C	T	-0.02112956
+22	38819613	A	G	-0.005625806
+22	38877461	G	T	0.001108728
+22	38918894	G	T	-0.008094286
+22	38928269	G	T	-0.02114917
+22	39027286	C	CAG	0.003840735
+22	39067524	G	A	0.01200232
+22	39159201	C	T	0.003096214
+22	39178701	G	A	0.002148449
+22	39260032	T	C	0.03574634
+22	39268785	T	G	0.009377414
+22	39281774	G	T	0.03816951
+22	39300265	C	T	0.03540156
+22	39332623	T	C	-0.004449842
+22	39415780	G	A	0.01479946
+22	39448465	A	G	0.003065974
+22	39480697	G	A	-0.04005617
+22	39487665	G	A	-0.0001218988
+22	39493294	C	T	-0.03115929
+22	39510995	G	A	-0.02069106
+22	39542292	A	G	0.009653575
+22	39543000	T	C	-0.004069841
+22	39573724	A	C	0.02683694
+22	39575692	A	C	0.01451305
+22	39581277	A	C	0.01766406
+22	39626572	A	G	-0.02901981
+22	39658626	C	T	0.004177065
+22	39665395	G	A	0.01264611
+22	39687484	G	A	0.005418141
+22	39708279	A	G	-0.04281532
+22	39708357	T	C	0.008605574
+22	39793066	G	T	0.03658209
+22	39798127	G	A	0.002302129
+22	39843409	T	C	0.01065699
+22	39865475	G	A	0.001588501
+22	39932516	A	G	-0.01179841
+22	39963426	G	A	-0.01503908
+22	40023636	C	T	0.006443146
+22	40046176	C	T	-0.0007416552
+22	40067818	T	C	0.00455936
+22	40092864	G	A	0.02400297
+22	40127293	T	C	-0.0008870038
+22	40358148	T	C	-0.01079902
+22	40420786	G	C	-0.008092115
+22	40454069	G	T	0.00789888
+22	40541981	G	A	0.0174264
+22	40652873	G	A	0.005853057
+22	40676672	G	T	-0.001894274
+22	40729614	G	A	0.0195994
+22	40820151	C	T	-0.01628066
+22	40986372	G	C	-0.01983507
+22	41494925	A	G	-0.02918069
+22	41646738	G	A	0.0003521847
+22	41680898	T	C	0.01402732
+22	41704872	T	C	6.681484E-05
+22	41791536	C	T	-5.572333E-05
+22	41895409	A	G	-0.04407217
+22	41929175	G	T	-0.03186844
+22	42089623	C	T	0.00532234
+22	42095658	G	T	0.03846131
+22	42210985	C	T	-0.00313971
+22	42279653	G	A	-0.006596336
+22	42341308	G	A	-0.0006862491
+22	42524243	C	CT	-0.01181191
+22	42672124	G	A	-0.005278171
+22	42691238	T	C	-0.01642396
+22	42813753	C	T	-0.00386775
+22	42867898	G	A	-0.001352327
+22	42912097	T	C	-0.0007295657
+22	42932317	A	G	-0.05768556
+22	43010817	A	G	0.01722077
+22	43080028	T	C	-0.0005527551
+22	43096507	T	C	-0.005556102
+22	43112475	T	C	-0.01350273
+22	43114824	G	A	-0.01963192
+22	43115576	C	T	-0.01880097
+22	43154299	G	A	-0.001621113
+22	43159948	T	C	-0.007980584
+22	43206950	C	A	-0.005783037
+22	43218397	C	T	-0.003976636
+22	43283255	C	A	-0.01426668
+22	43290583	C	T	-0.03955775
+22	43333156	A	G	-0.03127845
+22	43426262	G	A	-0.00366804
+22	43483242	T	C	-0.02540203
+22	43515108	C	T	-0.01570749
+22	43529314	C	G	0.01738127
+22	43551513	G	A	0.02565386
+22	43558972	A	G	-0.01962819
+22	43577214	T	C	-0.02270478
+22	43579049	C	T	-0.001193909
+22	43610207	G	A	-0.007621661
+22	43623395	G	C	-0.04852519
+22	43640512	C	T	-0.005533207
+22	43649701	C	T	0.07724845
+22	43661080	T	C	-0.04251741
+22	43683088	A	G	-0.003582388
+22	43707996	A	G	-0.02547044
+22	43711080	C	G	-0.005784446
+22	43721519	C	A	0.000365885
+22	43729401	C	T	0.008557013
+22	43763757	T	G	-0.0178981
+22	43836198	G	T	0.002427697
+22	43976396	A	G	-0.01277457
+22	44031042	C	T	0.003593107
+22	44193626	C	A	-0.006865434
+22	44221247	G	A	0.01833991
+22	44296372	T	C	0.006169212
+22	44298838	A	G	0.007441756
+22	44342116	G	A	0.02810328
+22	44368122	G	A	0.0129968
+22	44379838	G	A	0.001648422
+22	44380033	C	T	-0.002136788
+22	44395451	C	T	-0.006698507
+22	44419871	C	T	0.0181613
+22	44424108	T	C	0.01036733
+22	44467899	C	T	-0.002592364
+22	44498134	T	C	0.007281423
+22	44522312	C	T	-0.0002636447
+22	44526130	G	A	-0.00388298
+22	44530286	A	G	0.02528159
+22	44530420	C	T	-0.01233654
+22	44548944	G	A	-0.003947209
+22	44551755	G	A	0.01262458
+22	44566434	A	G	-0.004290306
+22	44581046	T	C	-0.0147995
+22	44643161	C	T	0.01439493
+22	44677081	C	T	-0.01030513
+22	44681612	G	A	-0.001269762
+22	44695088	T	C	0.006324859
+22	44707716	G	T	0.002288939
+22	44725343	G	A	0.003534678
+22	44738406	G	A	0.02320049
+22	44746729	A	G	-0.01754216
+22	44751158	G	A	-0.006539695
+22	44757439	A	G	0.02480295
+22	44759519	G	A	0.002111274
+22	44761797	A	T	-0.00531172
+22	44763352	C	G	0.01452737
+22	44783779	G	A	0.009142699
+22	44791807	C	T	-0.02371876
+22	44818986	C	T	-0.006740622
+22	44894913	G	A	-5.179871E-05
+22	45058431	C	T	0.01098259
+22	45066035	A	G	-0.01484374
+22	45069410	T	C	0.01530441
+22	45081330	G	A	0.00135012
+22	45082168	C	A	0.003663354
+22	45090008	G	A	0.002811861
+22	45116664	C	T	0.01247728
+22	45244930	T	C	-0.01450041
+22	45258457	G	A	-0.003500519
+22	45323989	T	C	0.001111338
+22	45415987	A	G	-0.01398184
+22	45451355	G	A	-0.005566982
+22	45471607	C	T	0.01148978
+22	45497738	C	T	-0.005029327
+22	45502829	C	T	-0.03893521
+22	45519040	T	G	0.002377071
+22	45523391	A	G	0.01318997
+22	45573450	C	A	0.0043856
+22	45589490	G	A	-0.008350439
+22	45668012	T	C	0.01286879
+22	45671343	G	A	-2.940682E-06
+22	45672574	T	C	0.005743608
+22	45693923	A	G	-0.002675069
+22	45718743	G	A	-0.02092804
+22	45723807	C	G	0.001670159
+22	45728370	A	G	0.0001879231
+22	45741537	G	T	0.01420045
+22	45749983	T	G	-0.04591012
+22	45809624	A	C	0.002185772
+22	45821935	A	G	0.02250782
+22	45837410	G	A	-0.002756449
+22	45846371	T	C	0.07910102
+22	45864934	T	C	0.008535181
+22	45871507	G	C	-0.007764056
+22	45892656	G	T	-0.003885653
+22	45897997	C	T	0.0003935204
+22	45929577	C	T	-0.02532217
+22	45936350	A	G	-0.008001698
+22	45942726	T	G	-0.01415551
+22	45996298	G	A	0.05643525
+22	46009063	G	A	0.006464843
+22	46022070	G	A	0.0224674
+22	46155548	G	C	-0.0324747
+22	46207955	C	T	-0.001354554
+22	46236425	A	G	0.08398423
+22	46275529	T	C	0.0022643
+22	46287720	A	G	-0.02237482
+22	46289699	T	C	0.01872124
+22	46303347	T	C	-0.01283734
+22	46316057	A	G	0.02312579
+22	46337043	G	C	0.01701173
+22	46347519	C	T	0.01574289
+22	46364161	A	G	-0.04466341
+22	46381234	G	A	0.04730559
+22	46396925	G	A	0.001783944
+22	46403715	A	G	-0.02132589
+22	46406782	A	C	0.08439466
+22	46445002	G	C	-0.07613496
+22	46458123	G	T	0.03328073
+22	46482948	C	T	0.04241879
+22	46486508	C	T	-0.00968439
+22	46493852	T	C	-0.00675858
+22	46499120	C	G	-0.009873118
+22	46502870	T	C	-0.0179214
+22	46561713	G	A	0.02604703
+22	46586110	A	G	-0.001256735
+22	46592168	C	T	0.01417055
+22	46614274	G	C	-0.05854014
+22	46627603	T	C	0.08004024
+22	46760086	T	C	0.003229515
+22	46782382	T	C	-0.02470821
+22	46807234	C	T	0.002324176
+22	46837114	G	A	0.000944073
+22	46888399	T	C	0.009911095
+22	46907779	G	A	0.00653144
+22	46909355	T	G	-0.004780494
+22	46914277	A	C	0.009689535
+22	46943687	G	A	-0.0130366
+22	46985917	A	G	0.01893397
+22	47021226	G	A	-0.01322949
+22	47095235	A	C	-0.1156013
+22	47109621	C	T	0.0004322858
+22	47125474	G	A	-0.01746025
+22	47147117	T	C	-0.02418349
+22	47156703	C	T	0.0262897
+22	47245836	A	G	0.001880575
+22	47271747	C	T	0.001055264
+22	47301822	C	T	0.003032158
+22	47345487	T	C	-0.002945945
+22	47372368	T	C	0.02067644
+22	47380606	C	T	0.04041426
+22	47437808	C	T	0.001683027
+22	47450911	A	G	0.01624479
+22	47511864	A	C	-0.004226735
+22	47519476	T	C	-0.003954111
+22	47529458	A	G	-0.0003602848
+22	47531320	T	C	-0.006899703
+22	47548321	T	C	0.004925401
+22	47568291	C	T	0.007726693
+22	47571203	A	G	-0.009744751
+22	47574009	C	T	-0.00532701
+22	47642100	T	C	0.006976251
+22	47657635	T	C	0.001798943
+22	47683805	C	T	-0.03475544
+22	47720973	T	C	-0.007868172
+22	47821952	G	A	-0.000885428
+22	47893053	A	G	-0.02449056
+22	47935365	C	T	-0.001599879
+22	47961708	G	T	-0.003593525
+22	47986332	T	C	-0.003976592
+22	48154645	C	T	0.007608639
+22	48165452	C	CT	0.002039503
+22	48207318	T	C	-0.009725168
+22	48213904	G	C	-0.01220367
+22	48215904	A	G	-2.488244E-05
+22	48220460	T	C	-0.002702163
+22	48230941	C	A	-0.001129522
+22	48271961	A	G	-0.005053446
+22	48284025	T	C	-0.003344182
+22	48297953	C	T	-0.01046958
+22	48362290	G	A	-0.02367254
+22	48362914	C	A	-0.003167719
+22	48387670	A	G	-0.008243989
+22	48415446	C	T	0.002130715
+22	48460730	T	C	0.002682476
+22	48491160	T	C	0.001257794
+22	48519794	C	T	0.003680757
+22	48537775	G	A	0.002134692
+22	48543566	T	C	0.007314089
+22	48593037	C	T	0.009084708
+22	48687509	C	T	-0.0277196
+22	48692033	T	C	-0.02126264
+22	48699617	T	C	0.0005093107
+22	48717568	T	C	-0.0008190281
+22	48811946	C	T	0.007916515
+22	48823357	G	A	0.01464317
+22	48840428	A	C	0.003711229
+22	48851612	T	C	-0.005887765
+22	48874310	T	C	-0.01106607
+22	48968070	C	T	0.01280691
+22	48991385	T	C	-0.01234119
+22	49004050	G	A	0.02290755
+22	49014565	A	G	0.001555565
+22	49086481	T	C	-0.006196369
+22	49107173	T	C	0.01277272
+22	49180915	A	G	0.006346977
+22	49262579	A	G	0.02657134
+22	49270317	C	T	0.001447665
+22	49313196	A	G	-0.007055532
+22	49335230	T	C	-0.006548281
+22	49366123	T	C	0.01136486
+22	49372356	G	C	-0.02420841
+22	49443666	T	C	0.01581736
+22	49496835	G	A	-0.01355414
+22	49524428	A	G	-0.004228482
+22	49530553	G	C	0.008197389
+22	49537845	T	C	0.0111255
+22	49557457	G	A	0.009401926
+22	49562666	C	A	0.01271701
+22	49574509	C	T	0.0004703177
+22	49579141	A	G	0.02448619
+22	49650863	T	C	0.006739571
+22	49662549	T	G	-0.005769464
+22	49665841	T	C	-0.0007037069
+22	49677464	A	G	-0.02177735
+22	49696067	C	T	-0.003309682
+22	49700272	T	G	-0.002541948
+22	49706433	T	C	-0.01719402
+22	49713835	G	A	-0.01370754
+22	49719264	A	C	-0.01067852
+22	49743627	G	A	-0.0005970581
+22	49800265	C	T	0.03098582
+22	49806863	A	G	0.003940447
+22	49830851	C	T	-0.002742706
+22	49834624	G	A	-0.002820163
+22	49843235	G	C	-0.0004458281
+22	49847501	T	G	0.002235016
+22	49861033	C	T	0.01721243
+22	49881321	A	G	-0.00051278
+22	49908804	G	A	-0.009455892
+22	49911222	G	T	-0.01389666
+22	49925268	A	G	0.01679984
+22	49927332	T	C	0.00039298
+22	50109212	T	C	0.01610819
+22	50118149	G	C	0.007024666
+22	50184484	G	T	0.01222581
+22	50219447	T	C	0.05091891
+22	50278568	G	A	-0.02340672
+22	50319170	G	A	0.01669806
+22	50350971	A	G	0.0264016
+22	50356693	C	T	0.003851499
+22	50435480	G	A	0.0166363
+22	50439626	A	G	-0.002722154
+22	50466542	C	T	-0.002560094
+22	50470516	T	C	-0.01621986
+22	50491150	G	A	0.01828674
+22	50515270	C	T	0.01439904
+22	50529850	C	T	0.02054628
+22	50570755	C	G	0.007077514
+22	50582626	G	A	-0.003588854
+22	50672154	A	G	0.007660848
+22	50722134	C	T	-0.01747164
+22	50722408	C	T	-0.001063465
+22	50728062	C	T	0.02159223
+22	50750481	T	C	0.01877272
+22	50758873	T	C	0.004001731
+22	50835040	A	G	-0.006374259
+22	50859049	C	T	0.0003480749
+22	50885775	G	A	-0.01358311
+22	50926768	T	C	0.001798498
+22	50928026	A	G	0.004775504
+22	50971266	C	T	0.02160893
+22	50989197	T	C	-0.01328884
+22	50989326	G	A	0.01037054
+22	50999681	G	A	-0.01226224
+22	51046163	T	C	-0.02754002
+22	51117580	C	T	0.03573542
+22	51171497	A	G	-0.01951606
+22	51174939	T	C	-0.006178519
diff --git a/tests/data/combine/scorefile_dominant_and_recessive.txt b/tests/data/combine/scorefile_dominant_and_recessive.txt
new file mode 100644
index 0000000..bbf23f0
--- /dev/null
+++ b/tests/data/combine/scorefile_dominant_and_recessive.txt
@@ -0,0 +1,838 @@
+#pgs_name=PGS001229_22_DominantRecessiveExample
+#genome_build=GRCh37
+chr_name	chr_position	effect_allele	other_allele	effect_weight	is_dominant	is_recessive
+22	17080378	G	A	0.01045457	TRUE	FALSE
+22	17300230	A	G	0.0001411475	FALSE	TRUE
+22	17318864	A	C	0.008166266	FALSE	FALSE
+22	17327595	T	C	0.007791641	FALSE	FALSE
+22	17409813	A	G	0.0003108784	FALSE	FALSE
+22	17450952	G	A	-0.03033983	FALSE	FALSE
+22	17492533	G	A	0.00388999	FALSE	FALSE
+22	17542810	C	T	0.00803629	FALSE	FALSE
+22	17565013	G	A	0.02135621	FALSE	FALSE
+22	17589209	T	C	0.003026491	FALSE	FALSE
+22	17600977	A	G	0.01581277	FALSE	FALSE
+22	17625915	A	G	-0.1172964	FALSE	FALSE
+22	17630486	A	C	0.01012909	FALSE	FALSE
+22	17633785	C	T	0.0023255	FALSE	FALSE
+22	17643689	A	G	0.003361814	FALSE	FALSE
+22	17669306	C	T	0.0214506	FALSE	FALSE
+22	17677699	T	C	-0.0007031384	FALSE	FALSE
+22	17680519	C	A	0.001079236	FALSE	FALSE
+22	17701234	G	A	0.004477145	FALSE	FALSE
+22	17703119	A	T	0.0007771872	FALSE	FALSE
+22	17718699	C	A	-0.01320632	FALSE	FALSE
+22	17721595	C	T	0.009480363	FALSE	FALSE
+22	17727648	T	C	0.007811685	FALSE	FALSE
+22	17738177	G	A	-0.004719812	FALSE	FALSE
+22	17749096	A	G	-0.005244795	FALSE	FALSE
+22	17770181	G	T	-0.03101703	FALSE	FALSE
+22	17793969	G	A	0.01774444	FALSE	FALSE
+22	17815696	G	C	-0.00551609	FALSE	FALSE
+22	17827684	G	A	-0.005944752	FALSE	FALSE
+22	17831813	T	C	0.01061587	FALSE	FALSE
+22	17844929	T	G	0.001717643	FALSE	FALSE
+22	17850661	T	C	-0.02805489	FALSE	FALSE
+22	17887534	A	G	0.0007723542	FALSE	FALSE
+22	17887725	A	G	0.007472703	FALSE	FALSE
+22	17958221	C	A	-0.02098647	FALSE	FALSE
+22	18036253	G	A	-0.01772981	FALSE	FALSE
+22	18038786	A	G	-0.002119071	FALSE	FALSE
+22	18262301	A	T	-0.005065485	FALSE	FALSE
+22	18289204	A	G	0.005306345	FALSE	FALSE
+22	18295575	C	T	0.02300129	FALSE	FALSE
+22	18296238	G	A	-0.005665446	FALSE	FALSE
+22	18319179	T	C	0.03440642	FALSE	FALSE
+22	18393534	A	C	0.01764269	FALSE	FALSE
+22	18439958	T	C	-0.002261707	FALSE	FALSE
+22	18483388	G	A	0.03318724	FALSE	FALSE
+22	18488883	C	G	-0.0191918	FALSE	FALSE
+22	18489048	C	A	0.01233198	FALSE	FALSE
+22	18495470	A	G	-0.005804926	FALSE	FALSE
+22	18537145	G	A	-0.004930116	FALSE	FALSE
+22	18571008	A	G	-8.844726E-05	FALSE	FALSE
+22	18584433	C	T	-0.001169893	FALSE	FALSE
+22	18631365	T	C	-0.001551714	FALSE	FALSE
+22	18650682	T	C	-0.01313784	FALSE	FALSE
+22	18890037	A	G	0.05968921	FALSE	FALSE
+22	18891398	G	A	0.006891943	FALSE	FALSE
+22	18892575	A	G	-0.00224447	FALSE	FALSE
+22	18915963	A	G	0.003719756	FALSE	FALSE
+22	18959581	T	C	0.006464581	FALSE	FALSE
+22	18963340	A	G	-0.01397565	FALSE	FALSE
+22	18970915	T	C	-0.001507131	FALSE	FALSE
+22	19024651	T	C	-0.00350575	FALSE	FALSE
+22	19121872	A	G	0.01644046	FALSE	FALSE
+22	19135603	A	G	-0.02970077	FALSE	FALSE
+22	19190143	T	C	0.003268027	FALSE	FALSE
+22	19263698	T	C	0.02057255	FALSE	FALSE
+22	19292446	G	T	0.01153989	FALSE	FALSE
+22	19371052	T	C	0.01055134	FALSE	FALSE
+22	19420109	C	T	-0.008628228	FALSE	FALSE
+22	19451186	A	C	0.02141029	FALSE	FALSE
+22	19518079	C	T	0.005372247	FALSE	FALSE
+22	19581331	T	C	0.01686942	FALSE	FALSE
+22	19593854	C	A	0.0006544249	FALSE	FALSE
+22	19606703	G	A	0.02070121	FALSE	FALSE
+22	19649005	A	G	0.002868601	FALSE	FALSE
+22	19735854	C	T	0.006262962	FALSE	FALSE
+22	19738355	T	C	4.97384E-05	FALSE	FALSE
+22	19770886	A	G	-0.01013929	FALSE	FALSE
+22	19781823	T	C	0.02481609	FALSE	FALSE
+22	19873357	T	C	0.0116302	FALSE	FALSE
+22	19907099	A	C	-0.0267645	FALSE	FALSE
+22	19968597	T	C	-0.02203945	FALSE	FALSE
+22	20046344	G	A	-0.009801428	FALSE	FALSE
+22	20084821	C	T	-0.02232886	FALSE	FALSE
+22	20185457	A	G	0.006892171	FALSE	FALSE
+22	20189077	T	C	0.01738215	FALSE	FALSE
+22	20219648	A	G	0.009307625	FALSE	FALSE
+22	20248391	A	G	-0.005405845	FALSE	FALSE
+22	20267213	A	G	0.006713242	FALSE	FALSE
+22	20286099	G	T	0.01574758	FALSE	FALSE
+22	20749042	G	A	0.006603339	FALSE	FALSE
+22	20754039	A	G	-0.01181141	FALSE	FALSE
+22	20775167	T	C	0.01160113	FALSE	FALSE
+22	20780296	A	G	0.06735311	FALSE	FALSE
+22	20789074	C	T	0.02844307	FALSE	FALSE
+22	20791438	A	C	0.0473474	FALSE	FALSE
+22	20793914	C	T	0.007009781	FALSE	FALSE
+22	20839810	T	G	0.003947346	FALSE	FALSE
+22	20860931	T	C	0.0005613511	FALSE	FALSE
+22	20979980	G	A	0.003231665	FALSE	FALSE
+22	20991771	G	A	0.004226765	FALSE	FALSE
+22	21075537	C	A	-0.002096453	FALSE	FALSE
+22	21154393	G	T	-0.004297086	FALSE	FALSE
+22	21323357	C	T	-0.006041745	FALSE	FALSE
+22	21331918	G	C	-0.002280912	FALSE	FALSE
+22	21334924	C	G	-0.02031369	FALSE	FALSE
+22	21356824	A	G	0.01476577	FALSE	FALSE
+22	21386019	A	G	0.01435557	FALSE	FALSE
+22	21449028	G	A	-0.01537701	FALSE	FALSE
+22	21463515	A	G	-0.01335614	FALSE	FALSE
+22	21982892	T	C	-0.06373335	FALSE	FALSE
+22	22001704	T	G	0.02809584	FALSE	FALSE
+22	22062480	T	C	0.0529113	FALSE	FALSE
+22	22080735	G	A	-0.0139426	FALSE	FALSE
+22	22151939	C	A	-0.008287849	FALSE	FALSE
+22	22163425	G	A	0.05518983	FALSE	FALSE
+22	22307519	C	G	-0.003486191	FALSE	FALSE
+22	22351283	G	A	-0.0007483763	FALSE	FALSE
+22	22394291	AG	A	0.004320583	FALSE	FALSE
+22	22395754	T	C	0.002587971	FALSE	FALSE
+22	22424302	A	C	0.0011408	FALSE	FALSE
+22	22473905	C	A	0.01226009	FALSE	FALSE
+22	22550450	G	C	0.01773244	FALSE	FALSE
+22	22561610	C	T	-0.006207024	FALSE	FALSE
+22	22581369	G	A	-0.006272413	FALSE	FALSE
+22	22584678	A	G	-0.00217647	FALSE	FALSE
+22	22711786	T	C	0.007779875	FALSE	FALSE
+22	22726372	T	C	0.00349632	FALSE	FALSE
+22	22762771	C	T	0.01252501	FALSE	FALSE
+22	22769923	G	A	-0.01103632	FALSE	FALSE
+22	22869742	A	C	-0.002412657	FALSE	FALSE
+22	22871922	A	G	-0.002769974	FALSE	FALSE
+22	22929268	T	C	-0.007035723	FALSE	FALSE
+22	23001481	A	G	0.007524178	FALSE	FALSE
+22	23022520	T	C	0.002175257	FALSE	FALSE
+22	23064982	A	C	-0.01255076	FALSE	FALSE
+22	23249440	A	C	0.02085816	FALSE	FALSE
+22	23268677	A	G	0.01337349	FALSE	FALSE
+22	23279456	C	G	-0.01371401	FALSE	FALSE
+22	23282286	C	T	0.004994329	FALSE	FALSE
+22	23325722	C	T	0.0008506657	FALSE	FALSE
+22	23412058	A	G	-0.009545553	FALSE	FALSE
+22	23627369	G	A	-0.01900175	FALSE	FALSE
+22	23644425	G	A	-0.0009106953	FALSE	FALSE
+22	23649242	G	T	0.001061643	FALSE	FALSE
+22	23794844	G	A	-0.01198736	FALSE	FALSE
+22	23804670	G	T	-0.001119846	FALSE	FALSE
+22	23819697	T	G	-0.01028722	FALSE	FALSE
+22	23873076	T	C	0.009509027	FALSE	FALSE
+22	23892145	T	C	0.0135128	FALSE	FALSE
+22	23925779	C	T	-0.004127647	FALSE	FALSE
+22	23960187	T	C	-0.008475905	FALSE	FALSE
+22	24035970	T	C	-0.001334318	FALSE	FALSE
+22	24086107	G	A	-0.01652957	FALSE	FALSE
+22	24105789	A	G	0.01813091	FALSE	FALSE
+22	24106448	A	G	0.001834095	FALSE	FALSE
+22	24186809	C	T	-0.01426541	FALSE	FALSE
+22	24235360	G	A	0.0003168635	FALSE	FALSE
+22	24255296	T	C	0.01624252	FALSE	FALSE
+22	24300540	T	C	-0.00322576	FALSE	FALSE
+22	24376584	A	G	-0.006223068	FALSE	FALSE
+22	24406778	A	C	0.00304654	FALSE	FALSE
+22	24618331	G	A	-0.0006506681	FALSE	FALSE
+22	24802564	A	G	-0.006695797	FALSE	FALSE
+22	24912232	T	C	-0.01536303	FALSE	FALSE
+22	24943582	A	G	-0.001687764	FALSE	FALSE
+22	24995668	G	A	-0.03537331	FALSE	FALSE
+22	25123505	C	T	-0.0160099	FALSE	FALSE
+22	25145094	T	C	-0.005584047	FALSE	FALSE
+22	25145453	T	C	-0.001388536	FALSE	FALSE
+22	25185823	A	G	-0.009228375	FALSE	FALSE
+22	25265972	A	G	0.01088906	FALSE	FALSE
+22	25309448	A	G	-0.002238693	FALSE	FALSE
+22	25363411	A	G	0.004035775	FALSE	FALSE
+22	25410895	G	A	0.0009720734	FALSE	FALSE
+22	25442369	C	T	0.01660527	FALSE	FALSE
+22	25454658	C	A	0.01200285	FALSE	FALSE
+22	25465065	C	T	0.01320801	FALSE	FALSE
+22	25524916	C	T	0.01147501	FALSE	FALSE
+22	25603008	T	C	-0.01262741	FALSE	FALSE
+22	25619025	G	T	-0.01212511	FALSE	FALSE
+22	25621591	T	C	0.01051851	FALSE	FALSE
+22	25643483	T	G	0.01373474	FALSE	FALSE
+22	25661725	A	G	-0.005936431	FALSE	FALSE
+22	25667883	G	A	0.01547775	FALSE	FALSE
+22	25668730	A	C	0.02616493	FALSE	FALSE
+22	25678577	T	C	0.0304018	FALSE	FALSE
+22	25761309	T	C	-0.001760112	FALSE	FALSE
+22	25761936	T	C	-0.005171998	FALSE	FALSE
+22	25938977	T	C	0.01966116	FALSE	FALSE
+22	25994013	A	G	0.0006268228	FALSE	FALSE
+22	26081873	T	C	0.05232603	FALSE	FALSE
+22	26132612	A	G	-0.006457239	FALSE	FALSE
+22	26133775	T	C	-0.001181527	FALSE	FALSE
+22	26159289	A	G	-0.008399401	FALSE	FALSE
+22	26181767	C	T	0.01044769	FALSE	FALSE
+22	26190915	G	A	0.004287533	FALSE	FALSE
+22	26218164	G	A	-0.002803502	FALSE	FALSE
+22	26231312	C	G	0.006105629	FALSE	FALSE
+22	26237826	C	T	0.004981479	FALSE	FALSE
+22	26239850	A	C	0.004144037	FALSE	FALSE
+22	26273893	C	G	0.005616213	FALSE	FALSE
+22	26278128	G	T	-0.003965338	FALSE	FALSE
+22	26280462	T	C	-0.0008324497	FALSE	FALSE
+22	26290588	T	C	-0.0130732	FALSE	FALSE
+22	26292659	G	A	4.294309E-05	FALSE	FALSE
+22	26343593	G	A	0.007813758	FALSE	FALSE
+22	26369358	T	C	-0.00483665	FALSE	FALSE
+22	26390964	A	G	-0.007849451	FALSE	FALSE
+22	26415475	T	C	-0.001219281	FALSE	FALSE
+22	26456367	G	A	-0.01285326	FALSE	FALSE
+22	26460519	T	C	-0.008695338	FALSE	FALSE
+22	26528054	A	G	0.01973023	FALSE	FALSE
+22	26617260	T	A	-0.01384025	FALSE	FALSE
+22	26638906	G	T	0.01229772	FALSE	FALSE
+22	26735648	A	G	0.0007879673	FALSE	FALSE
+22	26782251	G	A	0.0005096459	FALSE	FALSE
+22	26812632	C	T	-0.01850814	FALSE	FALSE
+22	26939781	C	T	-0.0009222796	FALSE	FALSE
+22	26960648	A	C	-0.005679255	FALSE	FALSE
+22	27038865	T	G	-0.0001487706	FALSE	FALSE
+22	27042828	A	G	0.02957737	FALSE	FALSE
+22	27161060	A	G	0.002844558	FALSE	FALSE
+22	27191643	T	C	0.008953731	FALSE	FALSE
+22	27216426	G	A	0.00912099	FALSE	FALSE
+22	27217018	A	G	0.01510616	FALSE	FALSE
+22	27240025	T	G	-0.0297174	FALSE	FALSE
+22	27242642	G	A	-0.009822927	FALSE	FALSE
+22	27246070	C	T	-0.001554199	FALSE	FALSE
+22	27252454	C	T	-0.006560251	FALSE	FALSE
+22	27264880	G	T	-0.01323094	FALSE	FALSE
+22	27337886	A	G	-0.009600014	FALSE	FALSE
+22	27339284	T	C	-0.009944488	FALSE	FALSE
+22	27353810	T	C	-0.002171555	FALSE	FALSE
+22	27370273	T	C	-0.009798478	FALSE	FALSE
+22	27378884	A	G	0.05145072	FALSE	FALSE
+22	27398749	C	T	0.001012263	FALSE	FALSE
+22	27403571	C	T	-0.01745865	FALSE	FALSE
+22	27405012	T	C	-0.005425419	FALSE	FALSE
+22	27415255	C	T	-0.01499362	FALSE	FALSE
+22	27426628	G	C	0.0228946	FALSE	FALSE
+22	27430724	A	G	-0.007068064	FALSE	FALSE
+22	27435577	C	T	-0.008632412	FALSE	FALSE
+22	27487580	G	A	0.003691502	FALSE	FALSE
+22	27498426	A	G	-0.006801544	FALSE	FALSE
+22	27526095	G	A	-0.0008086267	FALSE	FALSE
+22	27563274	C	A	0.0136965	FALSE	FALSE
+22	27584680	A	G	-0.002139188	FALSE	FALSE
+22	27628151	C	G	0.02130389	FALSE	FALSE
+22	27652290	T	G	0.004815735	FALSE	FALSE
+22	27660675	A	G	0.004899654	FALSE	FALSE
+22	27674832	G	T	0.0001248065	FALSE	FALSE
+22	27718775	A	G	0.02292384	FALSE	FALSE
+22	27729742	G	A	0.004951261	FALSE	FALSE
+22	27762155	C	T	0.00485666	FALSE	FALSE
+22	27781736	A	C	-0.008336242	FALSE	FALSE
+22	27829565	G	A	0.00285409	FALSE	FALSE
+22	27832985	G	C	-0.01668955	FALSE	FALSE
+22	27836311	G	A	-0.00775625	FALSE	FALSE
+22	27839704	T	C	-0.02492106	FALSE	FALSE
+22	27864471	A	C	0.00218995	FALSE	FALSE
+22	27873024	G	A	0.002721729	FALSE	FALSE
+22	27883265	G	A	0.02961735	FALSE	FALSE
+22	27890684	A	G	-0.008057355	FALSE	FALSE
+22	27927298	T	C	0.02054268	FALSE	FALSE
+22	27934290	G	A	0.004751755	FALSE	FALSE
+22	27951176	A	G	-0.0004329547	FALSE	FALSE
+22	27974819	C	A	0.01439093	FALSE	FALSE
+22	27975451	G	A	-0.03648208	FALSE	FALSE
+22	28007741	C	T	-0.01635917	FALSE	FALSE
+22	28016883	C	A	0.008564085	FALSE	FALSE
+22	28046561	T	C	0.01535905	FALSE	FALSE
+22	28060034	A	G	0.03097228	FALSE	FALSE
+22	28076058	C	T	0.02848654	FALSE	FALSE
+22	28094845	G	A	-0.02659077	FALSE	FALSE
+22	28130130	C	T	-0.01640387	FALSE	FALSE
+22	28136977	A	C	-0.003962775	FALSE	FALSE
+22	28150109	G	A	0.0006071392	FALSE	FALSE
+22	28150815	A	G	0.01604724	FALSE	FALSE
+22	28151825	A	G	-0.005390282	FALSE	FALSE
+22	28155404	T	C	0.005030388	FALSE	FALSE
+22	28172577	G	T	0.005704168	FALSE	FALSE
+22	28185452	G	T	-0.006896853	FALSE	FALSE
+22	28200176	G	A	-0.006474674	FALSE	FALSE
+22	28206912	C	A	-0.006175542	FALSE	FALSE
+22	28270372	G	T	-0.0006768204	FALSE	FALSE
+22	28412908	G	T	0.01763639	FALSE	FALSE
+22	28501414	T	C	-0.2304747	FALSE	FALSE
+22	29106733	C	T	-0.01074749	FALSE	FALSE
+22	29318724	T	C	0.001743333	FALSE	FALSE
+22	29378610	C	T	0.0006690876	FALSE	FALSE
+22	29478760	C	T	-0.03029428	FALSE	FALSE
+22	29533572	G	C	-0.01269604	FALSE	FALSE
+22	29626515	A	G	-0.0117113	FALSE	FALSE
+22	29630337	A	G	0.02658049	FALSE	FALSE
+22	29669648	C	G	-0.008550535	FALSE	FALSE
+22	29692497	T	G	0.001234896	FALSE	FALSE
+22	29837537	C	T	0.01321112	FALSE	FALSE
+22	29961986	T	G	0.001878853	FALSE	FALSE
+22	30151687	C	T	0.003418302	FALSE	FALSE
+22	30163526	G	A	0.01576261	FALSE	FALSE
+22	30494371	A	G	0.007959801	FALSE	FALSE
+22	30592487	G	C	-0.1047403	FALSE	FALSE
+22	30621613	A	C	-0.01382104	FALSE	FALSE
+22	30658082	C	T	-0.03794014	FALSE	FALSE
+22	30688659	T	C	0.0225714	FALSE	FALSE
+22	30762140	A	G	0.02079806	FALSE	FALSE
+22	30793137	A	G	-0.004609306	FALSE	FALSE
+22	30901592	C	T	-0.00833404	FALSE	FALSE
+22	30927975	T	C	0.003226189	FALSE	FALSE
+22	30953295	T	C	-0.00768579	FALSE	FALSE
+22	30992651	G	A	-0.025658	FALSE	FALSE
+22	31018975	C	T	0.04241226	FALSE	FALSE
+22	31032920	G	A	-0.02311985	FALSE	FALSE
+22	31063804	G	GT	-0.0002081808	FALSE	FALSE
+22	31114086	G	T	0.02825476	FALSE	FALSE
+22	31139653	A	G	2.640129E-06	FALSE	FALSE
+22	31214382	G	A	0.01137657	FALSE	FALSE
+22	31216506	C	T	0.005531311	FALSE	FALSE
+22	31272930	T	C	-0.001056118	FALSE	FALSE
+22	31333631	C	T	-0.01235089	FALSE	FALSE
+22	31378447	A	G	0.01020507	FALSE	FALSE
+22	31442308	A	G	-0.002479126	FALSE	FALSE
+22	31477361	C	G	-0.01263667	FALSE	FALSE
+22	31514348	G	A	0.00580324	FALSE	FALSE
+22	31521404	A	G	0.01097391	FALSE	FALSE
+22	31659495	C	T	0.02663412	FALSE	FALSE
+22	31884405	C	T	-0.0003950834	FALSE	FALSE
+22	32200849	T	C	0.01585735	FALSE	FALSE
+22	32341684	T	C	-0.02960328	FALSE	FALSE
+22	32559835	G	A	-0.02170436	FALSE	FALSE
+22	32569263	C	T	-0.001296006	FALSE	FALSE
+22	32624139	C	T	0.005619574	FALSE	FALSE
+22	32702816	A	G	-0.01534023	FALSE	FALSE
+22	32756652	G	A	0.02512177	FALSE	FALSE
+22	32831540	T	C	0.001868495	FALSE	FALSE
+22	32832874	T	C	6.028815E-05	FALSE	FALSE
+22	32853660	G	A	0.0138221	FALSE	FALSE
+22	32854391	C	A	0.0001960825	FALSE	FALSE
+22	32875190	A	G	-0.006426637	FALSE	FALSE
+22	32934713	C	CT	-0.009057754	FALSE	FALSE
+22	32952012	A	C	-0.00380248	FALSE	FALSE
+22	32954443	G	A	0.002210369	FALSE	FALSE
+22	32993032	C	T	-0.002429979	FALSE	FALSE
+22	32997766	T	C	-0.008424246	FALSE	FALSE
+22	33045573	T	C	-0.03107145	FALSE	FALSE
+22	33046110	G	C	-0.06954732	FALSE	FALSE
+22	33048039	T	C	0.01138346	FALSE	FALSE
+22	33056341	C	T	-0.06477198	FALSE	FALSE
+22	33108536	T	C	-0.03426392	FALSE	FALSE
+22	33108981	T	C	-0.07404035	FALSE	FALSE
+22	33116435	T	C	0.06542471	FALSE	FALSE
+22	33143528	G	A	0.02195059	FALSE	FALSE
+22	33146363	A	G	0.000810539	FALSE	FALSE
+22	33259625	C	T	0.02309793	FALSE	FALSE
+22	33336039	T	G	-0.02554387	FALSE	FALSE
+22	33408519	T	C	-0.0075563	FALSE	FALSE
+22	33660345	C	G	0.002190743	FALSE	FALSE
+22	33804893	C	T	0.006680774	FALSE	FALSE
+22	33844303	C	T	0.008923314	FALSE	FALSE
+22	33846914	T	C	0.006295378	FALSE	FALSE
+22	33898906	A	C	1.958759E-05	FALSE	FALSE
+22	34022284	A	G	-0.00257933	FALSE	FALSE
+22	34137784	G	A	0.004460828	FALSE	FALSE
+22	34208570	T	C	-0.003365869	FALSE	FALSE
+22	34217757	T	C	0.009289431	FALSE	FALSE
+22	34256923	A	C	0.01439384	FALSE	FALSE
+22	34265402	G	A	-0.0163661	FALSE	FALSE
+22	34284173	G	A	-0.02315559	FALSE	FALSE
+22	34296093	C	A	-0.004688326	FALSE	FALSE
+22	34378012	A	G	0.002276664	FALSE	FALSE
+22	34436795	C	T	0.0001337033	FALSE	FALSE
+22	34488452	A	G	-0.000428831	FALSE	FALSE
+22	34501541	A	G	0.002763614	FALSE	FALSE
+22	34514810	C	A	0.003976601	FALSE	FALSE
+22	34526428	C	T	0.01088864	FALSE	FALSE
+22	34583078	A	G	0.001802495	FALSE	FALSE
+22	34620754	T	C	0.01466546	FALSE	FALSE
+22	34691035	A	G	-0.0002082615	FALSE	FALSE
+22	34758540	T	C	0.005165532	FALSE	FALSE
+22	34851377	A	C	0.0137118	FALSE	FALSE
+22	35371707	T	C	-0.0004985554	FALSE	FALSE
+22	35382268	A	C	-0.004931336	FALSE	FALSE
+22	35419122	C	T	-0.01077953	FALSE	FALSE
+22	35478529	A	G	0.0001760523	FALSE	FALSE
+22	35481493	T	C	0.01056439	FALSE	FALSE
+22	35526281	G	A	-0.002766891	FALSE	FALSE
+22	35603836	A	G	-0.0001783939	FALSE	FALSE
+22	35660875	T	G	0.03988231	FALSE	FALSE
+22	35745196	G	T	0.0001750545	FALSE	FALSE
+22	35750980	A	G	-0.007651136	FALSE	FALSE
+22	35783413	G	A	0.001649791	FALSE	FALSE
+22	35918270	C	T	0.006918713	FALSE	FALSE
+22	35959242	A	G	0.01697538	FALSE	FALSE
+22	35962060	G	A	0.005181476	FALSE	FALSE
+22	35964158	G	C	0.002769931	FALSE	FALSE
+22	35984385	A	G	-0.01280623	FALSE	FALSE
+22	36001258	C	T	0.01342405	FALSE	FALSE
+22	36072262	T	C	0.00489549	FALSE	FALSE
+22	36180535	G	A	-0.03250252	FALSE	FALSE
+22	36517307	C	T	0.01366076	FALSE	FALSE
+22	36519596	A	C	-0.00349956	FALSE	FALSE
+22	36532058	A	G	-0.01214487	FALSE	FALSE
+22	36543489	C	G	0.007838149	FALSE	FALSE
+22	36600841	G	A	0.02644389	FALSE	FALSE
+22	36629633	C	A	-0.006871468	FALSE	FALSE
+22	36635967	G	A	-0.02634742	FALSE	FALSE
+22	36655735	A	G	-0.005385142	FALSE	FALSE
+22	36661646	A	G	-0.01560741	FALSE	FALSE
+22	36684354	C	T	-0.005170111	FALSE	FALSE
+22	36705622	A	G	0.01713234	FALSE	FALSE
+22	36708049	C	CTCCTGTGA	-0.05187051	FALSE	FALSE
+22	36751101	A	C	-0.0244065	FALSE	FALSE
+22	36764788	G	A	0.02784116	FALSE	FALSE
+22	36897427	C	T	0.02603792	FALSE	FALSE
+22	36900806	G	A	0.007366207	FALSE	FALSE
+22	36923144	T	C	-0.001875563	FALSE	FALSE
+22	36924714	G	A	-0.003632594	FALSE	FALSE
+22	36946643	T	G	0.01333137	FALSE	FALSE
+22	36954939	T	C	0.01105894	FALSE	FALSE
+22	36998907	T	C	-0.0006084687	FALSE	FALSE
+22	37001495	G	T	-0.01224147	FALSE	FALSE
+22	37013167	G	C	0.01866849	FALSE	FALSE
+22	37077364	C	T	0.007294257	FALSE	FALSE
+22	37080738	C	G	-0.004873355	FALSE	FALSE
+22	37101890	C	T	0.03991764	FALSE	FALSE
+22	37118535	A	G	-0.001713909	FALSE	FALSE
+22	37184521	G	A	0.006515894	FALSE	FALSE
+22	37206341	G	T	0.0002566936	FALSE	FALSE
+22	37256262	A	G	0.001152626	FALSE	FALSE
+22	37258503	C	T	-0.009761102	FALSE	FALSE
+22	37323988	T	C	-0.0073182	FALSE	FALSE
+22	37329545	G	A	0.005775806	FALSE	FALSE
+22	37337409	T	C	-0.02534399	FALSE	FALSE
+22	37343000	A	C	-0.0004011777	FALSE	FALSE
+22	37398195	T	C	-0.01001198	FALSE	FALSE
+22	37401532	A	G	-0.003244795	FALSE	FALSE
+22	37407109	C	G	0.04335972	FALSE	FALSE
+22	37477732	T	C	0.0003669548	FALSE	FALSE
+22	37507019	A	G	-0.0009259451	FALSE	FALSE
+22	37513316	A	G	0.001153887	FALSE	FALSE
+22	37532441	A	G	0.01802306	FALSE	FALSE
+22	37571497	G	A	-0.005785311	FALSE	FALSE
+22	37581383	T	C	0.03172492	FALSE	FALSE
+22	37621269	C	A	0.004460405	FALSE	FALSE
+22	37644621	T	C	-0.008386907	FALSE	FALSE
+22	37671896	A	G	0.02303688	FALSE	FALSE
+22	37679763	G	A	-0.002658396	FALSE	FALSE
+22	37720268	G	A	0.02120184	FALSE	FALSE
+22	37753256	C	T	0.008984539	FALSE	FALSE
+22	37757099	G	A	-0.01560347	FALSE	FALSE
+22	37780522	C	G	-0.01496708	FALSE	FALSE
+22	37800175	T	C	-0.005510833	FALSE	FALSE
+22	37846448	G	A	0.01152963	FALSE	FALSE
+22	37896749	C	T	0.005447068	FALSE	FALSE
+22	37908435	C	T	0.001909131	FALSE	FALSE
+22	37977481	T	C	0.01465308	FALSE	FALSE
+22	37992699	G	A	0.0008339179	FALSE	FALSE
+22	38032762	G	GA	0.01693041	FALSE	FALSE
+22	38054262	C	A	0.04354146	FALSE	FALSE
+22	38083101	C	T	-0.02092117	FALSE	FALSE
+22	38119213	A	G	0.03948165	FALSE	FALSE
+22	38122122	C	T	0.04377277	FALSE	FALSE
+22	38204089	T	C	0.02977743	FALSE	FALSE
+22	38435786	T	G	-0.007684278	FALSE	FALSE
+22	38544298	G	A	0.05090446	FALSE	FALSE
+22	38597378	T	G	-0.01997927	FALSE	FALSE
+22	38606780	G	A	-0.009182016	FALSE	FALSE
+22	38630272	C	T	0.007393137	FALSE	FALSE
+22	38663819	G	A	-0.006392021	FALSE	FALSE
+22	38673234	A	G	-0.01106705	FALSE	FALSE
+22	38685131	C	T	-0.004493352	FALSE	FALSE
+22	38695406	T	C	-0.01155972	FALSE	FALSE
+22	38708506	A	G	0.01701713	FALSE	FALSE
+22	38744184	C	T	-0.02112956	FALSE	FALSE
+22	38819613	A	G	-0.005625806	FALSE	FALSE
+22	38877461	G	T	0.001108728	FALSE	FALSE
+22	38918894	G	T	-0.008094286	FALSE	FALSE
+22	38928269	G	T	-0.02114917	FALSE	FALSE
+22	39027286	C	CAG	0.003840735	FALSE	FALSE
+22	39067524	G	A	0.01200232	FALSE	FALSE
+22	39159201	C	T	0.003096214	FALSE	FALSE
+22	39178701	G	A	0.002148449	FALSE	FALSE
+22	39260032	T	C	0.03574634	FALSE	FALSE
+22	39268785	T	G	0.009377414	FALSE	FALSE
+22	39281774	G	T	0.03816951	FALSE	FALSE
+22	39300265	C	T	0.03540156	FALSE	FALSE
+22	39332623	T	C	-0.004449842	FALSE	FALSE
+22	39415780	G	A	0.01479946	FALSE	FALSE
+22	39448465	A	G	0.003065974	FALSE	FALSE
+22	39480697	G	A	-0.04005617	FALSE	FALSE
+22	39487665	G	A	-0.0001218988	FALSE	FALSE
+22	39493294	C	T	-0.03115929	FALSE	FALSE
+22	39510995	G	A	-0.02069106	FALSE	FALSE
+22	39542292	A	G	0.009653575	FALSE	FALSE
+22	39543000	T	C	-0.004069841	FALSE	FALSE
+22	39573724	A	C	0.02683694	FALSE	FALSE
+22	39575692	A	C	0.01451305	FALSE	FALSE
+22	39581277	A	C	0.01766406	FALSE	FALSE
+22	39626572	A	G	-0.02901981	FALSE	FALSE
+22	39658626	C	T	0.004177065	FALSE	FALSE
+22	39665395	G	A	0.01264611	FALSE	FALSE
+22	39687484	G	A	0.005418141	FALSE	FALSE
+22	39708279	A	G	-0.04281532	FALSE	FALSE
+22	39708357	T	C	0.008605574	FALSE	FALSE
+22	39793066	G	T	0.03658209	FALSE	FALSE
+22	39798127	G	A	0.002302129	FALSE	FALSE
+22	39843409	T	C	0.01065699	FALSE	FALSE
+22	39865475	G	A	0.001588501	FALSE	FALSE
+22	39932516	A	G	-0.01179841	FALSE	FALSE
+22	39963426	G	A	-0.01503908	FALSE	FALSE
+22	40023636	C	T	0.006443146	FALSE	FALSE
+22	40046176	C	T	-0.0007416552	FALSE	FALSE
+22	40067818	T	C	0.00455936	FALSE	FALSE
+22	40092864	G	A	0.02400297	FALSE	FALSE
+22	40127293	T	C	-0.0008870038	FALSE	FALSE
+22	40358148	T	C	-0.01079902	FALSE	FALSE
+22	40420786	G	C	-0.008092115	FALSE	FALSE
+22	40454069	G	T	0.00789888	FALSE	FALSE
+22	40541981	G	A	0.0174264	FALSE	FALSE
+22	40652873	G	A	0.005853057	FALSE	FALSE
+22	40676672	G	T	-0.001894274	FALSE	FALSE
+22	40729614	G	A	0.0195994	FALSE	FALSE
+22	40820151	C	T	-0.01628066	FALSE	FALSE
+22	40986372	G	C	-0.01983507	FALSE	FALSE
+22	41494925	A	G	-0.02918069	FALSE	FALSE
+22	41646738	G	A	0.0003521847	FALSE	FALSE
+22	41680898	T	C	0.01402732	FALSE	FALSE
+22	41704872	T	C	6.681484E-05	FALSE	FALSE
+22	41791536	C	T	-5.572333E-05	FALSE	FALSE
+22	41895409	A	G	-0.04407217	FALSE	FALSE
+22	41929175	G	T	-0.03186844	FALSE	FALSE
+22	42089623	C	T	0.00532234	FALSE	FALSE
+22	42095658	G	T	0.03846131	FALSE	FALSE
+22	42210985	C	T	-0.00313971	FALSE	FALSE
+22	42279653	G	A	-0.006596336	FALSE	FALSE
+22	42341308	G	A	-0.0006862491	FALSE	FALSE
+22	42524243	C	CT	-0.01181191	FALSE	FALSE
+22	42672124	G	A	-0.005278171	FALSE	FALSE
+22	42691238	T	C	-0.01642396	FALSE	FALSE
+22	42813753	C	T	-0.00386775	FALSE	FALSE
+22	42867898	G	A	-0.001352327	FALSE	FALSE
+22	42912097	T	C	-0.0007295657	FALSE	FALSE
+22	42932317	A	G	-0.05768556	FALSE	FALSE
+22	43010817	A	G	0.01722077	FALSE	FALSE
+22	43080028	T	C	-0.0005527551	FALSE	FALSE
+22	43096507	T	C	-0.005556102	FALSE	FALSE
+22	43112475	T	C	-0.01350273	FALSE	FALSE
+22	43114824	G	A	-0.01963192	FALSE	FALSE
+22	43115576	C	T	-0.01880097	FALSE	FALSE
+22	43154299	G	A	-0.001621113	FALSE	FALSE
+22	43159948	T	C	-0.007980584	FALSE	FALSE
+22	43206950	C	A	-0.005783037	FALSE	FALSE
+22	43218397	C	T	-0.003976636	FALSE	FALSE
+22	43283255	C	A	-0.01426668	FALSE	FALSE
+22	43290583	C	T	-0.03955775	FALSE	FALSE
+22	43333156	A	G	-0.03127845	FALSE	FALSE
+22	43426262	G	A	-0.00366804	FALSE	FALSE
+22	43483242	T	C	-0.02540203	FALSE	FALSE
+22	43515108	C	T	-0.01570749	FALSE	FALSE
+22	43529314	C	G	0.01738127	FALSE	FALSE
+22	43551513	G	A	0.02565386	FALSE	FALSE
+22	43558972	A	G	-0.01962819	FALSE	FALSE
+22	43577214	T	C	-0.02270478	FALSE	FALSE
+22	43579049	C	T	-0.001193909	FALSE	FALSE
+22	43610207	G	A	-0.007621661	FALSE	FALSE
+22	43623395	G	C	-0.04852519	FALSE	FALSE
+22	43640512	C	T	-0.005533207	FALSE	FALSE
+22	43649701	C	T	0.07724845	FALSE	FALSE
+22	43661080	T	C	-0.04251741	FALSE	FALSE
+22	43683088	A	G	-0.003582388	FALSE	FALSE
+22	43707996	A	G	-0.02547044	FALSE	FALSE
+22	43711080	C	G	-0.005784446	FALSE	FALSE
+22	43721519	C	A	0.000365885	FALSE	FALSE
+22	43729401	C	T	0.008557013	FALSE	FALSE
+22	43763757	T	G	-0.0178981	FALSE	FALSE
+22	43836198	G	T	0.002427697	FALSE	FALSE
+22	43976396	A	G	-0.01277457	FALSE	FALSE
+22	44031042	C	T	0.003593107	FALSE	FALSE
+22	44193626	C	A	-0.006865434	FALSE	FALSE
+22	44221247	G	A	0.01833991	FALSE	FALSE
+22	44296372	T	C	0.006169212	FALSE	FALSE
+22	44298838	A	G	0.007441756	FALSE	FALSE
+22	44342116	G	A	0.02810328	FALSE	FALSE
+22	44368122	G	A	0.0129968	FALSE	FALSE
+22	44379838	G	A	0.001648422	FALSE	FALSE
+22	44380033	C	T	-0.002136788	FALSE	FALSE
+22	44395451	C	T	-0.006698507	FALSE	FALSE
+22	44419871	C	T	0.0181613	FALSE	FALSE
+22	44424108	T	C	0.01036733	FALSE	FALSE
+22	44467899	C	T	-0.002592364	FALSE	FALSE
+22	44498134	T	C	0.007281423	FALSE	FALSE
+22	44522312	C	T	-0.0002636447	FALSE	FALSE
+22	44526130	G	A	-0.00388298	FALSE	FALSE
+22	44530286	A	G	0.02528159	FALSE	FALSE
+22	44530420	C	T	-0.01233654	FALSE	FALSE
+22	44548944	G	A	-0.003947209	FALSE	FALSE
+22	44551755	G	A	0.01262458	FALSE	FALSE
+22	44566434	A	G	-0.004290306	FALSE	FALSE
+22	44581046	T	C	-0.0147995	FALSE	FALSE
+22	44643161	C	T	0.01439493	FALSE	FALSE
+22	44677081	C	T	-0.01030513	FALSE	FALSE
+22	44681612	G	A	-0.001269762	FALSE	FALSE
+22	44695088	T	C	0.006324859	FALSE	FALSE
+22	44707716	G	T	0.002288939	FALSE	FALSE
+22	44725343	G	A	0.003534678	FALSE	FALSE
+22	44738406	G	A	0.02320049	FALSE	FALSE
+22	44746729	A	G	-0.01754216	FALSE	FALSE
+22	44751158	G	A	-0.006539695	FALSE	FALSE
+22	44757439	A	G	0.02480295	FALSE	FALSE
+22	44759519	G	A	0.002111274	FALSE	FALSE
+22	44761797	A	T	-0.00531172	FALSE	FALSE
+22	44763352	C	G	0.01452737	FALSE	FALSE
+22	44783779	G	A	0.009142699	FALSE	FALSE
+22	44791807	C	T	-0.02371876	FALSE	FALSE
+22	44818986	C	T	-0.006740622	FALSE	FALSE
+22	44894913	G	A	-5.179871E-05	FALSE	FALSE
+22	45058431	C	T	0.01098259	FALSE	FALSE
+22	45066035	A	G	-0.01484374	FALSE	FALSE
+22	45069410	T	C	0.01530441	FALSE	FALSE
+22	45081330	G	A	0.00135012	FALSE	FALSE
+22	45082168	C	A	0.003663354	FALSE	FALSE
+22	45090008	G	A	0.002811861	FALSE	FALSE
+22	45116664	C	T	0.01247728	FALSE	FALSE
+22	45244930	T	C	-0.01450041	FALSE	FALSE
+22	45258457	G	A	-0.003500519	FALSE	FALSE
+22	45323989	T	C	0.001111338	FALSE	FALSE
+22	45415987	A	G	-0.01398184	FALSE	FALSE
+22	45451355	G	A	-0.005566982	FALSE	FALSE
+22	45471607	C	T	0.01148978	FALSE	FALSE
+22	45497738	C	T	-0.005029327	FALSE	FALSE
+22	45502829	C	T	-0.03893521	FALSE	FALSE
+22	45519040	T	G	0.002377071	FALSE	FALSE
+22	45523391	A	G	0.01318997	FALSE	FALSE
+22	45573450	C	A	0.0043856	FALSE	FALSE
+22	45589490	G	A	-0.008350439	FALSE	FALSE
+22	45668012	T	C	0.01286879	FALSE	FALSE
+22	45671343	G	A	-2.940682E-06	FALSE	FALSE
+22	45672574	T	C	0.005743608	FALSE	FALSE
+22	45693923	A	G	-0.002675069	FALSE	FALSE
+22	45718743	G	A	-0.02092804	FALSE	FALSE
+22	45723807	C	G	0.001670159	FALSE	FALSE
+22	45728370	A	G	0.0001879231	FALSE	FALSE
+22	45741537	G	T	0.01420045	FALSE	FALSE
+22	45749983	T	G	-0.04591012	FALSE	FALSE
+22	45809624	A	C	0.002185772	FALSE	FALSE
+22	45821935	A	G	0.02250782	FALSE	FALSE
+22	45837410	G	A	-0.002756449	FALSE	FALSE
+22	45846371	T	C	0.07910102	FALSE	FALSE
+22	45864934	T	C	0.008535181	FALSE	FALSE
+22	45871507	G	C	-0.007764056	FALSE	FALSE
+22	45892656	G	T	-0.003885653	FALSE	FALSE
+22	45897997	C	T	0.0003935204	FALSE	FALSE
+22	45929577	C	T	-0.02532217	FALSE	FALSE
+22	45936350	A	G	-0.008001698	FALSE	FALSE
+22	45942726	T	G	-0.01415551	FALSE	FALSE
+22	45996298	G	A	0.05643525	FALSE	FALSE
+22	46009063	G	A	0.006464843	FALSE	FALSE
+22	46022070	G	A	0.0224674	FALSE	FALSE
+22	46155548	G	C	-0.0324747	FALSE	FALSE
+22	46207955	C	T	-0.001354554	FALSE	FALSE
+22	46236425	A	G	0.08398423	FALSE	FALSE
+22	46275529	T	C	0.0022643	FALSE	FALSE
+22	46287720	A	G	-0.02237482	FALSE	FALSE
+22	46289699	T	C	0.01872124	FALSE	FALSE
+22	46303347	T	C	-0.01283734	FALSE	FALSE
+22	46316057	A	G	0.02312579	FALSE	FALSE
+22	46337043	G	C	0.01701173	FALSE	FALSE
+22	46347519	C	T	0.01574289	FALSE	FALSE
+22	46364161	A	G	-0.04466341	FALSE	FALSE
+22	46381234	G	A	0.04730559	FALSE	FALSE
+22	46396925	G	A	0.001783944	FALSE	FALSE
+22	46403715	A	G	-0.02132589	FALSE	FALSE
+22	46406782	A	C	0.08439466	FALSE	FALSE
+22	46445002	G	C	-0.07613496	FALSE	FALSE
+22	46458123	G	T	0.03328073	FALSE	FALSE
+22	46482948	C	T	0.04241879	FALSE	FALSE
+22	46486508	C	T	-0.00968439	FALSE	FALSE
+22	46493852	T	C	-0.00675858	FALSE	FALSE
+22	46499120	C	G	-0.009873118	FALSE	FALSE
+22	46502870	T	C	-0.0179214	FALSE	FALSE
+22	46561713	G	A	0.02604703	FALSE	FALSE
+22	46586110	A	G	-0.001256735	FALSE	FALSE
+22	46592168	C	T	0.01417055	FALSE	FALSE
+22	46614274	G	C	-0.05854014	FALSE	FALSE
+22	46627603	T	C	0.08004024	FALSE	FALSE
+22	46760086	T	C	0.003229515	FALSE	FALSE
+22	46782382	T	C	-0.02470821	FALSE	FALSE
+22	46807234	C	T	0.002324176	FALSE	FALSE
+22	46837114	G	A	0.000944073	FALSE	FALSE
+22	46888399	T	C	0.009911095	FALSE	FALSE
+22	46907779	G	A	0.00653144	FALSE	FALSE
+22	46909355	T	G	-0.004780494	FALSE	FALSE
+22	46914277	A	C	0.009689535	FALSE	FALSE
+22	46943687	G	A	-0.0130366	FALSE	FALSE
+22	46985917	A	G	0.01893397	FALSE	FALSE
+22	47021226	G	A	-0.01322949	FALSE	FALSE
+22	47095235	A	C	-0.1156013	FALSE	FALSE
+22	47109621	C	T	0.0004322858	FALSE	FALSE
+22	47125474	G	A	-0.01746025	FALSE	FALSE
+22	47147117	T	C	-0.02418349	FALSE	FALSE
+22	47156703	C	T	0.0262897	FALSE	FALSE
+22	47245836	A	G	0.001880575	FALSE	FALSE
+22	47271747	C	T	0.001055264	FALSE	FALSE
+22	47301822	C	T	0.003032158	FALSE	FALSE
+22	47345487	T	C	-0.002945945	FALSE	FALSE
+22	47372368	T	C	0.02067644	FALSE	FALSE
+22	47380606	C	T	0.04041426	FALSE	FALSE
+22	47437808	C	T	0.001683027	FALSE	FALSE
+22	47450911	A	G	0.01624479	FALSE	FALSE
+22	47511864	A	C	-0.004226735	FALSE	FALSE
+22	47519476	T	C	-0.003954111	FALSE	FALSE
+22	47529458	A	G	-0.0003602848	FALSE	FALSE
+22	47531320	T	C	-0.006899703	FALSE	FALSE
+22	47548321	T	C	0.004925401	FALSE	FALSE
+22	47568291	C	T	0.007726693	FALSE	FALSE
+22	47571203	A	G	-0.009744751	FALSE	FALSE
+22	47574009	C	T	-0.00532701	FALSE	FALSE
+22	47642100	T	C	0.006976251	FALSE	FALSE
+22	47657635	T	C	0.001798943	FALSE	FALSE
+22	47683805	C	T	-0.03475544	FALSE	FALSE
+22	47720973	T	C	-0.007868172	FALSE	FALSE
+22	47821952	G	A	-0.000885428	FALSE	FALSE
+22	47893053	A	G	-0.02449056	FALSE	FALSE
+22	47935365	C	T	-0.001599879	FALSE	FALSE
+22	47961708	G	T	-0.003593525	FALSE	FALSE
+22	47986332	T	C	-0.003976592	FALSE	FALSE
+22	48154645	C	T	0.007608639	FALSE	FALSE
+22	48165452	C	CT	0.002039503	FALSE	FALSE
+22	48207318	T	C	-0.009725168	FALSE	FALSE
+22	48213904	G	C	-0.01220367	FALSE	FALSE
+22	48215904	A	G	-2.488244E-05	FALSE	FALSE
+22	48220460	T	C	-0.002702163	FALSE	FALSE
+22	48230941	C	A	-0.001129522	FALSE	FALSE
+22	48271961	A	G	-0.005053446	FALSE	FALSE
+22	48284025	T	C	-0.003344182	FALSE	FALSE
+22	48297953	C	T	-0.01046958	FALSE	FALSE
+22	48362290	G	A	-0.02367254	FALSE	FALSE
+22	48362914	C	A	-0.003167719	FALSE	FALSE
+22	48387670	A	G	-0.008243989	FALSE	FALSE
+22	48415446	C	T	0.002130715	FALSE	FALSE
+22	48460730	T	C	0.002682476	FALSE	FALSE
+22	48491160	T	C	0.001257794	FALSE	FALSE
+22	48519794	C	T	0.003680757	FALSE	FALSE
+22	48537775	G	A	0.002134692	FALSE	FALSE
+22	48543566	T	C	0.007314089	FALSE	FALSE
+22	48593037	C	T	0.009084708	FALSE	FALSE
+22	48687509	C	T	-0.0277196	FALSE	FALSE
+22	48692033	T	C	-0.02126264	FALSE	FALSE
+22	48699617	T	C	0.0005093107	FALSE	FALSE
+22	48717568	T	C	-0.0008190281	FALSE	FALSE
+22	48811946	C	T	0.007916515	FALSE	FALSE
+22	48823357	G	A	0.01464317	FALSE	FALSE
+22	48840428	A	C	0.003711229	FALSE	FALSE
+22	48851612	T	C	-0.005887765	FALSE	FALSE
+22	48874310	T	C	-0.01106607	FALSE	FALSE
+22	48968070	C	T	0.01280691	FALSE	FALSE
+22	48991385	T	C	-0.01234119	FALSE	FALSE
+22	49004050	G	A	0.02290755	FALSE	FALSE
+22	49014565	A	G	0.001555565	FALSE	FALSE
+22	49086481	T	C	-0.006196369	FALSE	FALSE
+22	49107173	T	C	0.01277272	FALSE	FALSE
+22	49180915	A	G	0.006346977	FALSE	FALSE
+22	49262579	A	G	0.02657134	FALSE	FALSE
+22	49270317	C	T	0.001447665	FALSE	FALSE
+22	49313196	A	G	-0.007055532	FALSE	FALSE
+22	49335230	T	C	-0.006548281	FALSE	FALSE
+22	49366123	T	C	0.01136486	FALSE	FALSE
+22	49372356	G	C	-0.02420841	FALSE	FALSE
+22	49443666	T	C	0.01581736	FALSE	FALSE
+22	49496835	G	A	-0.01355414	FALSE	FALSE
+22	49524428	A	G	-0.004228482	FALSE	FALSE
+22	49530553	G	C	0.008197389	FALSE	FALSE
+22	49537845	T	C	0.0111255	FALSE	FALSE
+22	49557457	G	A	0.009401926	FALSE	FALSE
+22	49562666	C	A	0.01271701	FALSE	FALSE
+22	49574509	C	T	0.0004703177	FALSE	FALSE
+22	49579141	A	G	0.02448619	FALSE	FALSE
+22	49650863	T	C	0.006739571	FALSE	FALSE
+22	49662549	T	G	-0.005769464	FALSE	FALSE
+22	49665841	T	C	-0.0007037069	FALSE	FALSE
+22	49677464	A	G	-0.02177735	FALSE	FALSE
+22	49696067	C	T	-0.003309682	FALSE	FALSE
+22	49700272	T	G	-0.002541948	FALSE	FALSE
+22	49706433	T	C	-0.01719402	FALSE	FALSE
+22	49713835	G	A	-0.01370754	FALSE	FALSE
+22	49719264	A	C	-0.01067852	FALSE	FALSE
+22	49743627	G	A	-0.0005970581	FALSE	FALSE
+22	49800265	C	T	0.03098582	FALSE	FALSE
+22	49806863	A	G	0.003940447	FALSE	FALSE
+22	49830851	C	T	-0.002742706	FALSE	FALSE
+22	49834624	G	A	-0.002820163	FALSE	FALSE
+22	49843235	G	C	-0.0004458281	FALSE	FALSE
+22	49847501	T	G	0.002235016	FALSE	FALSE
+22	49861033	C	T	0.01721243	FALSE	FALSE
+22	49881321	A	G	-0.00051278	FALSE	FALSE
+22	49908804	G	A	-0.009455892	FALSE	FALSE
+22	49911222	G	T	-0.01389666	FALSE	FALSE
+22	49925268	A	G	0.01679984	FALSE	FALSE
+22	49927332	T	C	0.00039298	FALSE	FALSE
+22	50109212	T	C	0.01610819	FALSE	FALSE
+22	50118149	G	C	0.007024666	FALSE	FALSE
+22	50184484	G	T	0.01222581	FALSE	FALSE
+22	50219447	T	C	0.05091891	FALSE	FALSE
+22	50278568	G	A	-0.02340672	FALSE	FALSE
+22	50319170	G	A	0.01669806	FALSE	FALSE
+22	50350971	A	G	0.0264016	FALSE	FALSE
+22	50356693	C	T	0.003851499	FALSE	FALSE
+22	50435480	G	A	0.0166363	FALSE	FALSE
+22	50439626	A	G	-0.002722154	FALSE	FALSE
+22	50466542	C	T	-0.002560094	FALSE	FALSE
+22	50470516	T	C	-0.01621986	FALSE	FALSE
+22	50491150	G	A	0.01828674	FALSE	FALSE
+22	50515270	C	T	0.01439904	FALSE	FALSE
+22	50529850	C	T	0.02054628	FALSE	FALSE
+22	50570755	C	G	0.007077514	FALSE	FALSE
+22	50582626	G	A	-0.003588854	FALSE	FALSE
+22	50672154	A	G	0.007660848	FALSE	FALSE
+22	50722134	C	T	-0.01747164	FALSE	FALSE
+22	50722408	C	T	-0.001063465	FALSE	FALSE
+22	50728062	C	T	0.02159223	FALSE	FALSE
+22	50750481	T	C	0.01877272	FALSE	FALSE
+22	50758873	T	C	0.004001731	FALSE	FALSE
+22	50835040	A	G	-0.006374259	FALSE	FALSE
+22	50859049	C	T	0.0003480749	FALSE	FALSE
+22	50885775	G	A	-0.01358311	FALSE	FALSE
+22	50926768	T	C	0.001798498	FALSE	FALSE
+22	50928026	A	G	0.004775504	FALSE	FALSE
+22	50971266	C	T	0.02160893	FALSE	FALSE
+22	50989197	T	C	-0.01328884	FALSE	FALSE
+22	50989326	G	A	0.01037054	FALSE	FALSE
+22	50999681	G	A	-0.01226224	FALSE	FALSE
+22	51046163	T	C	-0.02754002	FALSE	FALSE
+22	51117580	C	T	0.03573542	FALSE	FALSE
+22	51171497	A	G	-0.01951606	FALSE	FALSE
+22	51174939	T	C	-0.006178519	FALSE	FALSE
diff --git a/tests/test_combine.py b/tests/test_combine.py
index db92cc9..edfa2c2 100644
--- a/tests/test_combine.py
+++ b/tests/test_combine.py
@@ -1,39 +1,135 @@
+import importlib.resources
+import json
 from unittest.mock import patch
 
-import jq
-import pandas as pd
 import pytest
 
-from pgscatalog_utils.download.Catalog import CatalogQuery, CatalogResult
-from pgscatalog_utils.download.CatalogCategory import CatalogCategory
 from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles
+from tests.data import combine
 
 
-def test_combine_scorefiles(combined_scorefile, _n_variants):
-    df = pd.read_table(combined_scorefile)
-    cols = {'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type',
-            'is_duplicated', 'accession', 'row_nr'}
-    assert set(df.columns).issubset(cols)
-    assert df.shape[0] == _n_variants
+def test_pgscatalog_combine(pgscatalog_path, tmp_path_factory, combine_output_header):
+    out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
+    args: list[str] = (
+        ["combine_scorefiles", "-t", "GRCh37", "-s"]
+        + [str(pgscatalog_path)]
+        + ["-o", str(out_path.resolve())]
+    )
+    with patch("sys.argv", args):
+        combine_scorefiles()
 
+    with open(out_path) as f:
+        for i, line in enumerate(f):
+            if i == 0:
+                cols = line.strip().split("\t")
+            else:
+                break
+        assert not set(cols).difference(set(combine_output_header))
 
-def test_liftover(lifted_scorefiles):
-    df = pd.read_table(lifted_scorefiles)
-    assert df.shape[0] == 832  # approx size
+    with open(out_path.parent / "log_combined.json") as f:
+        header = json.load(f)[0]
+        assert header["PGS001229_22"]["pgs_id"] == "PGS001229"
+        assert header["PGS001229_22"]["pgs_name"] == "GBE_INI50"
+        assert header["PGS001229_22"]["trait_mapped"] == ["body height"]
+        assert header["PGS001229_22"]["trait_efo"] == ["EFO_0004339"]
+        assert header["PGS001229_22"]["genome_build"] == "GRCh37"
+        assert not header["PGS001229_22"]["use_harmonised"]
 
 
-def test_fail_combine(scorefiles, tmp_path_factory):
+def test_effect_type_combine(effect_type_path, tmp_path_factory, combine_output_header):
     # these genomes are in build GRCh37, so combining with -t GRCh38 will raise an exception
-    with pytest.raises(Exception):
-        out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
-        args: list[str] = ['combine_scorefiles', '-t', 'GRCh38', '-s'] + scorefiles + ['-o', str(out_path.resolve())]
-        with patch('sys.argv', args):
-            combine_scorefiles()
+    out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
+    args: list[str] = (
+        ["combine_scorefiles", "-t", "GRCh37", "-s"]
+        + [str(effect_type_path)]
+        + ["-o", str(out_path.resolve())]
+    )
+    with patch("sys.argv", args):
+        combine_scorefiles()
+
+    n = -1  # skip header line
+    with open(out_path) as f:
+        for i, line in enumerate(f):
+            if i == 0:
+                cols = line.strip().split("\t")
+
+            if i == 1:
+                assert line.strip().split("\t")[-3] == "dominant"
+
+            if i == 2:
+                assert line.strip().split("\t")[-3] == "recessive"
+
+            n += 1
+
+        assert not set(cols).difference(set(combine_output_header))
+
+    with open(out_path.parent / "log_combined.json") as f:
+        header = json.load(f)[0]
+        assert (
+            header["scorefile_dominant_and_recessive"]["pgs_name"]
+            == "PGS001229_22_DominantRecessiveExample"
+        )
+        assert header["scorefile_dominant_and_recessive"]["genome_build"] == "GRCh37"
+        assert header["scorefile_dominant_and_recessive"]["variants_number"] == n
+        assert not header["scorefile_dominant_and_recessive"]["use_harmonised"]
+
+
+def test_custom_combine(custom_score_path, tmp_path_factory, combine_output_header):
+    # these genomes are in build GRCh37, so combining with -t GRCh38 will raise an exception
+    out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
+    args: list[str] = (
+        ["combine_scorefiles", "-t", "GRCh37", "-s"]
+        + [str(custom_score_path)]
+        + ["-o", str(out_path.resolve())]
+    )
+
+    with patch("sys.argv", args):
+        combine_scorefiles()
+
+    # read combined file
+    n = -1  # skip header line
+    with open(out_path) as f:
+        for i, line in enumerate(f):
+            if i == 0:
+                cols = line.strip().split("\t")
+            n += 1
+        assert not set(cols).difference(set(combine_output_header))
+
+    with open(out_path.parent / "log_combined.json") as f:
+        header = json.load(f)[0]
+        assert header["scorefile"]["pgs_name"] == "PGS001229_22"
+        assert header["scorefile"]["genome_build"] == "GRCh37"
+        assert header["scorefile"]["variants_number"] == n
+        assert not header["scorefile"]["use_harmonised"]
+
+
+@pytest.fixture
+def pgscatalog_path(scope="session"):
+    path = importlib.resources.files(combine) / "PGS001229_22.txt"
+    return path
 
 
 @pytest.fixture
-def _n_variants(pgs_accessions):
-    result = CatalogQuery(CatalogCategory.SCORE, accession=pgs_accessions).get()[0]
-    json = result.response
-    n: list[int] = jq.compile("[.results][][].variants_number").input(json).all()
-    return sum(n)
+def effect_type_path(scope="session"):
+    path = importlib.resources.files(combine) / "scorefile_dominant_and_recessive.txt"
+    return path
+
+
+@pytest.fixture(scope="session")
+def custom_score_path(tmp_path_factory):
+    path = importlib.resources.files(combine) / "scorefile.txt"
+    return path
+
+
+@pytest.fixture(scope="session")
+def combine_output_header():
+    return [
+        "chr_name",
+        "chr_position",
+        "effect_allele",
+        "other_allele",
+        "effect_weight",
+        "effect_type",
+        "accession",
+        "row_nr",
+    ]
diff --git a/tests/test_liftover.py b/tests/test_liftover.py
index b2f03a0..71f5d5f 100644
--- a/tests/test_liftover.py
+++ b/tests/test_liftover.py
@@ -1,9 +1,41 @@
-import pandas as pd
+import copy
 
-from pgscatalog_utils.scorefile.liftover import liftover
+from pgscatalog_utils.scorefile.config import Config
+
+from pgscatalog_utils.download.GenomeBuild import GenomeBuild
+from pgscatalog_utils.scorefile.liftover import liftover, create_liftover
 
 
 def test_liftover(hg38_coords, hg19_coords, chain_files):
-    lifted = liftover(hg38_coords, chain_files, min_lift=0.9, target_build='GRCh37')
-    coords: pd.DataFrame = hg19_coords[['lifted_pos', 'lifted_chr']] == lifted[['lifted_pos', 'lifted_chr']]
-    assert coords.all(axis=None)
+    Config.chain_dir = chain_files
+    Config.lo = create_liftover()
+    Config.min_lift = 0.95
+
+    hg38 = copy.deepcopy(hg38_coords)
+    lifted = list(
+        liftover(
+            hg38,
+            harmonised=False,
+            current_build=GenomeBuild.GRCh38,
+            target_build=GenomeBuild.GRCh37,
+        )
+    )
+
+    assert [x["chr_position"] for x in lifted] == [
+        x["chr_position"] for x in hg19_coords
+    ]
+    assert [x["chr_name"] for x in lifted] == [x["chr_name"] for x in hg19_coords]
+
+    hg19 = copy.deepcopy(hg19_coords)
+    lift_back = list(
+        liftover(
+            hg19,
+            harmonised=False,
+            current_build=GenomeBuild.GRCh37,
+            target_build=GenomeBuild.GRCh38,
+        )
+    )
+    assert [x["chr_position"] for x in lift_back] == [
+        x["chr_position"] for x in hg38_coords
+    ]
+    assert [x["chr_name"] for x in lift_back] == [x["chr_name"] for x in hg38_coords]

From 032b1f70e067b280ee842fb5f9256dc8765d958a Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Fri, 3 Nov 2023 13:13:18 +0000
Subject: [PATCH 13/40] fix test

---
 conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conftest.py b/conftest.py
index ba3e065..d5d037c 100644
--- a/conftest.py
+++ b/conftest.py
@@ -25,7 +25,7 @@ def pgs_accessions():
 @pytest.fixture(scope="session")
 def mini_score_path(tmp_path_factory):
     path = importlib.resources.files(combine) / "PGS001229_22.txt"
-    return path
+    return str(path)
 
 
 @pytest.fixture(scope="session")

From 7864b7df689d6c9d995a55030f8c1ee68e880a50 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Fri, 3 Nov 2023 16:54:07 +0000
Subject: [PATCH 14/40] sqlite support and add log data

---
 .../scorefile/combine_scorefiles.py           |  16 +--
 pgscatalog_utils/scorefile/scoringfile.py     |  14 +-
 pgscatalog_utils/scorefile/write.py           | 130 +++++++++++++-----
 3 files changed, 114 insertions(+), 46 deletions(-)

diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py
index fbd3082..fef762a 100644
--- a/pgscatalog_utils/scorefile/combine_scorefiles.py
+++ b/pgscatalog_utils/scorefile/combine_scorefiles.py
@@ -4,7 +4,6 @@
 import pathlib
 import sys
 import textwrap
-import time
 
 from pgscatalog_utils.config import set_logging_level
 from pgscatalog_utils.download.GenomeBuild import GenomeBuild
@@ -34,7 +33,6 @@ def combine_scorefiles():
     paths: list[str] = list(set(args.scorefiles))  # unique paths only
     logger.debug(f"Input scorefiles: {paths}")
 
-    start_time = time.time()
     sfs = [ScoringFile.from_path(x) for x in paths]
 
     target_build = GenomeBuild.from_string(args.target_build)
@@ -46,20 +44,16 @@ def combine_scorefiles():
     else:
         logger.info(f"All builds match target build {target_build}")
 
-    line_counts: dict[str, int] = write_combined(sfs, args.outfile)
     # provide line counts when making the scoring files
-    log = []
-    for (k, v), sf in zip(line_counts.items(), sfs):
-        log.append(sf.generate_log(v))
+    logs: dict[str, int] = write_combined(sfs, args.outfile)
+    json_log = []
+    for (k, v), sf in zip(logs.items(), sfs):
+        json_log.append(sf.generate_log(v))
 
     log_out_path = pathlib.Path(args.outfile).parent / args.logfile
     with open(log_out_path, "w") as f:
         logger.info(f"Writing log to {f.name}")
-        json.dump(log, f, indent=4)
-
-    end_time = time.time()
-    elapsed_time = end_time - start_time
-    print(f"Elapsed time: {elapsed_time} seconds")
+        json.dump(json_log, f, indent=4)
 
 
 def _description_text() -> str:
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index 1b44c6c..f3eec11 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -77,7 +77,7 @@ def from_path(cls, path: pathlib.Path):
             accession=name,
         )
 
-    def generate_log(self, line_count: int):
+    def generate_log(self, counted: typing.Counter):
         log = {
             key: str(value) if value is not None else None
             for key, value in self.header.__dict__.items()
@@ -85,7 +85,15 @@ def generate_log(self, line_count: int):
 
         if log["variants_number"] is None:
             # custom scoring files might not have this information
-            log["variants_number"] = line_count + 1  # (0 indexed)
+            log["variants_number"] = counted["n_variants"] + 1  # (0 indexed)
+
+        if (
+            int(log["variants_number"]) != counted["n_variants"]
+            and not Config.drop_missing
+        ):
+            raise Exception(
+                f"Mismatch between variants_number and counted output {self.accession}"
+            )
 
         # multiple terms may be separated with a pipe
         if log["trait_mapped"]:
@@ -97,6 +105,7 @@ def generate_log(self, line_count: int):
         log["columns"] = self.fields
         log["use_liftover"] = Config.liftover
         log["use_harmonised"] = self.harmonised
+        log["sources"] = [k for k, v in counted.items() if k != "n_variants"]
 
         return {self.accession: log}
 
@@ -147,6 +156,7 @@ def read_rows(csv_reader, fields: list[str], name: str, row_nr: int, wide: bool)
                 "hm_chr",
                 "hm_pos",
                 "hm_inferOtherAllele",
+                "hm_source",
                 "is_dominant",
                 "is_recessive",
                 "accession",
diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py
index d0a32bb..1cd8826 100644
--- a/pgscatalog_utils/scorefile/write.py
+++ b/pgscatalog_utils/scorefile/write.py
@@ -2,6 +2,10 @@
 import functools
 import gzip
 import logging
+import os
+import sqlite3
+import typing
+from collections import Counter
 from itertools import islice
 
 import pgzip
@@ -12,22 +16,10 @@
 logger = logging.getLogger(__name__)
 
 
-def write_combined(scoring_files: list[ScoringFile], out_path: str):
-    # compresslevel can be really slow, default is 9
-    if out_path.endswith("gz") and Config.threads == 1:
-        logger.info("Writing with gzip (slow)")
-        open_function = functools.partial(gzip.open, compresslevel=6)
-    elif Config.threads > 1:
-        logger.info("Writing with pgzip (fast)")
-        open_function = functools.partial(
-            pgzip.open, compresslevel=6, thread=Config.threads, blocksize=2 * 10**8
-        )
-    else:
-        logger.info("Writing text file (fast)")
-        open_function = open
-
-    with open_function(out_path, mode="wt") as f:
-        fieldnames = [
+class DataWriter:
+    def __init__(self, filename):
+        self.filename = filename
+        self.fieldnames = [
             "chr_name",
             "chr_position",
             "effect_allele",
@@ -37,21 +29,93 @@ def write_combined(scoring_files: list[ScoringFile], out_path: str):
             "accession",
             "row_nr",
         ]
-        writer = csv.DictWriter(
-            f, fieldnames=fieldnames, delimiter="\t", extrasaction="ignore"
+
+    def write(self, batch):
+        pass
+
+
+class TextFileWriter(DataWriter):
+    def __init__(self, compress, filename):
+        super().__init__(filename)
+        self.compress = compress
+
+    def write(self, batch):
+        if self.compress and Config.threads == 1:
+            logger.info("Writing with gzip (slow)")
+            open_function = functools.partial(gzip.open, compresslevel=6)
+        elif self.compress and Config.threads > 1:
+            logger.info("Writing with pgzip (fast)")
+            open_function = functools.partial(
+                pgzip.open, compresslevel=6, thread=Config.threads, blocksize=2 * 10**8
+            )
+        else:
+            logger.info("Writing text file (fast)")
+            open_function = open
+
+        mode = "at" if os.path.exists(self.filename) else "wt"
+        with open_function(self.filename, mode) as f:
+            writer = csv.DictWriter(
+                f, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore"
+            )
+            if mode == "w":
+                writer.writeheader()
+            writer.writerows(batch)
+
+
+class SqliteWriter(DataWriter):
+    def __init__(self, filename):
+        super().__init__(filename)
+
+    def write(self, batch):
+        conn = sqlite3.connect(self.filename)
+        cursor = conn.cursor()
+        placeholders = ", ".join("?" for _ in self.fieldnames)
+        # rows missing a key get NULL there, keeping one value per placeholder
+        values = [
+            tuple(row.get(key) for key in self.fieldnames) for row in batch
+        ]
+        # create the table on first use; later batches append to it
+        cursor.execute(
+            f"CREATE TABLE IF NOT EXISTS variants ({', '.join(self.fieldnames)})"
         )
-        writer.writeheader()
-
-        line_counts = {}
-        # write out in batches for compression efficiency and speed
-        for scoring_file in scoring_files:
-            logger.info(f"Writing {scoring_file.accession} variants")
-            while True:
-                batch = list(islice(scoring_file.variants, Config.batch_size))
-                if not batch:
-                    break
-                # calculate max row_nr now because it's when we finally generate variants
-                line_counts[scoring_file.accession] = max(x["row_nr"] for x in batch)
-                writer.writerows(batch)
-
-        return line_counts
+        cursor.executemany(f"INSERT INTO variants VALUES ({placeholders})", values)
+        conn.commit()
+        conn.close()
+
+
+def write_combined(
+    scoring_files: list[ScoringFile], out_path: str
+) -> dict[str, typing.Counter]:
+    # pick the writer from the output file extension
+    if out_path.endswith("gz"):
+        writer = TextFileWriter(compress=True, filename=out_path)
+    elif out_path.endswith("txt"):
+        writer = TextFileWriter(compress=False, filename=out_path)
+    elif out_path.endswith(".sqlite"):
+        writer = SqliteWriter(filename=out_path)
+    else:
+        raise Exception("Can't configure writer, please check out_path")
+
+    counts = []
+    log = {}
+    for scoring_file in scoring_files:
+        logger.info(f"Writing {scoring_file.accession} variants")
+        while True:
+            batch = list(islice(scoring_file.variants, Config.batch_size))
+            if not batch:
+                break
+            writer.write(batch=batch)
+            counts = calculate_log(batch, counts)
+
+        log[scoring_file.accession] = sum(counts, Counter())
+        counts = []
+
+    return log
+
+
+def calculate_log(batch, log: list[Counter]) -> list[Counter]:
+    # these statistics can only be generated while iterating through variants
+    n_variants = Counter("n_variants" for item in batch)
+    hm_source = Counter(item["hm_source"] for item in batch if "hm_source" in item)
+    log.extend([n_variants, hm_source])
+    return log

From afdc0b138c4ab6bedfec47fa77cac66c883c99be Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Mon, 6 Nov 2023 10:51:58 +0000
Subject: [PATCH 15/40] fix tests

---
 pgscatalog_utils/scorefile/qc.py          | 21 ++++++-------
 pgscatalog_utils/scorefile/scoringfile.py | 10 ++++--
 pgscatalog_utils/scorefile/write.py       |  2 +-
 tests/test_combine.py                     | 38 ++++++++---------------
 4 files changed, 30 insertions(+), 41 deletions(-)

diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index 8454fe4..7db5da0 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -19,9 +19,7 @@ def quality_control(variants, header: ScoringFileHeader, harmonised: bool, wide:
     variants = assign_other_allele(variants)
 
     if wide:
-        # wide data must be sorted because:
-        # - check_duplicates requires sorted input
-        # - output would be unsorted, which looks a little bit messy
+        # wide data must be sorted because check_duplicates requires sorted input
         variants = (x for x in sorted(variants, key=lambda x: x["accession"]))
 
     variants = check_duplicates(variants)
@@ -89,10 +87,10 @@ def check_effect_weight(variants):
     for variant in variants:
         try:
             variant["effect_weight"] = float(variant["effect_weight"])
+            yield variant
         except ValueError:
             logger.critical(f"{variant} has bad effect weight")
             raise ValueError
-        yield variant
 
 
 def assign_other_allele(variants):
@@ -115,17 +113,16 @@ def assign_other_allele(variants):
 
 def assign_effect_type(variants):
     for variant in variants:
-        if "is_recessive" not in variant and "is_dominant" not in variant:
-            variant["effect_type"] = "additive"
-        else:
-            if variant["is_recessive"] == "TRUE":
-                variant["effect_type"] = "recessive"
-            elif variant["is_dominant"] == "TRUE":
+        match (variant.get("is_recessive"), variant.get("is_dominant")):
+            case (None, None) | ("FALSE", "FALSE"):
+                variant["effect_type"] = "additive"
+            case ("FALSE", "TRUE"):
                 variant["effect_type"] = "dominant"
-            elif variant["is_recessive"] == "TRUE" and variant["is_dominant"] == "TRUE":
+            case ("TRUE", "FALSE"):
+                variant["effect_type"] = "recessive"
+            case _:
                 logger.critical(f"Bad effect type setting: {variant}")
                 raise Exception
-
         yield variant
 
 
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index f3eec11..2b85e4c 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -6,9 +6,8 @@
 from dataclasses import dataclass
 from itertools import islice
 
-from pgscatalog_utils.scorefile.config import Config
-
 from pgscatalog_utils.download.GenomeBuild import GenomeBuild
+from pgscatalog_utils.scorefile.config import Config
 from pgscatalog_utils.scorefile.header import ScoringFileHeader, auto_open
 from pgscatalog_utils.scorefile.qc import quality_control
 
@@ -85,7 +84,7 @@ def generate_log(self, counted: typing.Counter):
 
         if log["variants_number"] is None:
             # custom scoring files might not have this information
-            log["variants_number"] = counted["n_variants"] + 1  # (0 indexed)
+            log["variants_number"] = counted["n_variants"]
 
         if (
             int(log["variants_number"]) != counted["n_variants"]
@@ -171,6 +170,11 @@ def read_rows(csv_reader, fields: list[str], name: str, row_nr: int, wide: bool)
         row_nr += 1
 
 
+def parse_dict(variants):
+    # TODO: use best data types when parsing lines
+    pass
+
+
 def get_columns(path) -> tuple[int, list[str]]:
     open_function = auto_open(path)
     with open_function(path, mode="rt") as f:
diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py
index 1cd8826..54214fb 100644
--- a/pgscatalog_utils/scorefile/write.py
+++ b/pgscatalog_utils/scorefile/write.py
@@ -57,7 +57,7 @@ def write(self, batch):
             writer = csv.DictWriter(
                 f, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore"
             )
-            if mode == "w":
+            if mode == "wt":
                 writer.writeheader()
             writer.writerows(batch)
 
diff --git a/tests/test_combine.py b/tests/test_combine.py
index edfa2c2..109ad6d 100644
--- a/tests/test_combine.py
+++ b/tests/test_combine.py
@@ -8,37 +8,25 @@
 from tests.data import combine
 
 
-def test_pgscatalog_combine(pgscatalog_path, tmp_path_factory, combine_output_header):
-    out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
+def test_pgscatalog_combine(pgscatalog_path, tmp_path, combine_output_header):
+    out_path = tmp_path / "combined.txt"
     args: list[str] = (
         ["combine_scorefiles", "-t", "GRCh37", "-s"]
         + [str(pgscatalog_path)]
         + ["-o", str(out_path.resolve())]
     )
-    with patch("sys.argv", args):
-        combine_scorefiles()
 
-    with open(out_path) as f:
-        for i, line in enumerate(f):
-            if i == 0:
-                cols = line.strip().split("\t")
-            else:
-                break
-        assert not set(cols).difference(set(combine_output_header))
+    # this mismatch occurs because header is from original PGS (~50,000)
+    # but variants are only from chr22 (~850)
+    with pytest.raises(Exception) as e:
+        with patch("sys.argv", args):
+            combine_scorefiles()
+            assert "Mismatch between variants_number and counted output" in str(e.value)
 
-    with open(out_path.parent / "log_combined.json") as f:
-        header = json.load(f)[0]
-        assert header["PGS001229_22"]["pgs_id"] == "PGS001229"
-        assert header["PGS001229_22"]["pgs_name"] == "GBE_INI50"
-        assert header["PGS001229_22"]["trait_mapped"] == ["body height"]
-        assert header["PGS001229_22"]["trait_efo"] == ["EFO_0004339"]
-        assert header["PGS001229_22"]["genome_build"] == "GRCh37"
-        assert not header["PGS001229_22"]["use_harmonised"]
 
-
-def test_effect_type_combine(effect_type_path, tmp_path_factory, combine_output_header):
+def test_effect_type_combine(effect_type_path, tmp_path, combine_output_header):
     # these genomes are in build GRCh37, so combining with -t GRCh38 will raise an exception
-    out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
+    out_path = tmp_path / "combined.txt"
     args: list[str] = (
         ["combine_scorefiles", "-t", "GRCh37", "-s"]
         + [str(effect_type_path)]
@@ -74,9 +62,9 @@ def test_effect_type_combine(effect_type_path, tmp_path_factory, combine_output_
         assert not header["scorefile_dominant_and_recessive"]["use_harmonised"]
 
 
-def test_custom_combine(custom_score_path, tmp_path_factory, combine_output_header):
+def test_custom_combine(custom_score_path, tmp_path, combine_output_header):
     # these genomes are in build GRCh37, so combining with -t GRCh38 will raise an exception
-    out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
+    out_path = tmp_path / "combined.txt"
     args: list[str] = (
         ["combine_scorefiles", "-t", "GRCh37", "-s"]
         + [str(custom_score_path)]
@@ -92,8 +80,8 @@ def test_custom_combine(custom_score_path, tmp_path_factory, combine_output_head
         for i, line in enumerate(f):
             if i == 0:
                 cols = line.strip().split("\t")
+                assert not set(cols).difference(set(combine_output_header))
             n += 1
-        assert not set(cols).difference(set(combine_output_header))
 
     with open(out_path.parent / "log_combined.json") as f:
         header = json.load(f)[0]

From 1302d7a27cba02a48e0c0d7a64a258fca3b0827b Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Mon, 6 Nov 2023 12:55:06 +0000
Subject: [PATCH 16/40] fix tests

---
 .gitignore                          |  3 ++-
 tests/data/combine/PGS001229_22.txt |  2 +-
 tests/test_combine.py               | 24 ++++++++++++++++++------
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/.gitignore b/.gitignore
index b0b6f3a..5ee9a36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -157,4 +157,5 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-.idea/
\ No newline at end of file
+.idea/
+.DS_Store
diff --git a/tests/data/combine/PGS001229_22.txt b/tests/data/combine/PGS001229_22.txt
index 5f791f4..4084c13 100644
--- a/tests/data/combine/PGS001229_22.txt
+++ b/tests/data/combine/PGS001229_22.txt
@@ -8,7 +8,7 @@
 #trait_efo=EFO_0004339
 #weight_type=NR
 #genome_build=GRCh37
-#variants_number=51209
+#variants_number=835
 ##SOURCE INFORMATION
 #pgp_id=PGP000244
 #citation=Tanigawa Y et al. medRxiv (2021). doi:10.1101/2021.09.02.21262942
diff --git a/tests/test_combine.py b/tests/test_combine.py
index 109ad6d..f87b5ca 100644
--- a/tests/test_combine.py
+++ b/tests/test_combine.py
@@ -16,12 +16,24 @@ def test_pgscatalog_combine(pgscatalog_path, tmp_path, combine_output_header):
         + ["-o", str(out_path.resolve())]
     )
 
-    # this mismatch occurs because header is from original PGS (~50,000)
-    # but variants are only from chr22 (~850)
-    with pytest.raises(Exception) as e:
-        with patch("sys.argv", args):
-            combine_scorefiles()
-            assert "Mismatch between variants_number and counted output" in str(e.value)
+    with patch("sys.argv", args):
+        combine_scorefiles()
+
+    n = -1  # skip header line
+    with open(out_path) as f:
+        for i, line in enumerate(f):
+            if i == 0:
+                cols = line.strip().split("\t")
+                assert not set(cols).difference(set(combine_output_header))
+            n += 1
+
+    with open(out_path.parent / "log_combined.json") as f:
+        header = json.load(f)[0]
+        assert header["PGS001229_22"]["pgs_id"] == "PGS001229"
+        assert header["PGS001229_22"]["pgs_name"] == "GBE_INI50"
+        assert header["PGS001229_22"]["genome_build"] == "GRCh37"
+        assert int(header["PGS001229_22"]["variants_number"]) == n
+        assert not header["PGS001229_22"]["use_harmonised"]
 
 
 def test_effect_type_combine(effect_type_path, tmp_path, combine_output_header):

From 953497ac8542c0156880075bd0070e37a1d30ebd Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Mon, 6 Nov 2023 16:03:10 +0000
Subject: [PATCH 17/40] fixes to make old and new output consistent

---
 .../scorefile/combine_scorefiles.py           |  3 +++
 pgscatalog_utils/scorefile/qc.py              |  2 +-
 pgscatalog_utils/scorefile/scoringfile.py     | 11 ++++-----
 pgscatalog_utils/scorefile/write.py           | 24 ++++++++++++-------
 4 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py
index fef762a..5757017 100644
--- a/pgscatalog_utils/scorefile/combine_scorefiles.py
+++ b/pgscatalog_utils/scorefile/combine_scorefiles.py
@@ -30,6 +30,9 @@ def combine_scorefiles():
         Config.chain_dir = args.chain_dir
         Config.lo = create_liftover()
 
+    if pathlib.Path(args.outfile).exists():
+        raise FileExistsError(f"{args.outfile}")
+
     paths: list[str] = list(set(args.scorefiles))  # unique paths only
     logger.debug(f"Input scorefiles: {paths}")
 
diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index 7db5da0..e881082 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -86,7 +86,7 @@ def drop_hla(variants):
 def check_effect_weight(variants):
     for variant in variants:
         try:
-            variant["effect_weight"] = float(variant["effect_weight"])
+            float(variant["effect_weight"])
             yield variant
         except ValueError:
             logger.critical(f"{variant} has bad effect weight")
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index 2b85e4c..2479801 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -111,9 +111,7 @@ def generate_log(self, counted: typing.Counter):
     @staticmethod
     def read_variants(path, fields, start_line, name: str, is_wide: bool):
         open_function = auto_open(path)
-        # row_nr and cum_batch are equivalent but
-        row_nr = 0  # important to increment in sub-generator for each line
-        cum_batch = 0  # sums batches in this function
+        row_nr = 0
 
         with open_function(path, mode="rt") as f:
             for _ in range(start_line + 1):
@@ -122,15 +120,16 @@ def read_variants(path, fields, start_line, name: str, is_wide: bool):
 
             while True:
                 batch = list(islice(f, Config.batch_size))
-                cum_batch += len(batch)
                 if not batch:
                     break
 
                 csv_reader = csv.reader(batch, delimiter="\t")
-                yield from read_rows(csv_reader, fields, name, row_nr, is_wide)
+                yield from read_rows(csv_reader, fields, name, is_wide, row_nr)
+                # advance the offset here: read_rows only increments its own local copy of row_nr
+                row_nr += len(batch)
 
 
-def read_rows(csv_reader, fields: list[str], name: str, row_nr: int, wide: bool):
+def read_rows(csv_reader, fields: list[str], name: str, wide: bool, row_nr: int):
     for row in csv_reader:
         variant = dict(zip(fields, row))
 
diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py
index 54214fb..1404560 100644
--- a/pgscatalog_utils/scorefile/write.py
+++ b/pgscatalog_utils/scorefile/write.py
@@ -26,9 +26,11 @@ def __init__(self, filename):
             "other_allele",
             "effect_weight",
             "effect_type",
+            "is_duplicated",
             "accession",
             "row_nr",
         ]
+        logger.info(f"Output filename: {filename}")
 
     def write(self, batch):
         pass
@@ -39,23 +41,27 @@ def __init__(self, compress, filename):
         super().__init__(filename)
         self.compress = compress
 
-    def write(self, batch):
         if self.compress and Config.threads == 1:
-            logger.info("Writing with gzip (slow)")
-            open_function = functools.partial(gzip.open, compresslevel=6)
+            logger.info("Writing with gzip")
+            self.open_function = functools.partial(gzip.open, compresslevel=6)
         elif self.compress and Config.threads > 1:
-            logger.info("Writing with pgzip (fast)")
-            open_function = functools.partial(
+            logger.info("Writing with pgzip")
+            self.open_function = functools.partial(
                 pgzip.open, compresslevel=6, thread=Config.threads, blocksize=2 * 10**8
             )
         else:
-            logger.info("Writing text file (fast)")
-            open_function = open
+            logger.info("Writing text file")
+            self.open_function = open
 
+    def write(self, batch):
         mode = "at" if os.path.exists(self.filename) else "wt"
-        with open_function(self.filename, mode) as f:
+        with self.open_function(self.filename, mode) as f:
             writer = csv.DictWriter(
-                f, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore"
+                f,
+                fieldnames=self.fieldnames,
+                delimiter="\t",
+                extrasaction="ignore",
+                lineterminator="\n",
             )
             if mode == "wt":
                 writer.writeheader()

From cf3fc8bb9303e0af62bd75b9f5253b253494d34e Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Mon, 6 Nov 2023 16:41:04 +0000
Subject: [PATCH 18/40] update tests

---
 tests/test_combine.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/tests/test_combine.py b/tests/test_combine.py
index f87b5ca..bc82faf 100644
--- a/tests/test_combine.py
+++ b/tests/test_combine.py
@@ -1,3 +1,4 @@
+import csv
 import importlib.resources
 import json
 from unittest.mock import patch
@@ -47,17 +48,16 @@ def test_effect_type_combine(effect_type_path, tmp_path, combine_output_header):
     with patch("sys.argv", args):
         combine_scorefiles()
 
-    n = -1  # skip header line
     with open(out_path) as f:
-        for i, line in enumerate(f):
-            if i == 0:
-                cols = line.strip().split("\t")
+        n = 0
+        for line in csv.DictReader(f, delimiter="\t"):
+            cols = list(line.keys())
 
-            if i == 1:
-                assert line.strip().split("\t")[-3] == "dominant"
+            if int(line["row_nr"]) == 0:
+                assert line["effect_type"] == "dominant"
 
-            if i == 2:
-                assert line.strip().split("\t")[-3] == "recessive"
+            if int(line["row_nr"]) == 1:
+                assert line["effect_type"] == "recessive"
 
             n += 1
 
@@ -130,6 +130,7 @@ def combine_output_header():
         "other_allele",
         "effect_weight",
         "effect_type",
+        "is_duplicated",
         "accession",
         "row_nr",
     ]

From d0fcb8da4d13589b6e956e5db976c6882d46e977 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Tue, 7 Nov 2023 11:43:13 +0000
Subject: [PATCH 19/40] drop parallel gzip and --threads

---
 pgscatalog_utils/scorefile/combine_scorefiles.py |  4 ----
 pgscatalog_utils/scorefile/config.py             |  1 -
 pgscatalog_utils/scorefile/header.py             | 15 ++-------------
 pgscatalog_utils/scorefile/write.py              |  9 +--------
 4 files changed, 3 insertions(+), 26 deletions(-)

diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py
index 5757017..2532f8b 100644
--- a/pgscatalog_utils/scorefile/combine_scorefiles.py
+++ b/pgscatalog_utils/scorefile/combine_scorefiles.py
@@ -19,7 +19,6 @@ def combine_scorefiles():
     logger = logging.getLogger(__name__)
     set_logging_level(args.verbose)
 
-    Config.threads = args.threads
     Config.batch_size = 20000
     Config.drop_missing = args.drop_missing
     Config.target_build = GenomeBuild.from_string(args.target_build)
@@ -126,9 +125,6 @@ def _parse_args(args=None) -> argparse.Namespace:
         default=0.95,
         type=float,
     )
-    parser.add_argument(
-        "--threads", dest="threads", required=False, default=1, type=int
-    )
     parser.add_argument(
         "--drop_missing",
         dest="drop_missing",
diff --git a/pgscatalog_utils/scorefile/config.py b/pgscatalog_utils/scorefile/config.py
index 2725110..a7540fc 100644
--- a/pgscatalog_utils/scorefile/config.py
+++ b/pgscatalog_utils/scorefile/config.py
@@ -7,7 +7,6 @@
 
 @dataclass
 class Config:
-    threads: int
     drop_missing: bool
     liftover: bool
     lo: pyliftover.liftover
diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/header.py
index e9a03e4..82ea79d 100644
--- a/pgscatalog_utils/scorefile/header.py
+++ b/pgscatalog_utils/scorefile/header.py
@@ -1,11 +1,7 @@
-import functools
 import gzip
 import pathlib
 from dataclasses import dataclass
 
-from pgscatalog_utils.scorefile.config import Config
-from pgzip import pgzip
-
 from pgscatalog_utils.download.GenomeBuild import GenomeBuild
 
 
@@ -80,13 +76,6 @@ def _gen_header_lines(f):
 def auto_open(filepath):
     with open(filepath, "rb") as test_f:
         if test_f.read(2) == b"\x1f\x8b":
-            gzipped = True
+            return gzip.open
         else:
-            gzipped = False
-
-    if gzipped and Config.threads > 1:
-        return functools.partial(pgzip.open, thread=Config.threads)
-    elif gzipped:
-        return gzip.open
-    elif not gzipped:
-        return open
+            return open
diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py
index 1404560..57ceb91 100644
--- a/pgscatalog_utils/scorefile/write.py
+++ b/pgscatalog_utils/scorefile/write.py
@@ -8,8 +8,6 @@
 from collections import Counter
 from itertools import islice
 
-import pgzip
-
 from pgscatalog_utils.scorefile.config import Config
 from pgscatalog_utils.scorefile.scoringfile import ScoringFile
 
@@ -41,14 +39,9 @@ def __init__(self, compress, filename):
         super().__init__(filename)
         self.compress = compress
 
-        if self.compress and Config.threads == 1:
+        if self.compress:
             logger.info("Writing with gzip")
             self.open_function = functools.partial(gzip.open, compresslevel=6)
-        elif self.compress and Config.threads > 1:
-            logger.info("Writing with pgzip")
-            self.open_function = functools.partial(
-                pgzip.open, compresslevel=6, thread=Config.threads, blocksize=2 * 10**8
-            )
         else:
             logger.info("Writing text file")
             self.open_function = open

From e9e06e1dacf861ac7eeba64dd2b376b789a5e874 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Thu, 16 Nov 2023 16:30:26 +0000
Subject: [PATCH 20/40] create ScoreVariant and EffectType classes

---
 pgscatalog_utils/scorefile/effecttype.py   | 10 +++
 pgscatalog_utils/scorefile/scorevariant.py | 72 ++++++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 pgscatalog_utils/scorefile/effecttype.py
 create mode 100644 pgscatalog_utils/scorefile/scorevariant.py

diff --git a/pgscatalog_utils/scorefile/effecttype.py b/pgscatalog_utils/scorefile/effecttype.py
new file mode 100644
index 0000000..0d51f14
--- /dev/null
+++ b/pgscatalog_utils/scorefile/effecttype.py
@@ -0,0 +1,10 @@
+from enum import Enum
+
+
+class EffectType(Enum):
+    RECESSIVE = "recessive"
+    DOMINANT = "dominant"
+    ADDITIVE = "additive"
+
+    def __str__(self):
+        return str(self.value)
diff --git a/pgscatalog_utils/scorefile/scorevariant.py b/pgscatalog_utils/scorefile/scorevariant.py
new file mode 100644
index 0000000..3b258a6
--- /dev/null
+++ b/pgscatalog_utils/scorefile/scorevariant.py
@@ -0,0 +1,72 @@
+"""
+This module contains the class ScoreVariant, which is a custom dictionary used to consistently represent rows in a PGS Catalog scoring file
+"""
+import collections
+
+from pgscatalog_utils.scorefile.effecttype import EffectType
+
+
+class ScoreVariant(collections.UserDict):
+    """A single variant from a scoring file structured to follow PGS Catalog standards,
+    typically extracted from a row in a scoring file.
+
+     See https://www.pgscatalog.org/downloads/#dl_scoring_files for field descriptions.
+
+     This class is intentionally simple (a dict that checks for mandatory keys and fills
+     optional keys) because a more complicated __init__ will be slow when lots of variants
+     are read from a file. dicts use fast C magic, so try not to interfere too much.
+
+     Some additional keys are included for quality control:
+     - accession: a unique identifier to group variants in the same score
+     - row_nr: an incrementing integer, used to track the number of variants in an accession
+     - is_duplicated: a label to mark variants with the same coordinates and alleles
+     - effect_type: additive, recessive, or dominant
+
+     >>> variant = ScoreVariant(**{"chr_name": "1", "chr_position": 1, "effect_allele": "A", "other_allele": "G", "effect_weight": 0.5, "accession": "PGS000822", "row_nr": 0})
+     >>> variant
+     {'chr_name': '1', 'chr_position': 1, 'effect_allele': 'A', 'other_allele': 'G', 'effect_weight': 0.5, 'accession': 'PGS000822', 'row_nr': 0, 'rsID': None, 'hm_chr': None, 'hm_pos': None, 'hm_inferOtherAllele': None, 'hm_source': None, 'is_dominant': None, 'is_recessive': None, 'hm_rsID': None, 'hm_match_chr': None, 'hm_match_pos': None, 'is_duplicated': None, 'effect_type': <EffectType.ADDITIVE: 'additive'>}
+
+     Mandatory data fields match PGS Catalog harmonised data standards:
+
+    >>> ScoreVariant(**{"chr_name": "1", "chr_position": 1})
+    Traceback (most recent call last):
+        ...
+    ValueError: Mandatory field 'effect_allele' is missing.
+    """
+
+    mandatory_fields: tuple[str] = (
+        "chr_name",
+        "chr_position",
+        "effect_allele",
+        "effect_weight",
+        "accession",
+        "row_nr",
+    )
+    optional_fields: tuple[str] = (
+        "rsID",
+        "other_allele",
+        "hm_chr",
+        "hm_pos",
+        "hm_inferOtherAllele",
+        "hm_source",
+        "is_dominant",
+        "is_recessive",
+        "hm_rsID",
+        "hm_match_chr",
+        "hm_match_pos",
+        "is_duplicated",
+    )
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)  # creates the dict
+
+        for field in self.mandatory_fields:
+            if field not in self.data:
+                raise ValueError(f"Mandatory field '{field}' is missing.")
+
+        # set most optional fields to None...
+        for field in self.optional_fields:
+            self.data.setdefault(field, None)
+
+        # ... except effect type, as the vast majority of variants are additive
+        self.data.setdefault("effect_type", EffectType.ADDITIVE)

From eef2da6736cbc3126ecde102f7b85e7f11c0f3f7 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Thu, 16 Nov 2023 16:31:37 +0000
Subject: [PATCH 21/40] review comments

---
 pgscatalog_utils/scorefile/liftover.py    |  2 +
 pgscatalog_utils/scorefile/qc.py          | 97 ++++++++++++-----------
 pgscatalog_utils/scorefile/scoringfile.py | 45 +++--------
 3 files changed, 65 insertions(+), 79 deletions(-)

diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py
index 8097b70..255fcf2 100644
--- a/pgscatalog_utils/scorefile/liftover.py
+++ b/pgscatalog_utils/scorefile/liftover.py
@@ -46,6 +46,8 @@ def liftover(
                 yield variant
                 n_lifted += 1
             else:
+                variant["chr_name"] = None
+                variant["chr_position"] = None
                 variant["lifted"] = False
                 yield variant
             n += 1
diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index e881082..d98acee 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -2,6 +2,7 @@
 import typing
 
 from pgscatalog_utils.scorefile.config import Config
+from pgscatalog_utils.scorefile.effecttype import EffectType
 from pgscatalog_utils.scorefile.header import ScoringFileHeader
 from pgscatalog_utils.scorefile.liftover import liftover
 
@@ -9,7 +10,23 @@
 
 
 def quality_control(variants, header: ScoringFileHeader, harmonised: bool, wide: bool):
+    # order is important for:
+    # 1. liftover non-harmonised data (quite rare), failed lifts get None'd
+    # 2. remap harmonised data, failed harmonisations get None'd
+    # 3. check and optionally drop bad variants
+    # where a bad variant has None in a mandatory ScoreVariant field
+    # then continue with other QC
+
+    if Config.liftover:
+        variants = liftover(
+            variants,
+            harmonised=harmonised,
+            current_build=header.genome_build,
+            target_build=Config.target_build,
+        )
+
     variants = remap_harmonised(variants, harmonised)
+    variants = check_bad_variant(variants)
 
     if Config.drop_missing:
         variants = drop_hla(variants)
@@ -24,14 +41,6 @@ def quality_control(variants, header: ScoringFileHeader, harmonised: bool, wide:
 
     variants = check_duplicates(variants)
 
-    if Config.liftover:
-        variants = liftover(
-            variants,
-            harmonised=harmonised,
-            current_build=header.genome_build,
-            target_build=Config.target_build,
-        )
-
     return variants
 
 
@@ -75,10 +84,12 @@ def check_duplicates(variants):
 def drop_hla(variants):
     n_dropped = 0
     for variant in variants:
-        if variant["effect_allele"] != "P" or variant["effect_allele"] != "N":
-            yield variant
-        else:
-            n_dropped += 1
+        match variant:
+            case {"effect_allele": "P"} | {"effect_allele": "N"}:
+                n_dropped += 1
+                continue
+            case _:
+                yield variant
 
     logger.warning(f"{n_dropped} HLA alleles detected and dropped")
 
@@ -96,12 +107,8 @@ def check_effect_weight(variants):
 def assign_other_allele(variants):
     n_dropped = 0
     for variant in variants:
-        if "other_allele" in variant:
-            if "/" in variant["other_allele"]:
-                # drop multiple other alleles
-                n_dropped += 1
-                variant["other_allele"] = None
-        else:
+        if "/" in variant["other_allele"]:
+            n_dropped += 1
             variant["other_allele"] = None
 
         yield variant
@@ -115,11 +122,11 @@ def assign_effect_type(variants):
     for variant in variants:
         match (variant.get("is_recessive"), variant.get("is_dominant")):
             case (None, None) | ("FALSE", "FALSE"):
-                variant["effect_type"] = "additive"
+                pass  # default value is additive
             case ("FALSE", "TRUE"):
-                variant["effect_type"] = "dominant"
+                variant["effect_type"] = EffectType.DOMINANT
             case ("TRUE", "FALSE"):
-                variant["effect_type"] = "recessive"
+                variant["effect_type"] = EffectType.RECESSIVE
             case _:
                 logger.critical(f"Bad effect type setting: {variant}")
                 raise Exception
@@ -127,37 +134,33 @@ def assign_effect_type(variants):
 
 
 def remap_harmonised(variants, harmonised: bool):
-    n_bad = 0
     if harmonised:
         for variant in variants:
-            if variant["hm_chr"]:
-                variant["chr_name"] = variant["hm_chr"]
-
-            if variant["hm_pos"]:
-                variant["chr_position"] = variant["hm_pos"]
-
-            if "hm_inferOtherAllele" in variant and variant.get("other_allele") is None:
+            # using the harmonised field in the header to make sure we don't accidentally overwrite
+            # positions with empty data (e.g. in an unharmonised file)
+            # if harmonisation has failed we _always_ want to use that information
+            variant["chr_name"] = variant["hm_chr"]
+            variant["chr_position"] = variant["hm_pos"]
+            if variant["other_allele"] is None:
                 variant["other_allele"] = variant["hm_inferOtherAllele"]
-
-            if (
-                "chr_name" in variant
-                and "chr_position" in variant
-                and "effect_weight" in variant
-            ):
-                yield variant
-            elif Config.drop_missing:
-                continue
-                # (don't yield anything, filtering out missing variants)
-            else:
-                # assume a bad harmonisation with no genomic coordinates
-                # these will get labelled as duplicates eventually (probably)
-                variant["chr_name"] = None
-                variant["chr_position"] = None
-                yield variant
-                n_bad += 1
+            yield variant
     else:
         for variant in variants:
+            # can't remap, so don't try
             yield variant
 
+
+def check_bad_variant(variants):
+    n_bad = 0
+    for variant in variants:
+        match variant:
+            case {"chr_name": None} | {"chr_position": None} | {"effect_allele": None}:
+                # (effect weight checked separately)
+                n_bad += 1
+                if not Config.drop_missing:
+                    yield variant
+            case _:
+                yield variant
+
     if n_bad > 1:
-        logger.warning(f"{n_bad} variants failed harmonisation")
+        logger.warning(f"{n_bad} bad variants")
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index 2479801..e18ae00 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -10,6 +10,7 @@
 from pgscatalog_utils.scorefile.config import Config
 from pgscatalog_utils.scorefile.header import ScoringFileHeader, auto_open
 from pgscatalog_utils.scorefile.qc import quality_control
+from pgscatalog_utils.scorefile.scorevariant import ScoreVariant
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -129,7 +130,9 @@ def read_variants(path, fields, start_line, name: str, is_wide: bool):
                 row_nr += len(batch)
 
 
-def read_rows(csv_reader, fields: list[str], name: str, wide: bool, row_nr: int):
+def read_rows(
+    csv_reader, fields: list[str], name: str, wide: bool, row_nr: int
+) -> typing.Generator[ScoreVariant, None, None]:
     for row in csv_reader:
         variant = dict(zip(fields, row))
 
@@ -138,42 +141,20 @@ def read_rows(csv_reader, fields: list[str], name: str, wide: bool, row_nr: int)
                 i for i, x in enumerate(["effect_weight_" in x for x in fields]) if x
             ]
             for i, weight_name in zip(ew_col_idxs, [fields[i] for i in ew_col_idxs]):
-                keys = ["chr_name", "chr_position", "effect_allele", "other_allele"]
-                yield {k: variant[k] for k in keys if k in variant} | {
-                    "accession": weight_name,
-                    "row_nr": row_nr,
-                    "effect_weight": variant[weight_name],
-                }
+                yield ScoreVariant(
+                    **variant,
+                    **{
+                        "accession": weight_name,
+                        "row_nr": row_nr,
+                        "effect_weight": variant[weight_name],
+                    },
+                )
         else:
-            keys = [
-                "chr_name",
-                "chr_position",
-                "effect_allele",
-                "other_allele",
-                "effect_weight",
-                "hm_chr",
-                "hm_pos",
-                "hm_inferOtherAllele",
-                "hm_source",
-                "is_dominant",
-                "is_recessive",
-                "accession",
-                "row_nr",
-            ]
-
-            yield {k: variant[k] for k in keys if k in variant} | {
-                "accession": name,
-                "row_nr": row_nr,
-            }
+            yield ScoreVariant(**variant, **{"accession": name, "row_nr": row_nr})
 
         row_nr += 1
 
 
-def parse_dict(variants):
-    # TODO: use best data types when parsing lines
-    pass
-
-
 def get_columns(path) -> tuple[int, list[str]]:
     open_function = auto_open(path)
     with open_function(path, mode="rt") as f:

From ee16684aa0f508bb76c9d3f1af39019e2699e608 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Thu, 16 Nov 2023 16:48:50 +0000
Subject: [PATCH 22/40] add type hints

---
 pgscatalog_utils/scorefile/liftover.py    |  9 ++++--
 pgscatalog_utils/scorefile/qc.py          | 38 +++++++++++++++++------
 pgscatalog_utils/scorefile/scoringfile.py | 17 +++++-----
 3 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py
index 255fcf2..7ccd5a0 100644
--- a/pgscatalog_utils/scorefile/liftover.py
+++ b/pgscatalog_utils/scorefile/liftover.py
@@ -1,17 +1,22 @@
 import logging
 import os
+import typing
 
 import pyliftover
 
 from pgscatalog_utils.download.GenomeBuild import GenomeBuild
 from pgscatalog_utils.scorefile.config import Config
+from pgscatalog_utils.scorefile.scorevariant import ScoreVariant
 
 logger = logging.getLogger(__name__)
 
 
 def liftover(
-    variants, harmonised: bool, current_build: GenomeBuild, target_build: GenomeBuild
-):
+    variants: typing.Generator[ScoreVariant, None, None],
+    harmonised: bool,
+    current_build: GenomeBuild,
+    target_build: GenomeBuild,
+) -> typing.Generator[ScoreVariant, None, None]:
     if harmonised:
         skip_lo = True
     elif target_build == current_build:
diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index d98acee..75f50ba 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -5,11 +5,17 @@
 from pgscatalog_utils.scorefile.effecttype import EffectType
 from pgscatalog_utils.scorefile.header import ScoringFileHeader
 from pgscatalog_utils.scorefile.liftover import liftover
+from pgscatalog_utils.scorefile.scorevariant import ScoreVariant
 
 logger = logging.getLogger(__name__)
 
 
-def quality_control(variants, header: ScoringFileHeader, harmonised: bool, wide: bool):
+def quality_control(
+    variants: typing.Generator[ScoreVariant, None, None],
+    header: ScoringFileHeader,
+    harmonised: bool,
+    wide: bool,
+) -> typing.Generator[ScoreVariant, None, None]:
     # order is important for:
     # 1. liftover non-harmonised data (quite rare), failed lifts get None'd
     # 2. remap harmonised data, failed harmonisations get None'd
@@ -44,7 +50,9 @@ def quality_control(variants, header: ScoringFileHeader, harmonised: bool, wide:
     return variants
 
 
-def check_duplicates(variants):
+def check_duplicates(
+    variants: typing.Generator[ScoreVariant, None, None]
+) -> typing.Generator[ScoreVariant, None, None]:
     seen_ids: dict = {}
     current_accession: typing.Union[str, None] = None
     n_duplicates: int = 0
@@ -81,7 +89,9 @@ def check_duplicates(variants):
         )
 
 
-def drop_hla(variants):
+def drop_hla(
+    variants: typing.Generator[ScoreVariant, None, None]
+) -> typing.Generator[ScoreVariant, None, None]:
     n_dropped = 0
     for variant in variants:
         match variant:
@@ -94,7 +104,9 @@ def drop_hla(variants):
     logger.warning(f"{n_dropped} HLA alleles detected and dropped")
 
 
-def check_effect_weight(variants):
+def check_effect_weight(
+    variants: typing.Generator[ScoreVariant, None, None]
+) -> typing.Generator[ScoreVariant, None, None]:
     for variant in variants:
         try:
             float(variant["effect_weight"])
@@ -104,7 +116,9 @@ def check_effect_weight(variants):
             raise ValueError
 
 
-def assign_other_allele(variants):
+def assign_other_allele(
+    variants: typing.Generator[ScoreVariant, None, None]
+) -> typing.Generator[ScoreVariant, None, None]:
     n_dropped = 0
     for variant in variants:
         if "/" in variant["other_allele"]:
@@ -118,11 +132,13 @@ def assign_other_allele(variants):
         logger.warning("Other allele for these variants is set to missing")
 
 
-def assign_effect_type(variants):
+def assign_effect_type(
+    variants: typing.Generator[ScoreVariant, None, None]
+) -> typing.Generator[ScoreVariant, None, None]:
     for variant in variants:
         match (variant.get("is_recessive"), variant.get("is_dominant")):
             case (None, None) | ("FALSE", "FALSE"):
-                pass  # default value is additive
+                pass  # default value is additive, pass to break match and yield
             case ("FALSE", "TRUE"):
                 variant["effect_type"] = EffectType.DOMINANT
             case ("TRUE", "FALSE"):
@@ -133,7 +149,9 @@ def assign_effect_type(variants):
         yield variant
 
 
-def remap_harmonised(variants, harmonised: bool):
+def remap_harmonised(
+    variants: typing.Generator[ScoreVariant, None, None], harmonised: bool
+) -> typing.Generator[ScoreVariant, None, None]:
     if harmonised:
         for variant in variants:
             # using the harmonised field in the header to make sure we don't accidentally overwrite
@@ -150,7 +168,9 @@ def remap_harmonised(variants, harmonised: bool):
             yield variant
 
 
-def check_bad_variant(variants):
+def check_bad_variant(
+    variants: typing.Generator[ScoreVariant, None, None]
+) -> typing.Generator[ScoreVariant, None, None]:
     n_bad = 0
     for variant in variants:
         match variant:
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index e18ae00..976b969 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -12,7 +12,6 @@
 from pgscatalog_utils.scorefile.qc import quality_control
 from pgscatalog_utils.scorefile.scorevariant import ScoreVariant
 
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
@@ -24,7 +23,7 @@ class ScoringFile:
     genome_build: typing.Union[GenomeBuild, None]
     harmonised: bool
     fields: list[str]
-    variants: typing.Generator
+    variants: typing.Generator[ScoreVariant, None, None]
 
     def __post_init__(self):
         if self.header.HmPOS_build:
@@ -57,13 +56,15 @@ def from_path(cls, path: pathlib.Path):
         is_wide = detect_wide(cols)
 
         logger.info(f"Lazily reading variants from {path}")
-        variants = ScoringFile.read_variants(
+        variants: typing.Generator[
+            ScoreVariant, None, None
+        ] = ScoringFile.read_variants(
             path=path, start_line=start_line, fields=cols, name=name, is_wide=is_wide
         )
 
-        # note: these generator expressions aren't doing a bunch of iterations
+        # note: the qc generators aren't doing a bunch of nested iterations
         # it's just a data processing pipeline
-        variants = quality_control(
+        variants: typing.Generator[ScoreVariant, None, None] = quality_control(
             variants, header=header, harmonised=harmonised, wide=is_wide
         )
 
@@ -110,7 +111,9 @@ def generate_log(self, counted: typing.Counter):
         return {self.accession: log}
 
     @staticmethod
-    def read_variants(path, fields, start_line, name: str, is_wide: bool):
+    def read_variants(
+        path, fields, start_line, name: str, is_wide: bool
+    ) -> typing.Generator[ScoreVariant, None, None]:
         open_function = auto_open(path)
         row_nr = 0
 
@@ -126,7 +129,7 @@ def read_variants(path, fields, start_line, name: str, is_wide: bool):
 
                 csv_reader = csv.reader(batch, delimiter="\t")
                 yield from read_rows(csv_reader, fields, name, is_wide, row_nr)
-                # this is important for row_nr resets for each batch
+                # this is important because row_nr resets for each batch
                 row_nr += len(batch)
 
 

From 6be1dd3571c37d71e3910486aca1b1c9ced1b492 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Thu, 16 Nov 2023 17:05:37 +0000
Subject: [PATCH 23/40] remove coordinates from mandatory fields

---
 pgscatalog_utils/scorefile/scorevariant.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/pgscatalog_utils/scorefile/scorevariant.py b/pgscatalog_utils/scorefile/scorevariant.py
index 3b258a6..5bbc307 100644
--- a/pgscatalog_utils/scorefile/scorevariant.py
+++ b/pgscatalog_utils/scorefile/scorevariant.py
@@ -35,14 +35,14 @@ class ScoreVariant(collections.UserDict):
     """
 
     mandatory_fields: tuple[str] = (
-        "chr_name",
-        "chr_position",
         "effect_allele",
         "effect_weight",
         "accession",
         "row_nr",
     )
     optional_fields: tuple[str] = (
+        "chr_name",
+        "chr_position",
         "rsID",
         "other_allele",
         "hm_chr",
@@ -64,6 +64,20 @@ def __init__(self, **kwargs):
             if field not in self.data:
                 raise ValueError(f"Mandatory field '{field}' is missing.")
 
+        # note on coordinates / rsID not being mandatory
+        # ----------------------------------------------
+        # according to PGS Catalog scoring file standards:
+        #   - rsID is mandatory if genomic coordinates are missing
+        #   - genomic coordinates are mandatory if rsIDs are missing
+        # however I want to keep __init__ as simple (and fast) as possible
+        # millions of ScoreVariants may be instantiated
+        # so don't check, just initialise to None if missing
+
+        # practically speaking:
+        # 1) harmonised files may be missing coordinates, but have hm columns which we then use
+        # 2) we loudly warn about variants that are missing coordinates
+        # 3) custom scorefiles are expected to supply coordinates
+
         # set most optional fields to None...
         for field in self.optional_fields:
             self.data.setdefault(field, None)

From ea16f4e8f74fb5dd6f25100d97ae1192188cab92 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Mon, 20 Nov 2023 10:04:57 +0000
Subject: [PATCH 24/40] fix old scoring files

---
 pgscatalog_utils/download/GenomeBuild.py               | 10 +++++++---
 pgscatalog_utils/scorefile/qc.py                       |  2 +-
 pgscatalog_utils/scorefile/scoringfile.py              |  2 +-
 .../scorefile/{header.py => scoringfileheader.py}      |  2 +-
 4 files changed, 10 insertions(+), 6 deletions(-)
 rename pgscatalog_utils/scorefile/{header.py => scoringfileheader.py} (96%)

diff --git a/pgscatalog_utils/download/GenomeBuild.py b/pgscatalog_utils/download/GenomeBuild.py
index 23c8984..69fd8ab 100644
--- a/pgscatalog_utils/download/GenomeBuild.py
+++ b/pgscatalog_utils/download/GenomeBuild.py
@@ -4,6 +4,8 @@
 class GenomeBuild(Enum):
     GRCh37 = "GRCh37"
     GRCh38 = "GRCh38"
+    # just included to handle older files, incompatible unless harmonised:
+    NCBI36 = "NCBI36"  # ew
 
     def __str__(self):
         return str(self.value)
@@ -11,11 +13,13 @@ def __str__(self):
     @classmethod
     def from_string(cls, build):
         match build:
-            case "GRCh37" | "hg18":
+            case "GRCh37" | "hg19":
                 return cls(GenomeBuild.GRCh37)
-            case "GRCh38" | "hg19":
+            case "GRCh38" | "hg38":
                 return cls(GenomeBuild.GRCh38)
             case "NR":
                 return None
+            case "NCBI36" | "hg18":
+                return cls(GenomeBuild.NCBI36)
             case _:
-                raise Exception
+                raise Exception(f"Can't match {build=}")
diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index 75f50ba..5282bdd 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -3,7 +3,7 @@
 
 from pgscatalog_utils.scorefile.config import Config
 from pgscatalog_utils.scorefile.effecttype import EffectType
-from pgscatalog_utils.scorefile.header import ScoringFileHeader
+from pgscatalog_utils.scorefile.scoringfileheader import ScoringFileHeader
 from pgscatalog_utils.scorefile.liftover import liftover
 from pgscatalog_utils.scorefile.scorevariant import ScoreVariant
 
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index 976b969..95dff01 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -8,7 +8,7 @@
 
 from pgscatalog_utils.download.GenomeBuild import GenomeBuild
 from pgscatalog_utils.scorefile.config import Config
-from pgscatalog_utils.scorefile.header import ScoringFileHeader, auto_open
+from pgscatalog_utils.scorefile.scoringfileheader import ScoringFileHeader, auto_open
 from pgscatalog_utils.scorefile.qc import quality_control
 from pgscatalog_utils.scorefile.scorevariant import ScoreVariant
 
diff --git a/pgscatalog_utils/scorefile/header.py b/pgscatalog_utils/scorefile/scoringfileheader.py
similarity index 96%
rename from pgscatalog_utils/scorefile/header.py
rename to pgscatalog_utils/scorefile/scoringfileheader.py
index 82ea79d..07cf663 100644
--- a/pgscatalog_utils/scorefile/header.py
+++ b/pgscatalog_utils/scorefile/scoringfileheader.py
@@ -45,7 +45,7 @@ def from_path(cls, path: pathlib.Path):
             return ScoringFileHeader(**header_dict)
         else:
             # no header available
-            raise Exception("No header detected in scoring file")
+            raise Exception(f"No header detected in scoring file {path=}")
 
 
 def raw_header_to_dict(header):

From 7c4c84843d98d1d38a611e3b8b395b9b6fa74b88 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Mon, 20 Nov 2023 17:59:43 +0000
Subject: [PATCH 25/40] check effect alleles and complex scoring files

---
 pgscatalog_utils/scorefile/effectallele.py | 12 ++++++++++
 pgscatalog_utils/scorefile/qc.py           | 18 +++++++++++++++
 pgscatalog_utils/scorefile/scoringfile.py  |  3 ++-
 pgscatalog_utils/scorefile/write.py        | 26 ++++++++++++++++++++--
 4 files changed, 56 insertions(+), 3 deletions(-)
 create mode 100644 pgscatalog_utils/scorefile/effectallele.py

diff --git a/pgscatalog_utils/scorefile/effectallele.py b/pgscatalog_utils/scorefile/effectallele.py
new file mode 100644
index 0000000..92dafd9
--- /dev/null
+++ b/pgscatalog_utils/scorefile/effectallele.py
@@ -0,0 +1,12 @@
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class EffectAllele:
+    # (class attribute, so shared)
+    _valid_bases = frozenset({"A", "C", "T", "G"})
+
+    @classmethod
+    def is_valid(cls, effect_allele: str) -> bool:
+        return not frozenset(effect_allele) - cls._valid_bases
diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index 5282bdd..d014aa0 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -1,6 +1,8 @@
 import logging
 import typing
 
+from pgscatalog_utils.scorefile.effectallele import EffectAllele
+
 from pgscatalog_utils.scorefile.config import Config
 from pgscatalog_utils.scorefile.effecttype import EffectType
 from pgscatalog_utils.scorefile.scoringfileheader import ScoringFileHeader
@@ -22,6 +24,7 @@ def quality_control(
     # 3. check and optionally drop bad variants
     # where a bad variant has None in a mandatory ScoreVariant field
     # then continue with other QC
+    logger.info(f"Starting quality control checks for {header.pgs_id=}")
 
     if Config.liftover:
         variants = liftover(
@@ -40,6 +43,7 @@ def quality_control(
     variants = assign_effect_type(variants)
     variants = check_effect_weight(variants)
     variants = assign_other_allele(variants)
+    variants = check_effect_allele(variants)
 
     if wide:
         # wide data must be sorted because check_duplicates requires sorted input
@@ -184,3 +188,17 @@ def check_bad_variant(
 
     if n_bad > 1:
         logger.warning(f"{n_bad} bad variants")
+
+
+def check_effect_allele(
+    variants: typing.Generator[ScoreVariant, None, None]
+) -> typing.Generator[ScoreVariant, None, None]:
+    n_bad = 0
+    for variant in variants:
+        if not EffectAllele.is_valid(variant["effect_allele"]):
+            n_bad += 1
+
+        yield variant
+
+    if n_bad > 1:
+        logger.warning(f"{n_bad} variants have invalid effect alleles (not ACTG)")
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index 95dff01..15267d5 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -91,9 +91,10 @@ def generate_log(self, counted: typing.Counter):
         if (
             int(log["variants_number"]) != counted["n_variants"]
             and not Config.drop_missing
+            and counted.get("complex", 0) == 0
         ):
             raise Exception(
-                f"Mismatch between variants_number and counted output {self.accession}"
+                f"Mismatch between header ({log['variants_number']}) and counted output ({counted['n_variants']}) for {self.accession}"
             )
 
         # multiple terms may be separated with a pipe
diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py
index 57ceb91..db8fd31 100644
--- a/pgscatalog_utils/scorefile/write.py
+++ b/pgscatalog_utils/scorefile/write.py
@@ -9,6 +9,8 @@
 from itertools import islice
 
 from pgscatalog_utils.scorefile.config import Config
+from pgscatalog_utils.scorefile.effectallele import EffectAllele
+from pgscatalog_utils.scorefile.scorevariant import ScoreVariant
 from pgscatalog_utils.scorefile.scoringfile import ScoringFile
 
 logger = logging.getLogger(__name__)
@@ -112,9 +114,29 @@ def write_combined(
     return log
 
 
-def calculate_log(batch, log: list[Counter]) -> list[Counter]:
+def calculate_log(batch: list[ScoreVariant], log: list[Counter]) -> list[Counter]:
     # these statistics can only be generated while iterating through variants
     n_variants = Counter("n_variants" for item in batch)
+    complex_scorefile = Counter(detect_complex(batch))
     hm_source = Counter(item["hm_source"] for item in batch if "hm_source" in item)
-    log.extend([n_variants, hm_source])
+    log.extend([n_variants + hm_source + complex_scorefile])
     return log
+
+
+def detect_complex(batch: list[ScoreVariant]) -> typing.Generator[str, None, None]:
+    """Some older scoring files in the PGS Catalog are complicated
+    We agreed to skip some checks on these odd files and just reproduce them faithfully
+    They often require bespoke set up to support interaction terms, etc
+    """
+    complex_keys = {"is_haplotype", "is_diplotype", "is_interaction"}
+
+    for key in complex_keys:
+        for variant in batch:
+            if not EffectAllele.is_valid(variant["effect_allele"]):
+                yield "complex"
+
+            if variant.get(key, False) == "True":
+                # explicitly check string value with == because
+                # a scoring file with a column with all false values is valid
+                # (i.e. don't just check key presence)
+                yield "complex"

From 51aa0f219a10b90569552e6784750d78b47ce31a Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Tue, 21 Nov 2023 11:48:57 +0000
Subject: [PATCH 26/40] don't access __annotations__ directly

---
 pgscatalog_utils/scorefile/scoringfileheader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pgscatalog_utils/scorefile/scoringfileheader.py b/pgscatalog_utils/scorefile/scoringfileheader.py
index 07cf663..a06622a 100644
--- a/pgscatalog_utils/scorefile/scoringfileheader.py
+++ b/pgscatalog_utils/scorefile/scoringfileheader.py
@@ -1,4 +1,5 @@
 import gzip
+import inspect
 import pathlib
 from dataclasses import dataclass
 
@@ -33,7 +34,7 @@ def __post_init__(self):
     def from_path(cls, path: pathlib.Path):
         raw_header: dict = raw_header_to_dict(read_header(path))
         # only keep keys needed by class but support partial headers with None values
-        keep_keys = ScoringFileHeader.__annotations__.keys()
+        keep_keys = inspect.get_annotations(ScoringFileHeader).keys()
         header_dict = {k: raw_header.get(k) for k in keep_keys}
         # ... so we can unpack the dict into a dataclass
 

From 9938bbdfaa052644c24057e764c96652a6453ad2 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Tue, 21 Nov 2023 11:49:09 +0000
Subject: [PATCH 27/40] remove logger

---
 pgscatalog_utils/scorefile/effectallele.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/pgscatalog_utils/scorefile/effectallele.py b/pgscatalog_utils/scorefile/effectallele.py
index 92dafd9..14412a3 100644
--- a/pgscatalog_utils/scorefile/effectallele.py
+++ b/pgscatalog_utils/scorefile/effectallele.py
@@ -1,8 +1,3 @@
-import logging
-
-logger = logging.getLogger(__name__)
-
-
 class EffectAllele:
     # (class attribute, so shared)
     _valid_bases = frozenset({"A", "C", "T", "G"})

From af1eef223c0440e00d2e69aa21cdbcd546a8546a Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Wed, 22 Nov 2023 11:27:28 +0000
Subject: [PATCH 28/40] warn about complex files and variant mismatch

---
 pgscatalog_utils/scorefile/qc.py          | 25 +++++++++++++++++++++--
 pgscatalog_utils/scorefile/scoringfile.py |  8 +++++---
 pgscatalog_utils/scorefile/write.py       | 23 +--------------------
 3 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index d014aa0..a99ed40 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -24,8 +24,6 @@ def quality_control(
     # 3. check and optionally drop bad variants
     # where a bad variant has None in a mandatory ScoreVariant field
     # then continue with other QC
-    logger.info(f"Starting quality control checks for {header.pgs_id=}")
-
     if Config.liftover:
         variants = liftover(
             variants,
@@ -44,6 +42,7 @@ def quality_control(
     variants = check_effect_weight(variants)
     variants = assign_other_allele(variants)
     variants = check_effect_allele(variants)
+    variants = detect_complex(variants)
 
     if wide:
         # wide data must be sorted because check_duplicates requires sorted input
@@ -202,3 +201,25 @@ def check_effect_allele(
 
     if n_bad > 1:
         logger.warning(f"{n_bad} variants have invalid effect alleles (not ACTG)")
+
+
+def detect_complex(
+    variants: typing.Generator[ScoreVariant, None, None]
+) -> typing.Generator[ScoreVariant, None, None]:
+    """Some older scoring files in the PGS Catalog are complicated.
+    They often require bespoke set up to support interaction terms, etc
+    """
+    complex_keys = {"is_haplotype", "is_diplotype", "is_interaction"}
+    is_complex = False
+
+    for variant in variants:
+        if not is_complex:
+            is_complex = any(key in variant for key in complex_keys)
+
+        yield variant
+
+    if is_complex:
+        logger.warning("Complex scoring file detected")
+        logger.warning(
+            "Complex files are difficult to calculate properly and may require manual intervention"
+        )
diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py
index 15267d5..ca02b28 100644
--- a/pgscatalog_utils/scorefile/scoringfile.py
+++ b/pgscatalog_utils/scorefile/scoringfile.py
@@ -91,10 +91,12 @@ def generate_log(self, counted: typing.Counter):
         if (
             int(log["variants_number"]) != counted["n_variants"]
             and not Config.drop_missing
-            and counted.get("complex", 0) == 0
         ):
-            raise Exception(
-                f"Mismatch between header ({log['variants_number']}) and counted output ({counted['n_variants']}) for {self.accession}"
+            logger.warning(
+                f"Mismatch between header ({log['variants_number']}) and output row count ({counted['n_variants']}) for {self.accession}"
+            )
+            logger.warning(
+                "This can happen with older scoring files in the PGS Catalog (e.g. PGS000028)"
             )
 
         # multiple terms may be separated with a pipe
diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py
index db8fd31..efe158f 100644
--- a/pgscatalog_utils/scorefile/write.py
+++ b/pgscatalog_utils/scorefile/write.py
@@ -9,7 +9,6 @@
 from itertools import islice
 
 from pgscatalog_utils.scorefile.config import Config
-from pgscatalog_utils.scorefile.effectallele import EffectAllele
 from pgscatalog_utils.scorefile.scorevariant import ScoreVariant
 from pgscatalog_utils.scorefile.scoringfile import ScoringFile
 
@@ -117,26 +116,6 @@ def write_combined(
 def calculate_log(batch: list[ScoreVariant], log: list[Counter]) -> list[Counter]:
     # these statistics can only be generated while iterating through variants
     n_variants = Counter("n_variants" for item in batch)
-    complex_scorefile = Counter(detect_complex(batch))
     hm_source = Counter(item["hm_source"] for item in batch if "hm_source" in item)
-    log.extend([n_variants + hm_source + complex_scorefile])
+    log.extend([n_variants + hm_source])
     return log
-
-
-def detect_complex(batch: list[ScoreVariant]) -> typing.Generator[str, None, None]:
-    """Some older scoring files in the PGS Catalog are complicated
-    We agreed to skip some checks on these odd files and just reproduce them faithfully
-    They often require bespoke set up to support interaction terms, etc
-    """
-    complex_keys = {"is_haplotype", "is_diplotype", "is_interaction"}
-
-    for key in complex_keys:
-        for variant in batch:
-            if not EffectAllele.is_valid(variant["effect_allele"]):
-                yield "complex"
-
-            if variant.get(key, False) == "True":
-                # explicitly check string value with == because
-                # a scoring file with a column with all false values is valid
-                # (i.e. don't just check key presence)
-                yield "complex"

From 54fd6eab2ac6d86dc72524a88bcc6cd7a59bb37b Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Wed, 6 Dec 2023 11:56:45 +0000
Subject: [PATCH 29/40] refactor scorevariant from userdict to class with
 __slots__

---
 pgscatalog_utils/scorefile/effectallele.py |  17 ++-
 pgscatalog_utils/scorefile/effecttype.py   |   4 +
 pgscatalog_utils/scorefile/qc.py           |  47 +++----
 pgscatalog_utils/scorefile/scorevariant.py | 150 +++++++++++++--------
 pgscatalog_utils/scorefile/write.py        |   9 +-
 5 files changed, 137 insertions(+), 90 deletions(-)

diff --git a/pgscatalog_utils/scorefile/effectallele.py b/pgscatalog_utils/scorefile/effectallele.py
index 14412a3..6f0dfcb 100644
--- a/pgscatalog_utils/scorefile/effectallele.py
+++ b/pgscatalog_utils/scorefile/effectallele.py
@@ -1,7 +1,16 @@
 class EffectAllele:
-    # (class attribute, so shared)
     _valid_bases = frozenset({"A", "C", "T", "G"})
+    __slots__ = ("allele", "is_valid")
 
-    @classmethod
-    def is_valid(cls, effect_allele: str) -> bool:
-        return not frozenset(effect_allele) - cls._valid_bases
+    def __init__(self, allele: str):
+        self.allele = allele
+        self.is_valid = self.is_valid_allele()
+
+    def __repr__(self):
+        return f'{type(self).__name__}("{self.allele}")'
+
+    def __str__(self):
+        return self.allele
+
+    def is_valid_allele(self) -> bool:
+        return not frozenset(self.allele) - self._valid_bases
diff --git a/pgscatalog_utils/scorefile/effecttype.py b/pgscatalog_utils/scorefile/effecttype.py
index 0d51f14..4878072 100644
--- a/pgscatalog_utils/scorefile/effecttype.py
+++ b/pgscatalog_utils/scorefile/effecttype.py
@@ -8,3 +8,7 @@ class EffectType(Enum):
 
     def __str__(self):
         return str(self.value)
+
+    def __repr__(self):
+        # pasting __repr__ output should be sufficient to construct the class
+        return f"{type(self).__name__}.{self.name}"
diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index a99ed40..50fcb52 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -1,7 +1,6 @@
 import logging
 import typing
 
-from pgscatalog_utils.scorefile.effectallele import EffectAllele
 
 from pgscatalog_utils.scorefile.config import Config
 from pgscatalog_utils.scorefile.effecttype import EffectType
@@ -61,27 +60,25 @@ def check_duplicates(
     n_duplicates: int = 0
     n_variants: int = 0
     for variant in variants:
-        accession: str = variant["accession"]
+        accession: str = variant.accession
 
         if accession != current_accession:
             seen_ids = {}
             current_accession = accession
 
         # None other allele -> empty string
-        id: str = ":".join(
+        variant_id: str = ":".join(
             [
-                str(variant[k] or "")
+                str(getattr(variant, k) or "")
                 for k in ["chr_name", "chr_position", "effect_allele", "other_allele"]
             ]
         )
 
-        if id in seen_ids:
-            variant["is_duplicated"] = True
+        if variant_id in seen_ids:
+            variant.is_duplicated = True
             n_duplicates += 1
-        else:
-            variant["is_duplicated"] = False
 
-        seen_ids[id] = True
+        seen_ids[variant_id] = True
 
         yield variant
         n_variants += 1
@@ -112,7 +109,7 @@ def check_effect_weight(
 ) -> typing.Generator[ScoreVariant, None, None]:
     for variant in variants:
         try:
-            float(variant["effect_weight"])
+            float(variant.effect_weight)
             yield variant
         except ValueError:
             logger.critical(f"{variant} has bad effect weight")
@@ -124,9 +121,9 @@ def assign_other_allele(
 ) -> typing.Generator[ScoreVariant, None, None]:
     n_dropped = 0
     for variant in variants:
-        if "/" in variant["other_allele"]:
+        if "/" in variant.other_allele:
             n_dropped += 1
-            variant["other_allele"] = None
+            variant.other_allele = None
 
         yield variant
 
@@ -139,13 +136,13 @@ def assign_effect_type(
     variants: typing.Generator[ScoreVariant, None, None]
 ) -> typing.Generator[ScoreVariant, None, None]:
     for variant in variants:
-        match (variant.get("is_recessive"), variant.get("is_dominant")):
+        match (variant.is_recessive, variant.is_dominant):
             case (None, None) | ("FALSE", "FALSE"):
                 pass  # default value is additive, pass to break match and yield
             case ("FALSE", "TRUE"):
-                variant["effect_type"] = EffectType.DOMINANT
+                variant.effect_type = EffectType.DOMINANT
             case ("TRUE", "FALSE"):
-                variant["effect_type"] = EffectType.RECESSIVE
+                variant.effect_type = EffectType.RECESSIVE
             case _:
                 logger.critical(f"Bad effect type setting: {variant}")
                 raise Exception
@@ -160,10 +157,10 @@ def remap_harmonised(
             # using the harmonised field in the header to make sure we don't accidentally overwrite
             # positions with empty data (e.g. in an unharmonised file)
             # if harmonisation has failed we _always_ want to use that information
-            variant["chr_name"] = variant["hm_chr"]
-            variant["chr_position"] = variant["hm_pos"]
-            if variant["other_allele"] is None:
-                variant["other_allele"] = variant["hm_inferOtherAllele"]
+            variant.chr_name = variant.hm_chr
+            variant.chr_position = variant.hm_pos
+            if variant.other_allele is None:
+                variant.other_allele = variant.hm_inferOtherAllele
             yield variant
     else:
         for variant in variants:
@@ -177,7 +174,11 @@ def check_bad_variant(
     n_bad = 0
     for variant in variants:
         match variant:
-            case {"chr_name": None} | {"chr_position": None} | {"effect_allele": None}:
+            case (
+                ScoreVariant(chr_name=None)
+                | ScoreVariant(chr_position=None)
+                | ScoreVariant(effect_allele=None)
+            ):
                 # (effect weight checked separately)
                 n_bad += 1
                 if not Config.drop_missing:
@@ -194,7 +195,7 @@ def check_effect_allele(
 ) -> typing.Generator[ScoreVariant, None, None]:
     n_bad = 0
     for variant in variants:
-        if not EffectAllele.is_valid(variant["effect_allele"]):
+        if not variant.effect_allele.is_valid:
             n_bad += 1
 
         yield variant
@@ -209,12 +210,12 @@ def detect_complex(
     """Some older scoring files in the PGS Catalog are complicated.
     They often require bespoke set up to support interaction terms, etc
     """
-    complex_keys = {"is_haplotype", "is_diplotype", "is_interaction"}
     is_complex = False
 
     for variant in variants:
         if not is_complex:
-            is_complex = any(key in variant for key in complex_keys)
+            if variant.is_complex:
+                is_complex = True
 
         yield variant
 
diff --git a/pgscatalog_utils/scorefile/scorevariant.py b/pgscatalog_utils/scorefile/scorevariant.py
index 5bbc307..094c6d9 100644
--- a/pgscatalog_utils/scorefile/scorevariant.py
+++ b/pgscatalog_utils/scorefile/scorevariant.py
@@ -1,39 +1,8 @@
-"""
-This module contains the class ScoreVariant, which is a custom dictionary used to consistently represent rows in a PGS Catalog scoring file
-"""
-import collections
-
+from pgscatalog_utils.scorefile.effectallele import EffectAllele
 from pgscatalog_utils.scorefile.effecttype import EffectType
 
 
-class ScoreVariant(collections.UserDict):
-    """A single variant from a scoring file structured to follow PGS Catalog standards,
-    typically extracted from a row in a scoring file.
-
-     See https://www.pgscatalog.org/downloads/#dl_scoring_files for field descriptions.
-
-     This class is intentionally simple (a dict that checks for mandatory keys and fills
-     optional keys) because a more complicated __init__ will be slow when lots of variants
-     are read from a file. dicts use fast C magic, so try not to interfere too much.
-
-     Some additional keys are included for quality control:
-     - accession: a unique identifier to group variants in the same score)
-     - row_nr: an incrementing integer, used to track the number of variants in an accession
-     - is_duplicated: a label to mark variants with the same coordinates and alleles
-     - effect_type: additive, recessive, or dominant
-
-     >>> variant = ScoreVariant(**{"chr_name": "1", "chr_position": 1, "effect_allele": "A", "other_allele": "G", "effect_weight": 0.5, "accession": "PGS000822", "row_nr": 0})
-     >>> variant
-     {'chr_name': '1', 'chr_position': 1, 'effect_allele': 'A', 'other_allele': 'G', 'effect_weight': 0.5, 'accession': 'PGS000822', 'row_nr': 0, 'rsID': None, 'hm_chr': None, 'hm_pos': None, 'hm_inferOtherAllele': None, 'hm_source': None, 'is_dominant': None, 'is_recessive': None, 'hm_rsID': None, 'hm_match_chr': None, 'hm_match_pos': None, 'is_duplicated': None, 'effect_type': <EffectType.ADDITIVE: 'additive'>}
-
-     Mandatory data fields match PGS Catalog harmonised data standards:
-
-    >>> ScoreVariant(**{"chr_name": "1", "chr_position": 1})
-    Traceback (most recent call last):
-        ...
-    ValueError: Mandatory field 'effect_allele' is missing.
-    """
-
+class ScoreVariant:
     mandatory_fields: tuple[str] = (
         "effect_allele",
         "effect_weight",
@@ -55,32 +24,97 @@ class ScoreVariant(collections.UserDict):
         "hm_match_chr",
         "hm_match_pos",
         "is_duplicated",
+        "effect_type",
     )
 
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)  # creates the dict
-
-        for field in self.mandatory_fields:
-            if field not in self.data:
-                raise ValueError(f"Mandatory field '{field}' is missing.")
-
-        # note on coordinates / rsID not being mandatory
-        # ----------------------------------------------
-        # according to PGS Catalog scoring file standards:
-        #   - rsID is mandatory if genomic coordinates are missing
-        #   - genomic coordinates are mandatory if rsIDs are missing
-        # however I want to keep __init__ as simple (and fast) as possible
-        # millions of ScoreVariants may be instantiated
-        # so don't check, just initialise to None if missing
+    complex_fields: tuple[str] = ("is_haplotype", "is_diplotype", "is_interaction")
 
-        # practically speaking:
-        # 1) harmonised files may be missing coordinates, but have hm columns which we then use
-        # 2) we loudly warn about variants that are missing coordinates
-        # 3) custom scorefiles are expected to supply coordinates
-
-        # set most optional fields to None...
-        for field in self.optional_fields:
-            self.data.setdefault(field, None)
+    # column names for output are used by __iter__ and when writing out
+    output_fields: tuple[str] = (
+        "chr_name",
+        "chr_position",
+        "effect_allele",
+        "other_allele",
+        "effect_weight",
+        "effect_type",
+        "is_duplicated",
+        "accession",
+        "row_nr",
+    )
 
-        # ... except effect type, as the vast majority of variants are additive
-        self.data.setdefault("effect_type", EffectType.ADDITIVE)
+    # slots uses magic to improve speed and memory when making millions of objects
+    __slots__ = mandatory_fields + optional_fields + ("is_complex",)
+
+    # __init__ is intentionally verbose and avoids using loops or trickery to work:
+    #   - attributes won't change often
+    #   - class accepts keyword parameters only to init (not positional)
+    #   - type hints are helpful in parameters
+    #   - setting sensible defaults for optional fields is clear
+    #   - being verbose helps prevent IDE warnings
+    # extra kwargs are silently ignored
+    # (yes, effect_weight is treated as a str, want to avoid rounding errors at this stage)
+    def __init__(
+        self,
+        *,
+        effect_allele: str,
+        effect_weight: str,
+        accession: str,
+        row_nr: int,
+        chr_name: str = None,
+        chr_position: int = None,
+        rsID: str = None,
+        other_allele: str = None,
+        hm_chr: str = None,
+        hm_pos: int = None,
+        hm_inferOtherAllele: str = None,
+        hm_source: str = None,
+        is_dominant: str = None,
+        is_recessive: str = None,
+        hm_rsID: str = None,
+        hm_match_chr: str = None,
+        hm_match_pos: str = None,
+        is_duplicated: bool = False,
+        effect_type: EffectType = EffectType.ADDITIVE,
+        is_complex: bool = False,
+        **kwargs,
+    ):
+        # start with mandatory attributes
+        self.effect_allele: EffectAllele = EffectAllele(effect_allele)
+        self.effect_weight: str = effect_weight
+        self.accession = accession
+        self.row_nr = row_nr
+
+        # now set optional fields
+        self.chr_name = chr_name
+        self.chr_position = chr_position
+        self.rsID = rsID
+        self.other_allele = other_allele
+        self.hm_chr = hm_chr
+        self.hm_pos = hm_pos
+        self.hm_inferOtherAllele = hm_inferOtherAllele
+        self.hm_source = hm_source
+        self.is_dominant = is_dominant
+        self.is_recessive = is_recessive
+        self.hm_rsID = hm_rsID
+        self.hm_match_chr = hm_match_chr
+        self.hm_match_pos = hm_match_pos
+        self.is_duplicated = is_duplicated
+        self.effect_type = effect_type
+
+        # these fields are important to check if variants are complex
+        if any([x in kwargs for x in self.complex_fields]):
+            is_complex = True
+        self.is_complex = is_complex
+
+    def __repr__(self):
+        class_name = type(self).__name__
+        values = {}
+
+        for key in ScoreVariant.__slots__:
+            values[key] = getattr(self, key, None)
+
+        return f"{class_name}({values})"
+
+    def __iter__(self):
+        for attr in self.output_fields:
+            yield getattr(self, attr)
diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py
index efe158f..9424fbc 100644
--- a/pgscatalog_utils/scorefile/write.py
+++ b/pgscatalog_utils/scorefile/write.py
@@ -50,15 +50,14 @@ def __init__(self, compress, filename):
     def write(self, batch):
         mode = "at" if os.path.exists(self.filename) else "wt"
         with self.open_function(self.filename, mode) as f:
-            writer = csv.DictWriter(
+            writer = csv.writer(
                 f,
-                fieldnames=self.fieldnames,
                 delimiter="\t",
-                extrasaction="ignore",
                 lineterminator="\n",
             )
             if mode == "wt":
-                writer.writeheader()
+                writer.writerow(ScoreVariant.output_fields)
+
             writer.writerows(batch)
 
 
@@ -116,6 +115,6 @@ def write_combined(
 def calculate_log(batch: list[ScoreVariant], log: list[Counter]) -> list[Counter]:
     # these statistics can only be generated while iterating through variants
     n_variants = Counter("n_variants" for item in batch)
-    hm_source = Counter(item["hm_source"] for item in batch if "hm_source" in item)
+    hm_source = Counter(getattr(item, "hm_source") for item in batch)
     log.extend([n_variants + hm_source])
     return log

From 42a580fbc3815de3de8eef2f22a16e188e332b22 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Thu, 7 Dec 2023 17:19:23 +0000
Subject: [PATCH 30/40] fix __repr__ and type hints

---
 pgscatalog_utils/scorefile/scorevariant.py | 45 ++++++++++++----------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/pgscatalog_utils/scorefile/scorevariant.py b/pgscatalog_utils/scorefile/scorevariant.py
index 094c6d9..0d367df 100644
--- a/pgscatalog_utils/scorefile/scorevariant.py
+++ b/pgscatalog_utils/scorefile/scorevariant.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 from pgscatalog_utils.scorefile.effectallele import EffectAllele
 from pgscatalog_utils.scorefile.effecttype import EffectType
 
@@ -26,7 +28,6 @@ class ScoreVariant:
         "is_duplicated",
         "effect_type",
     )
-
     complex_fields: tuple[str] = ("is_haplotype", "is_diplotype", "is_interaction")
 
     # column names for output are used by __iter__ and when writing out
@@ -81,30 +82,30 @@ def __init__(
         # start with mandatory attributes
         self.effect_allele: EffectAllele = EffectAllele(effect_allele)
         self.effect_weight: str = effect_weight
-        self.accession = accession
-        self.row_nr = row_nr
+        self.accession: str = accession
+        self.row_nr: int = int(row_nr)
 
         # now set optional fields
-        self.chr_name = chr_name
-        self.chr_position = chr_position
-        self.rsID = rsID
-        self.other_allele = other_allele
-        self.hm_chr = hm_chr
-        self.hm_pos = hm_pos
-        self.hm_inferOtherAllele = hm_inferOtherAllele
-        self.hm_source = hm_source
-        self.is_dominant = is_dominant
-        self.is_recessive = is_recessive
-        self.hm_rsID = hm_rsID
-        self.hm_match_chr = hm_match_chr
-        self.hm_match_pos = hm_match_pos
-        self.is_duplicated = is_duplicated
-        self.effect_type = effect_type
+        self.chr_name: Optional[str] = chr_name
+        self.chr_position: Optional[str] = chr_position
+        self.rsID: Optional[str] = rsID
+        self.other_allele: Optional[str] = other_allele
+        self.hm_chr: Optional[str] = hm_chr
+        self.hm_pos: Optional[int] = hm_pos
+        self.hm_inferOtherAllele: Optional[str] = hm_inferOtherAllele
+        self.hm_source: Optional[str] = hm_source
+        self.is_dominant: Optional[bool] = is_dominant
+        self.is_recessive: Optional[bool] = is_recessive
+        self.hm_rsID: Optional[str] = hm_rsID
+        self.hm_match_chr: Optional[str] = hm_match_chr
+        self.hm_match_pos: Optional[str] = hm_match_pos
+        self.is_duplicated: Optional[bool] = is_duplicated
+        self.effect_type: EffectType = effect_type
 
         # these fields are important to check if variants are complex
         if any([x in kwargs for x in self.complex_fields]):
             is_complex = True
-        self.is_complex = is_complex
+        self.is_complex: bool = is_complex
 
     def __repr__(self):
         class_name = type(self).__name__
@@ -113,7 +114,11 @@ def __repr__(self):
         for key in ScoreVariant.__slots__:
             values[key] = getattr(self, key, None)
 
-        return f"{class_name}({values})"
+        # extract str parameter for effect allele
+        values["effect_allele"] = values["effect_allele"].allele
+
+        params = ",".join([f"{k}={repr(v)}" for k, v in values.items()])
+        return f"{class_name}({params})"
 
     def __iter__(self):
         for attr in self.output_fields:

From 3c0444597259a19e91e708c4a185d3d5c2e2883b Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Mon, 11 Dec 2023 15:31:38 +0000
Subject: [PATCH 31/40] add pyarrow support

---
 .../scorefile/combine_scorefiles.py           |  2 +-
 pgscatalog_utils/scorefile/scorevariant.py    | 16 +++-
 pgscatalog_utils/scorefile/write.py           | 79 +++++++++++++++++--
 poetry.lock                                   | 50 +++++++++++-
 pyproject.toml                                |  1 +
 5 files changed, 136 insertions(+), 12 deletions(-)

diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py
index 2532f8b..bce5565 100644
--- a/pgscatalog_utils/scorefile/combine_scorefiles.py
+++ b/pgscatalog_utils/scorefile/combine_scorefiles.py
@@ -19,7 +19,7 @@ def combine_scorefiles():
     logger = logging.getLogger(__name__)
     set_logging_level(args.verbose)
 
-    Config.batch_size = 20000
+    Config.batch_size = 100000
     Config.drop_missing = args.drop_missing
     Config.target_build = GenomeBuild.from_string(args.target_build)
     Config.liftover = args.liftover
diff --git a/pgscatalog_utils/scorefile/scorevariant.py b/pgscatalog_utils/scorefile/scorevariant.py
index 0d367df..38135dc 100644
--- a/pgscatalog_utils/scorefile/scorevariant.py
+++ b/pgscatalog_utils/scorefile/scorevariant.py
@@ -87,11 +87,23 @@ def __init__(
 
         # now set optional fields
         self.chr_name: Optional[str] = chr_name
-        self.chr_position: Optional[str] = chr_position
+
+        # casting to int is important for arrow export
+        try:
+            self.chr_position: Optional[int] = int(chr_position)
+        except (ValueError, TypeError):
+            self.chr_position = None
+
         self.rsID: Optional[str] = rsID
         self.other_allele: Optional[str] = other_allele
         self.hm_chr: Optional[str] = hm_chr
-        self.hm_pos: Optional[int] = hm_pos
+
+        # casting to int is important when harmonised data may replace chr_position
+        try:
+            self.hm_pos: Optional[int] = int(hm_pos)
+        except (ValueError, TypeError):
+            self.hm_pos = None
+
         self.hm_inferOtherAllele: Optional[str] = hm_inferOtherAllele
         self.hm_source: Optional[str] = hm_source
         self.is_dominant: Optional[bool] = is_dominant
diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py
index 9424fbc..1e43594 100644
--- a/pgscatalog_utils/scorefile/write.py
+++ b/pgscatalog_utils/scorefile/write.py
@@ -12,6 +12,13 @@
 from pgscatalog_utils.scorefile.scorevariant import ScoreVariant
 from pgscatalog_utils.scorefile.scoringfile import ScoringFile
 
+try:
+    import pyarrow as pa
+
+    PYARROW_AVAILABLE = True
+except ImportError:
+    PYARROW_AVAILABLE = False
+
 logger = logging.getLogger(__name__)
 
 
@@ -82,18 +89,74 @@ def write(self, batch):
         conn.close()
 
 
+class PyarrowWriter(DataWriter):
+    if PYARROW_AVAILABLE:
+        schema = pa.schema(
+            [
+                pa.field("chr_name", pa.string()),
+                pa.field("chr_position", pa.uint64()),
+                pa.field("effect_allele", pa.string()),
+                pa.field("other_allele", pa.string()),
+                pa.field("effect_weight", pa.string()),
+                pa.field("effect_type", pa.string()),
+                pa.field("is_duplicated", pa.bool_()),
+                pa.field("accession", pa.string()),
+                pa.field("row_nr", pa.uint64()),
+            ]
+        )
+
+    def __init__(self, filename):
+        if not PYARROW_AVAILABLE:
+            # TODO: provide a pip command
+            raise ImportError(
+                "pyarrow output not available, please install pyarrow as listed in the pyproject.toml extras section"
+            )
+        super().__init__(filename)
+
+        self._sink = pa.OSFile(self.filename, "wb")
+        self._writer: pa.RecordBatchFileWriter = pa.ipc.new_file(
+            self._sink, self.schema
+        )
+
+    def write(self, batch: list[ScoreVariant]):
+        batch_dict = {
+            "chr_name": [x.chr_name for x in batch],
+            "chr_position": [x.chr_position for x in batch],
+            "effect_allele": [str(x.effect_allele) for x in batch],
+            "other_allele": [x.other_allele for x in batch],
+            "effect_weight": [x.effect_weight for x in batch],
+            "effect_type": [str(x.effect_type) for x in batch],
+            "is_duplicated": [x.is_duplicated for x in batch],
+            "accession": [x.accession for x in batch],
+            "row_nr": [x.row_nr for x in batch],
+        }
+
+        record_batch = pa.RecordBatch.from_pydict(batch_dict, schema=self.schema)
+        self._writer.write(record_batch)
+
+    def __del__(self):
+        # it's very important to close the writer and file, or it gets corrupted
+        # can't use a with statement, so close when the object gets deleted
+        self._writer.close()
+        if not self._sink.closed:
+            self._sink.close()
+
+
 def write_combined(
     scoring_files: list[ScoringFile], out_path: str
 ) -> dict[str : typing.Counter]:
     # compresslevel can be really slow, default is 9
-    if out_path.endswith("gz"):
-        writer = TextFileWriter(compress=True, filename=out_path)
-    elif out_path.endswith("txt"):
-        writer = TextFileWriter(compress=False, filename=out_path)
-    elif out_path.endswith(".sqlite"):
-        writer = SqliteWriter(filename=out_path)
-    else:
-        raise Exception("Can't configure writer, please check out_path")
+    match fn := out_path.lower():
+        case _ if fn.endswith("gz"):
+            writer = TextFileWriter(compress=True, filename=out_path)
+        case _ if fn.endswith("txt"):
+            writer = TextFileWriter(compress=False, filename=out_path)
+        case _ if fn.endswith("sqlite"):
+            writer = SqliteWriter(filename=out_path)
+        case _ if fn.endswith("ipc"):
+            writer = PyarrowWriter(filename=out_path)
+        case _:
+            raise ValueError(f"Unsupported file extension: {out_path}")
 
     counts = []
     log = {}
diff --git a/poetry.lock b/poetry.lock
index 4c1fda4..05b2c77 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2245,6 +2245,54 @@ files = [
 [package.extras]
 tests = ["pytest"]
 
+[[package]]
+name = "pyarrow"
+version = "14.0.1"
+description = "Python library for Apache Arrow"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pyarrow-14.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:96d64e5ba7dceb519a955e5eeb5c9adcfd63f73a56aea4722e2cc81364fc567a"},
+    {file = "pyarrow-14.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a8ae88c0038d1bc362a682320112ee6774f006134cd5afc291591ee4bc06505"},
+    {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f6f053cb66dc24091f5511e5920e45c83107f954a21032feadc7b9e3a8e7851"},
+    {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:906b0dc25f2be12e95975722f1e60e162437023f490dbd80d0deb7375baf3171"},
+    {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:78d4a77a46a7de9388b653af1c4ce539350726cd9af62e0831e4f2bd0c95a2f4"},
+    {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:06ca79080ef89d6529bb8e5074d4b4f6086143b2520494fcb7cf8a99079cde93"},
+    {file = "pyarrow-14.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:32542164d905002c42dff896efdac79b3bdd7291b1b74aa292fac8450d0e4dcd"},
+    {file = "pyarrow-14.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:c7331b4ed3401b7ee56f22c980608cf273f0380f77d0f73dd3c185f78f5a6220"},
+    {file = "pyarrow-14.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:922e8b49b88da8633d6cac0e1b5a690311b6758d6f5d7c2be71acb0f1e14cd61"},
+    {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58c889851ca33f992ea916b48b8540735055201b177cb0dcf0596a495a667b00"},
+    {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30d8494870d9916bb53b2a4384948491444741cb9a38253c590e21f836b01222"},
+    {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:be28e1a07f20391bb0b15ea03dcac3aade29fc773c5eb4bee2838e9b2cdde0cb"},
+    {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:981670b4ce0110d8dcb3246410a4aabf5714db5d8ea63b15686bce1c914b1f83"},
+    {file = "pyarrow-14.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:4756a2b373a28f6166c42711240643fb8bd6322467e9aacabd26b488fa41ec23"},
+    {file = "pyarrow-14.0.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:cf87e2cec65dd5cf1aa4aba918d523ef56ef95597b545bbaad01e6433851aa10"},
+    {file = "pyarrow-14.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:470ae0194fbfdfbf4a6b65b4f9e0f6e1fa0ea5b90c1ee6b65b38aecee53508c8"},
+    {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6263cffd0c3721c1e348062997babdf0151301f7353010c9c9a8ed47448f82ab"},
+    {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8089d7e77d1455d529dbd7cff08898bbb2666ee48bc4085203af1d826a33cc"},
+    {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fada8396bc739d958d0b81d291cfd201126ed5e7913cb73de6bc606befc30226"},
+    {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:2a145dab9ed7849fc1101bf03bcdc69913547f10513fdf70fc3ab6c0a50c7eee"},
+    {file = "pyarrow-14.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:05fe7994745b634c5fb16ce5717e39a1ac1fac3e2b0795232841660aa76647cd"},
+    {file = "pyarrow-14.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:a8eeef015ae69d104c4c3117a6011e7e3ecd1abec79dc87fd2fac6e442f666ee"},
+    {file = "pyarrow-14.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3c76807540989fe8fcd02285dd15e4f2a3da0b09d27781abec3adc265ddbeba1"},
+    {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:450e4605e3c20e558485f9161a79280a61c55efe585d51513c014de9ae8d393f"},
+    {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:323cbe60210173ffd7db78bfd50b80bdd792c4c9daca8843ef3cd70b186649db"},
+    {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0140c7e2b740e08c5a459439d87acd26b747fc408bde0a8806096ee0baaa0c15"},
+    {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:e592e482edd9f1ab32f18cd6a716c45b2c0f2403dc2af782f4e9674952e6dd27"},
+    {file = "pyarrow-14.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:d264ad13605b61959f2ae7c1d25b1a5b8505b112715c961418c8396433f213ad"},
+    {file = "pyarrow-14.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:01e44de9749cddc486169cb632f3c99962318e9dacac7778315a110f4bf8a450"},
+    {file = "pyarrow-14.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0351fecf0e26e152542bc164c22ea2a8e8c682726fce160ce4d459ea802d69c"},
+    {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c1f6110c386464fd2e5e4ea3624466055bbe681ff185fd6c9daa98f30a3f9a"},
+    {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11e045dfa09855b6d3e7705a37c42e2dc2c71d608fab34d3c23df2e02df9aec3"},
+    {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:097828b55321897db0e1dbfc606e3ff8101ae5725673498cbfa7754ee0da80e4"},
+    {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1daab52050a1c48506c029e6fa0944a7b2436334d7e44221c16f6f1b2cc9c510"},
+    {file = "pyarrow-14.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3f6d5faf4f1b0d5a7f97be987cf9e9f8cd39902611e818fe134588ee99bf0283"},
+    {file = "pyarrow-14.0.1.tar.gz", hash = "sha256:b8b3f4fe8d4ec15e1ef9b599b94683c5216adaed78d5cb4c606180546d1e2ee1"},
+]
+
+[package.dependencies]
+numpy = ">=1.16.6"
+
 [[package]]
 name = "pycparser"
 version = "2.21"
@@ -3261,4 +3309,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "b9985d182b0c350a39e12aeae274f2e809d1454f47b58b2d2a5fe8b8264418b7"
+content-hash = "397df0f3e64b00fabebb36bf3c3576d94c2f34c2f34dcec223973a19e525d2e6"
diff --git a/pyproject.toml b/pyproject.toml
index 15a3b9b..0ea7b13 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,7 @@ zstandard = "^0.18.0"
 pgzip = "^0.3.2"
 scikit-learn = "^1.2.1"
 pre-commit = "^3.5.0"
+pyarrow = "^14.0.1"
 
 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"

From 980940f83366fed49e26f944d39a941b7872b36d Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Wed, 13 Dec 2023 11:23:10 +0000
Subject: [PATCH 32/40] add license data to log

---
 pgscatalog_utils/scorefile/scoringfileheader.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pgscatalog_utils/scorefile/scoringfileheader.py b/pgscatalog_utils/scorefile/scoringfileheader.py
index a06622a..06d7f10 100644
--- a/pgscatalog_utils/scorefile/scoringfileheader.py
+++ b/pgscatalog_utils/scorefile/scoringfileheader.py
@@ -21,6 +21,11 @@ class ScoringFileHeader:
     HmPOS_build: GenomeBuild
     HmPOS_date: str
     format_version: str
+    license: str = (
+        "PGS obtained from the Catalog should be cited appropriately, and "
+        "used in accordance with any licensing restrictions set by the authors. See EBI "
+        "Terms of Use (https://www.ebi.ac.uk/about/terms-of-use/) for additional details."
+    )
 
     def __post_init__(self):
         if self.variants_number:
@@ -38,6 +43,11 @@ def from_path(cls, path: pathlib.Path):
         header_dict = {k: raw_header.get(k) for k in keep_keys}
         # ... so we can unpack the dict into a dataclass
 
+        if header_dict.get("license") is None:
+            # missing license data in header means default license
+            # (this may change in the future)
+            header_dict["license"] = cls.license
+
         if "HmPOS_build" not in header_dict:
             # working with pgs catalog formatted header but unharmonised data
             header_dict["HmPOS_build"] = None

From 7d39e0859eff484ead5c595173b204d43d7fcdd2 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Thu, 14 Dec 2023 10:30:03 +0000
Subject: [PATCH 33/40] add custom exceptions

---
 pgscatalog_utils/pgsexceptions.py | 116 ++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 pgscatalog_utils/pgsexceptions.py

diff --git a/pgscatalog_utils/pgsexceptions.py b/pgscatalog_utils/pgsexceptions.py
new file mode 100644
index 0000000..e57e4bf
--- /dev/null
+++ b/pgscatalog_utils/pgsexceptions.py
@@ -0,0 +1,116 @@
+""" This module defines a custom PGS exception hierarchy. There's a lot of exceptions for specific failure states,
+which can be a bad approach and too complex. However, we did this anyway for a few reasons:
+
+1. There are only a few types of common errors (around a dozen, with 3-4 very common)
+2. We want to exit the program with custom exit codes to simplify communicating program
+state with external processes (e.g. PGS Catalog Calculator, web platforms) without doing
+complicated things like logging to an external location
+3. This approach should make maintaining exit codes simple
+
+So the plan is to override sys.excepthook, intercept errors defined here, and map them
+to custom exit codes defined below
+"""
+
+from types import MappingProxyType
+
+
+class BasePGSError(Exception):
+    """The base class from which all PGS errors must inherit.
+    The purpose of this class is to simplify catching PGS exceptions and exiting python with a custom exit code."""
+
+
+class MatchError(BasePGSError):
+    """The base class for errors that are raised during variant matching"""
+
+
+class CombineError(BasePGSError):
+    """The base class for errors that are raised when combining scorefiles"""
+
+
+class CatalogError(BasePGSError):
+    """The base class for errors when querying or downloading from the PGS Catalog"""
+
+
+class SamplesheetError(BasePGSError):
+    """The base class for errors related to samplesheet parsing"""
+
+
+class ScoreDownloadError(CatalogError):
+    """Raised when a scoring file can't be downloaded"""
+
+
+class ScoreChecksumError(CatalogError):
+    """Raised when a scoring file fails checksum validation"""
+
+
+class QueryError(CatalogError):
+    """Raised when the Catalog API doesn't return a valid response"""
+
+
+class InvalidAccessionError(CatalogError):
+    """Raised when an invalid term is used to query the Catalog"""
+
+
+class DuplicateMatchError(MatchError):
+    """Raised when a matched variant has been duplicated, so that a variant with the same ID
+    would be split across two rows in an output scoring file.
+    """
+
+
+class MatchRateError(MatchError):
+    """Raised when match rate is below match threshold for one or more scoring files"""
+
+
+class ZeroMatchesError(MatchError):
+    """Raised when zero matches are found for one or more scoring files.
+
+    Distinct from MatchRateError because it's very common, and caused by bad input data or parameters."""
+
+
+class MatchValueError(MatchError):
+    """Raised when a match function receives inappropriate values.
+
+    e.g., Multiple chromosomes detected in variant data but data is split per-chromosome"""
+
+
+class BuildError(CombineError):
+    """Raised when there's a problem with a scoring file genome build."""
+
+
+class ScoreFormatError(CombineError):
+    """Raised when there's a problem with a scoring file."""
+
+
+class GenomesNotFound(SamplesheetError):
+    """Raised when FileNotFound"""
+
+
+class SamplesheetFormatError(SamplesheetError):
+    """Raised when a samplesheet is badly formatted"""
+
+
+class ExceptionExitCodeMap:
+    """A read only map to get exit codes for custom exceptions"""
+
+    # https://unix.stackexchange.com/a/604262
+    _mapping = {
+        ScoreDownloadError: 8,
+        ScoreFormatError: 9,
+        ScoreChecksumError: 10,
+        QueryError: 11,
+        InvalidAccessionError: 12,
+        DuplicateMatchError: 13,
+        MatchRateError: 14,
+        ZeroMatchesError: 15,
+        MatchValueError: 16,
+        BuildError: 17,
+        GenomesNotFound: 19,
+        SamplesheetFormatError: 20,
+    }
+
+    code_map = MappingProxyType(_mapping)
+
+    def get_exit_code(self, exception_type):
+        # if an exception can't be found in the map, return an error code (> 0) but default
+        # max possible value 255
+        return self.code_map.get(exception_type, 255)

From b92615006f38a7de007600892eb6facf65c62d30 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Thu, 14 Dec 2023 10:42:32 +0000
Subject: [PATCH 34/40] add custom exit code

---
 pgscatalog_utils/pgsexceptions.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/pgscatalog_utils/pgsexceptions.py b/pgscatalog_utils/pgsexceptions.py
index e57e4bf..09b02d5 100644
--- a/pgscatalog_utils/pgsexceptions.py
+++ b/pgscatalog_utils/pgsexceptions.py
@@ -10,7 +10,7 @@
 So the plan is to override sys.excepthook, intercept errors defined here, and map them
 to custom exit codes defined below
 """
-
+import sys
 from types import MappingProxyType
 
 
@@ -110,7 +110,17 @@ class ExceptionExitCodeMap:
 
     code_map = MappingProxyType(_mapping)
 
-    def get_exit_code(self, exception_type):
+    def __getitem__(self, exception_type):
         # if an exception can't be found in the map, return an error code (> 0) but default
         # max possible value 255
         return self.code_map.get(exception_type, 255)
+
+
+def handle_uncaught_exception(exctype, value, trace):
+    code_map = ExceptionExitCodeMap()
+    oldHook(exctype, value, trace)
+    if isinstance(value, BasePGSError):
+        sys.exit(code_map[exctype])
+
+
+sys.excepthook, oldHook = handle_uncaught_exception, sys.excepthook

From 499ef557af020dd1e15ebc105ed0d36403aa25f7 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Thu, 14 Dec 2023 13:31:54 +0000
Subject: [PATCH 35/40] move class definitions

---
 pgscatalog_utils/pgsexceptions.py | 56 +++++++++++++++----------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/pgscatalog_utils/pgsexceptions.py b/pgscatalog_utils/pgsexceptions.py
index 09b02d5..393ad7a 100644
--- a/pgscatalog_utils/pgsexceptions.py
+++ b/pgscatalog_utils/pgsexceptions.py
@@ -23,34 +23,6 @@ class MatchError(BasePGSError):
     """The base class for errors that are raised during variant matching"""
 
 
-class CombineError(BasePGSError):
-    """The base class for errors that are raised when combining scorefiles"""
-
-
-class CatalogError(BasePGSError):
-    """The base class for errors when querying or downloading from the PGS Catalog"""
-
-
-class SamplesheetError(BasePGSError):
-    """The base class for errors related to samplesheet parsing"""
-
-
-class ScoreDownloadError(CatalogError):
-    """Raised when a scoring file can't be downloaded"""
-
-
-class ScoreChecksumError(CatalogError):
-    """Raised when a scoring file fails checksum validation"""
-
-
-class QueryError(CatalogError):
-    """Raised when the Catalog API doesn't return a valid response"""
-
-
-class InvalidAccessionError(CatalogError):
-    """Raised when an invalid term is used to query the Catalog"""
-
-
 class DuplicateMatchError(MatchError):
     """Raised when a matched variant has been duplicated, so that a variant with the same ID
     would be split across two rows in an output scoring file.
@@ -73,6 +45,10 @@ class MatchValueError(MatchError):
     e.g., Multiple chromosomes detected in variant data but data is split per-chromosome"""
 
 
+class CombineError(BasePGSError):
+    """The base class for errors that are raised when combining scorefiles"""
+
+
 class BuildError(CombineError):
     """Raised when there's a problem with a scoring file genome build."""
 
@@ -81,6 +57,30 @@ class ScoreFormatError(CombineError):
     """Raised when there's a problem with a scoring file."""
 
 
+class CatalogError(BasePGSError):
+    """The base class for errors when querying or downloading from the PGS Catalog"""
+
+
+class ScoreDownloadError(CatalogError):
+    """Raised when a scoring file can't be downloaded"""
+
+
+class ScoreChecksumError(CatalogError):
+    """Raised when a scoring file fails checksum validation"""
+
+
+class QueryError(CatalogError):
+    """Raised when the Catalog API doesn't return a valid response"""
+
+
+class InvalidAccessionError(CatalogError):
+    """Raised when an invalid term is used to query the Catalog"""
+
+
+class SamplesheetError(BasePGSError):
+    """The base class for errors related to samplesheet parsing"""
+
+
 class GenomesNotFound(SamplesheetError):
     """Raised when FileNotFound"""
 

From 064813a0d2e38cb79534b31253e50eec91632197 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Thu, 14 Dec 2023 13:54:33 +0000
Subject: [PATCH 36/40] rename BasePGSError to BasePGSException

---
 pgscatalog_utils/pgsexceptions.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/pgscatalog_utils/pgsexceptions.py b/pgscatalog_utils/pgsexceptions.py
index 393ad7a..224025e 100644
--- a/pgscatalog_utils/pgsexceptions.py
+++ b/pgscatalog_utils/pgsexceptions.py
@@ -14,12 +14,13 @@
 from types import MappingProxyType
 
 
-class BasePGSError(Exception):
+class BasePGSException(Exception):
     """The base class from which all PGS errors must inherit.
-    The purpose of this class is to simplify catching PGS exceptions and exiting python with a custom exit code."""
+    The purpose of this class is to simplify finding PGS exceptions and exiting Python
+    with a matching custom exit code."""
 
 
-class MatchError(BasePGSError):
+class MatchError(BasePGSException):
     """The base class for errors that are raised during variant matching"""
 
 
@@ -45,7 +46,7 @@ class MatchValueError(MatchError):
     e.g., Multiple chromosomes detected in variant data but data is split per-chromosome"""
 
 
-class CombineError(BasePGSError):
+class CombineError(BasePGSException):
     """The base class for errors that are raised when combining scorefiles"""
 
 
@@ -57,7 +58,7 @@ class ScoreFormatError(CombineError):
     """Raised when there's a problem with a scoring file."""
 
 
-class CatalogError(BasePGSError):
+class CatalogError(BasePGSException):
     """The base class for errors when querying or downloading from the PGS Catalog"""
 
 
@@ -77,7 +78,7 @@ class InvalidAccessionError(CatalogError):
     """Raised when an invalid term is used to query the Catalog"""
 
 
-class SamplesheetError(BasePGSError):
+class SamplesheetError(BasePGSException):
     """The base class for errors related to samplesheet parsing"""
 
 
@@ -119,7 +120,7 @@ def __getitem__(self, exception_type):
 def handle_uncaught_exception(exctype, value, trace):
     code_map = ExceptionExitCodeMap()
     oldHook(exctype, value, trace)
-    if isinstance(value, BasePGSError):
+    if isinstance(value, BasePGSException):
         sys.exit(code_map[exctype])
 
 

From 21873655c7bf961a707ec683614ace6527e3f536 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Fri, 15 Dec 2023 14:13:30 +0000
Subject: [PATCH 37/40] update effect allele class

---
 pgscatalog_utils/scorefile/effectallele.py | 43 ++++++++++++++++++----
 pgscatalog_utils/scorefile/qc.py           |  2 +-
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/pgscatalog_utils/scorefile/effectallele.py b/pgscatalog_utils/scorefile/effectallele.py
index 6f0dfcb..a72e3d1 100644
--- a/pgscatalog_utils/scorefile/effectallele.py
+++ b/pgscatalog_utils/scorefile/effectallele.py
@@ -1,10 +1,31 @@
 class EffectAllele:
-    _valid_bases = frozenset({"A", "C", "T", "G"})
-    __slots__ = ("allele", "is_valid")
+    """A class that represents an effect allele found in PGS Catalog scoring files
 
-    def __init__(self, allele: str):
-        self.allele = allele
-        self.is_valid = self.is_valid_allele()
+    The allele whose dosage is counted (e.g. {0, 1, 2}) and multiplied by the variant's
+    weight (effect_weight) when calculating score. The effect allele is also known as
+    the 'risk allele'.
+    >>> simple_ea = EffectAllele("A")
+    >>> simple_ea
+    EffectAllele("A")
+    >>> simple_ea.is_snp
+    True
+    >>> str(simple_ea)
+    'A'
+    >>> EffectAllele("AG")
+    EffectAllele("AG")
+    >>> hla_example = EffectAllele("+")
+    >>> hla_example
+    EffectAllele("+")
+    >>> hla_example.is_snp
+    False
+    """
+
+    _valid_snp_bases = frozenset({"A", "C", "T", "G"})
+    __slots__ = ("allele", "is_snp")
+
+    def __init__(self, allele):
+        self.allele = str(allele)
+        self.is_snp = self._is_snp()
 
     def __repr__(self):
         return f'{type(self).__name__}("{self.allele}")'
@@ -12,5 +33,13 @@ def __repr__(self):
     def __str__(self):
         return self.allele
 
-    def is_valid_allele(self) -> bool:
-        return not frozenset(self.allele) - self._valid_bases
+    def _is_snp(self) -> bool:
+        """SNPs are the most common type of effect allele. More complex effect
+        alleles, like HLAs or APOE genes, often require extra work to represent in
+        genomes. Users should be warned about complex effect alleles.
+        >>> EffectAllele("+")._is_snp()
+        False
+        >>> EffectAllele("A")._is_snp()
+        True
+        """
+        return not frozenset(self.allele) - self._valid_snp_bases
diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py
index 50fcb52..526fda2 100644
--- a/pgscatalog_utils/scorefile/qc.py
+++ b/pgscatalog_utils/scorefile/qc.py
@@ -195,7 +195,7 @@ def check_effect_allele(
 ) -> typing.Generator[ScoreVariant, None, None]:
     n_bad = 0
     for variant in variants:
-        if not variant.effect_allele.is_valid:
+        if not variant.effect_allele.is_snp:
             n_bad += 1
 
         yield variant

From 60e150b98688dfdacff92d5210d07c3e2c205bb0 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Fri, 15 Dec 2023 14:14:45 +0000
Subject: [PATCH 38/40] tidy up docstring

---
 pgscatalog_utils/scorefile/effectallele.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pgscatalog_utils/scorefile/effectallele.py b/pgscatalog_utils/scorefile/effectallele.py
index a72e3d1..be9ecb2 100644
--- a/pgscatalog_utils/scorefile/effectallele.py
+++ b/pgscatalog_utils/scorefile/effectallele.py
@@ -34,9 +34,10 @@ def __str__(self):
         return self.allele
 
     def _is_snp(self) -> bool:
-        """SNPs are the most common type of effect allele. More complex effect
-        alleles, like HLAs or APOE genes, often require extra work to represent in
-        genomes. Users should be warned about complex effect alleles.
+        """SNPs are the most common type of effect allele in PGS Catalog scoring
+        files. More complex effect alleles, like HLAs or APOE genes, often require
+        extra work to represent in genomes. Users should be warned about complex
+        effect alleles.
         >>> EffectAllele("+")._is_snp()
         False
         >>> EffectAllele("A")._is_snp()

From 9d6e258e989a39f748930122314ed74fb994b485 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Fri, 15 Dec 2023 14:33:22 +0000
Subject: [PATCH 39/40] run doctests in docstrings with pytest

---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 0ea7b13..bd94cb8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,3 +45,5 @@ seaborn = "^0.12.2"
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
 
+[tool.pytest.ini_options]
+addopts = --doctest-modules
\ No newline at end of file

From 207ecd475030e9298970f4596b804a50cd992d1a Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Fri, 15 Dec 2023 14:34:34 +0000
Subject: [PATCH 40/40] fix pyproject: quote the pytest addopts value

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index bd94cb8..0f82b6a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,4 +46,4 @@ requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.pytest.ini_options]
-addopts = --doctest-modules
\ No newline at end of file
+addopts = "--doctest-modules"
\ No newline at end of file