PGScatalog · nebfield · Feb 21, 2024 · Feb 20, 2024 · Feb 21, 2024 · Feb 21, 2024
diff --git a/pgscatalog_utils/aggregate/aggregate_scores.py b/pgscatalog_utils/aggregate/aggregate_scores.py
@@ -1,5 +1,6 @@
 import argparse
 import logging
+import pathlib
 import textwrap
 
 import pandas as pd
@@ -16,14 +17,14 @@ def aggregate_scores():
 
     if args.split:
         logger.debug("Splitting aggregated scores by sampleset")
-        for sampleset, group in df.groupby('sampleset'):
-            fout = f"{sampleset}_pgs.txt.gz"
+        for sampleset, group in df.groupby("sampleset"):
+            fout = pathlib.Path(args.outdir) / f"{sampleset}_pgs.txt.gz"
             logger.debug(f"Compressing sampleset {sampleset}, writing to {fout}")
-            group.to_csv(fout, sep='\t', compression='gzip')
+            group.to_csv(fout, sep="\t", compression="gzip")
     else:
-        fout = "aggregated_scores.txt.gz"
+        fout = pathlib.Path(args.outdir) / "aggregated_scores.txt.gz"
         logger.info(f"Compressing all samplesets and writing combined scores to {fout}")
-        df.to_csv(fout, sep='\t', compression='gzip')
+        df.to_csv(fout, sep="\t", compression="gzip")
 
 
 def aggregate(scorefiles: list[str]):
@@ -33,43 +34,71 @@ def aggregate(scorefiles: list[str]):
     for i, path in enumerate(scorefiles):
         logger.debug(f"Reading {path}")
         # pandas can automatically detect zst compression, neat!
-        df = (pd.read_table(path, converters={"#IID": str}, header=0)
-              .assign(sampleset=path.split('_')[0])
-              .set_index(['sampleset', '#IID']))
+        df = (
+            pd.read_table(path, converters={"#IID": str}, header=0)
+            .assign(sampleset=path.split("_")[0])
+            .set_index(["sampleset", "#IID"])
+        )
 
-        df.index.names = ['sampleset', 'IID']
+        df.index.names = ["sampleset", "IID"]
 
         # Subset to aggregatable columns
         df = df[_select_agg_cols(df.columns)]
         aggcols.update(set(df.columns))
 
         # Combine DFs
         if i == 0:
-            logger.debug('Initialising combined DF')
+            logger.debug("Initialising combined DF")
             combined = df.copy()
         else:
-            logger.debug('Adding to combined DF')
+            logger.debug("Adding to combined DF")
             combined = combined.add(df, fill_value=0)
 
-    assert all([x in combined.columns for x in aggcols]), "All Aggregatable Columns are present in the final DF"
+    assert all(
+        [x in combined.columns for x in aggcols]
+    ), "All Aggregatable Columns are present in the final DF"
 
-    return combined.pipe(_calculate_average)
+    sum_df, avg_df = combined.pipe(_calculate_average)
+    # need to melt sum and avg separately to give the correct value_name to melt
+    dfs = [_melt(x, y) for x, y in zip([sum_df, avg_df], ["SUM", "AVG"])]
+    # add melted average back
+    combined = pd.concat([dfs[0], dfs[1]["AVG"]], axis=1)
+    return combined[["PGS", "SUM", "DENOM", "AVG"]]
+
+
+def _melt(df, value_name):
+    df = df.melt(
+        id_vars=["DENOM"],
+        value_name=value_name,
+        var_name="PGS",
+        ignore_index=False,
+    )
+    df["PGS"] = df["PGS"].str.replace(f"_{value_name}", "")
+    return df
 
 
 def _calculate_average(combined: pd.DataFrame):
     logger.debug("Averaging data")
-    avgs = combined.loc[:, combined.columns.str.endswith('_SUM')].divide(combined['DENOM'], axis=0)
-    avgs.columns = avgs.columns.str.replace('_SUM', '_AVG')
-    return pd.concat([combined, avgs], axis=1)
+    avgs = combined.loc[:, combined.columns.str.endswith("_SUM")].divide(
+        combined["DENOM"], axis=0
+    )
+    avgs.columns = avgs.columns.str.replace("_SUM", "_AVG")
+    avgs["DENOM"] = combined["DENOM"]
+    return combined, avgs
 
 
 def _select_agg_cols(cols):
-    keep_cols = ['DENOM']
-    return [x for x in cols if (x.endswith('_SUM') and (x != 'NAMED_ALLELE_DOSAGE_SUM')) or (x in keep_cols)]
+    keep_cols = ["DENOM"]
+    return [
+        x
+        for x in cols
+        if (x.endswith("_SUM") and (x != "NAMED_ALLELE_DOSAGE_SUM")) or (x in keep_cols)
+    ]
 
 
 def _description_text() -> str:
-    return textwrap.dedent('''
+    return textwrap.dedent(
+        """
     Aggregate plink .sscore files into a combined TSV table.
 
     This aggregation sums scores that were calculated from plink
@@ -80,20 +109,45 @@ def _description_text() -> str:
     Input .sscore files can be optionally compressed with zstd or gzip. 
 
     The aggregated output scores are compressed with gzip.
-   ''')
+   """
+    )
 
 
 def _parse_args(args=None) -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description=_description_text(),
-                                     formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument('-s', '--scores', dest='scores', required=True, nargs='+',
-                        help='<Required> List of scorefile paths. Use a wildcard (*) to select multiple files.')
-    parser.add_argument('-o', '--outdir', dest='outdir', required=True,
-                        default='scores/', help='<Required> Output directory to store downloaded files')
-    parser.add_argument('--split', dest='split', required=False, action=argparse.BooleanOptionalAction,
-                        help='<Optional> Make one aggregated file per sampleset')
-    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
-                        help='<Optional> Extra logging information')
+    parser = argparse.ArgumentParser(
+        description=_description_text(),
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "-s",
+        "--scores",
+        dest="scores",
+        required=True,
+        nargs="+",
+        help="<Required> List of scorefile paths. Use a wildcard (*) to select multiple files.",
+    )
+    parser.add_argument(
+        "-o",
+        "--outdir",
+        dest="outdir",
+        required=True,
+        default="scores/",
+        help="<Required> Output directory to store downloaded files",
+    )
+    parser.add_argument(
+        "--split",
+        dest="split",
+        required=False,
+        action=argparse.BooleanOptionalAction,
+        help="<Optional> Make one aggregated file per sampleset",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        dest="verbose",
+        action="store_true",
+        help="<Optional> Extra logging information",
+    )
     return parser.parse_args(args)
 
 

diff --git a/pgscatalog_utils/ancestry/read.py b/pgscatalog_utils/ancestry/read.py
@@ -1,12 +1,10 @@
 import logging
 import pandas as pd
-import numpy as np
-import os
 
 logger = logging.getLogger(__name__)
 
 
-def read_pcs(loc_pcs: list[str],dataset: str, loc_related_ids=None, nPCs=None):
+def read_pcs(loc_pcs: list[str], dataset: str, loc_related_ids=None, nPCs=None):
     """
     Read the .pc file outputs of the fraposa_pgsc projection
     :param loc_pcs: list of locations for .pcs files
@@ -18,20 +16,20 @@ def read_pcs(loc_pcs: list[str],dataset: str, loc_related_ids=None, nPCs=None):
 
     for i, path in enumerate(loc_pcs):
         logger.debug("Reading PCA projection: {}".format(path))
-        df = pd.read_csv(path, sep='\t', converters={"IID": str}, header=0)
-        df['sampleset'] = dataset
-        df.set_index(['sampleset', 'IID'], inplace=True)
+        df = pd.read_csv(path, sep="\t", converters={"IID": str}, header=0)
+        df["sampleset"] = dataset
+        df.set_index(["sampleset", "IID"], inplace=True)
 
         if i == 0:
-            logger.debug('Initialising combined DF')
+            logger.debug("Initialising combined DF")
             proj = df.copy()
         else:
-            logger.debug('Appending to combined DF')
+            logger.debug("Appending to combined DF")
             proj = pd.concat([proj, df])
 
     # Drop PCs
     if nPCs:
-        logger.debug('Filtering to relevant PCs')
+        logger.debug("Filtering to relevant PCs")
         dropcols = []
         for x in proj.columns:
             if int(x[2:]) > nPCs:
@@ -41,47 +39,55 @@ def read_pcs(loc_pcs: list[str],dataset: str, loc_related_ids=None, nPCs=None):
     # Read/process IDs for unrelated samples (usually reference dataset)
     if loc_related_ids:
         logger.debug("Flagging related samples with: {}".format(loc_related_ids))
-        proj['Unrelated'] = True
-        with open(loc_related_ids, 'r') as infile:
+        proj["Unrelated"] = True
+        with open(loc_related_ids, "r") as infile:
             IDs_related = [x.strip() for x in infile.readlines()]
-        proj.loc[proj.index.get_level_values(level=1).isin(IDs_related), 'Unrelated'] = False
+        proj.loc[
+            proj.index.get_level_values(level=1).isin(IDs_related), "Unrelated"
+        ] = False
     else:
         # if unrelated is all nan -> dtype is float64
         # if unrelated is only true / false -> dtype is bool
         # if unrelated contains None, dtype stays bool, and pd.concat warning disappears
-        proj['Unrelated'] = None
+        proj["Unrelated"] = None
 
     return proj
 
 
-def extract_ref_psam_cols(loc_psam, dataset: str, df_target, keepcols=['SuperPop', 'Population']):
-    psam = pd.read_csv(loc_psam, sep='\t', header=0)
+def extract_ref_psam_cols(
+    loc_psam, dataset: str, df_target, keepcols=["SuperPop", "Population"]
+):
+    psam = pd.read_csv(loc_psam, sep="\t", header=0)
 
-    match (psam.columns[0]):
+    match psam.columns[0]:
         # handle case of #IID -> IID (happens when #FID is present)
-        case '#IID':
-            psam.rename({'#IID': 'IID'}, axis=1, inplace=True)
-        case '#FID':
-            psam.drop(['#FID'], axis=1, inplace=True)
+        case "#IID":
+            psam.rename({"#IID": "IID"}, axis=1, inplace=True)
+        case "#FID":
+            psam.drop(["#FID"], axis=1, inplace=True)
         case _:
             assert False, "Invalid columns"
-    psam['sampleset'] = dataset
-    psam.set_index(['sampleset', 'IID'], inplace=True)
+    psam["sampleset"] = dataset
+    psam.set_index(["sampleset", "IID"], inplace=True)
 
     return pd.merge(df_target, psam[keepcols], left_index=True, right_index=True)
 
 
-def read_pgs(loc_aggscore, onlySUM: bool):
+def read_pgs(loc_aggscore):
     """
-    Function to read the output of aggreagte_scores
+    Function to read the PGS SUM from the output of aggregate_scores
     :param loc_aggscore: path to aggregated scores output
-    :param onlySUM: whether to return only _SUM columns (e.g. not _AVG)
-    :return:
+    :return: df with PGS SUM indexed by sampleset and IID
     """
-    logger.debug('Reading aggregated score data: {}'.format(loc_aggscore))
-    df = pd.read_csv(loc_aggscore, sep='\t', index_col=['sampleset', 'IID'], converters={"IID": str}, header=0)
-    if onlySUM:
-        df = df[[x for x in df.columns if x.endswith('_SUM')]]
-        rn = [x.rstrip('_SUM') for x in df.columns]
-        df.columns = rn
-    return df
+    logger.debug("Reading aggregated score data: {}".format(loc_aggscore))
+    df = pd.read_csv(
+        loc_aggscore,
+        sep="\t",
+        index_col=["sampleset", "IID"],
+        converters={"IID": str},
+        header=0,
+    ).pivot(columns=["PGS"], values=["SUM"])
+    # rename to PGS only
+    df.columns = [f"{j}" for i, j in df.columns]
+
+    return df
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pgscatalog_utils"
-version = "0.5.0"
+version = "0.5.1"
 description = "Utilities for working with PGS Catalog API and scoring files"
 homepage = "https://github.com/PGScatalog/pgscatalog_utils"
 authors = ["Benjamin Wingfield <[email protected]>", "Samuel Lambert <[email protected]>", "Laurent Gil <[email protected]>"]

diff --git a/tests/data/cineca_22_additive_0.sscore.zst b/tests/data/cineca_22_additive_0.sscore.zst
diff --git a/tests/test_aggregate.py b/tests/test_aggregate.py
@@ -0,0 +1,22 @@
+import importlib.resources
+import os
+import pandas as pd
+from unittest.mock import patch
+
+from pgscatalog_utils.aggregate.aggregate_scores import aggregate_scores
+from . import data
+
+
+def test_aggregate(tmp_path_factory):
+    out_dir = tmp_path_factory.mktemp("aggregated")
+    score_path = importlib.resources.files(data) / "cineca_22_additive_0.sscore.zst"
+
+    args = ["aggregate_scores", "-s", str(score_path), "-o", str(out_dir)]
+
+    with patch("sys.argv", args):
+        aggregate_scores()
+
+    assert os.listdir(out_dir) == ["aggregated_scores.txt.gz"]
+    df = pd.read_csv(out_dir / "aggregated_scores.txt.gz", delimiter="\t")
+    assert list(df.columns) == ["sampleset", "IID", "PGS", "SUM", "DENOM", "AVG"]
+    assert df.shape == (2504, 6)