Commit

Merge pull request #62 from PGScatalog/stream_combine
Refactor combine_scorefiles
nebfield authored Dec 15, 2023
2 parents 7782112 + f2766d2 commit c6605e8
Showing 27 changed files with 4,146 additions and 703 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -157,4 +157,5 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
.DS_Store
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,8 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.1.3
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
- id: ruff-format
180 changes: 83 additions & 97 deletions conftest.py
@@ -1,11 +1,9 @@
import glob
import importlib.resources
import os
import pathlib
import shutil
from unittest.mock import patch

import pandas as pd
import polars as pl
import pytest
import requests as req
@@ -14,20 +12,51 @@
from pgscatalog_utils.match.preprocess import complement_valid_alleles
from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles

from tests.data import combine

pl.toggle_string_cache(True)


@pytest.fixture(scope="session")
def pgs_accessions():
return ['PGS001229', 'PGS000922']
return ["PGS001229", "PGS000922"]


@pytest.fixture(scope="session")
def mini_score_path(tmp_path_factory):
path = importlib.resources.files(combine) / "PGS001229_22.txt"
return str(path)


@pytest.fixture(scope="session")
def mini_scorefile(mini_score_path, tmp_path_factory):
# The mini scorefile overlaps well with cineca synthetic subset
out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt"
args: list[str] = (
["combine_scorefiles", "-t", "GRCh37", "-s"]
+ [mini_score_path]
+ ["-o", str(out_path.resolve())]
)

with patch("sys.argv", args):
combine_scorefiles()

return str(out_path.resolve())


@pytest.fixture(scope="session")
def scorefiles(tmp_path_factory, pgs_accessions):
fn = tmp_path_factory.mktemp("scorefiles")
args: list[str] = ['download_scorefiles', '-b', 'GRCh37', '-o', str(fn.resolve()), '-i'] + pgs_accessions

with patch('sys.argv', args):
args: list[str] = [
"download_scorefiles",
"-b",
"GRCh37",
"-o",
str(fn.resolve()),
"-i",
] + pgs_accessions

with patch("sys.argv", args):
download_scorefile()

return glob.glob(os.path.join(fn.resolve(), "*.txt.gz"))
@@ -37,138 +66,95 @@ def scorefiles(tmp_path_factory, pgs_accessions):
def target_path(tmp_path_factory):
try:
bim = req.get(
'https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim',
timeout=5)
"https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim",
timeout=5,
)
except (req.exceptions.ConnectionError, req.Timeout):
bim = []

if not bim:
pytest.skip("Couldn't get test data from network")
else:
fn = tmp_path_factory.mktemp("target") / "data.bim"
with open(fn, 'wb') as f:
with open(fn, "wb") as f:
f.write(bim.content)

return str(fn.resolve())


@pytest.fixture(scope="session")
def mini_score_path(tmp_path_factory):
try:
score = req.get('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/PGS001229_22.txt',
timeout=5)
except (req.exceptions.ConnectionError, req.Timeout):
score = []

if not score:
pytest.skip("Couldn't get test data from network")
else:
fn = tmp_path_factory.mktemp("score") / "PGS001229_22.txt"
with open(fn, 'wb') as f:
f.write(score.content)

return str(fn.resolve())


@pytest.fixture(scope="session")
def mini_scorefile(mini_score_path, tmp_path_factory):
# The mini scorefile overlaps well with cineca synthetic subset
out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt"
args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()

return str(out_path.resolve())


@pytest.fixture(scope="session")
def combined_scorefile(scorefiles, tmp_path_factory):
# The combined scorefile overlaps poorly with cineca synthetic subset
out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + scorefiles + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()

return str(out_path.resolve())


@pytest.fixture(scope="session")
def chain_files(tmp_path_factory):
chain_dir = tmp_path_factory.mktemp('chain_dir')
chain_dir = tmp_path_factory.mktemp("chain_dir")

shutil.copy2("tests/data/hg19ToHg38.over.chain.gz", chain_dir)
shutil.copy2("tests/data/hg38ToHg19.over.chain.gz", chain_dir)

return str(chain_dir.resolve())


@pytest.fixture(scope="session")
def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory):
out_path = tmp_path_factory.mktemp("scores") / "lifted.txt"
args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t',
'GRCh38',
'-m', '0.8'] + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()

return str(out_path.resolve())
return str(chain_dir.resolve())


@pytest.fixture(scope="session")
def hg38_coords():
d = {'rsid': ['rs11903757', 'rs6061231'], 'chr_name': ['2', '20'], 'chr_position': [191722478, 62381861]}
df = pd.DataFrame(d)
df['accession'] = 'dummy'
df['genome_build'] = 'GRCh38'
return df
rs11903757 = {"rsid": "rs11903757", "chr_name": "2", "chr_position": 191722478}
rs6061231 = {"rsid": "rs6061231", "chr_name": "20", "chr_position": 62381861}
return [rs11903757, rs6061231]


@pytest.fixture(scope="session")
def hg19_coords(hg38_coords):
def hg19_coords():
# hg38_coords in GRCh37, from dbSNP
d = {'lifted_chr': ['2', '20'], 'lifted_pos': [192587204, 60956917], 'liftover': [True, True]}
return pd.DataFrame(d)
rs11903757 = {"rsid": "rs11903757", "chr_name": "2", "chr_position": 192587204}
rs6061231 = {"rsid": "rs6061231", "chr_name": "20", "chr_position": 60956917}
return [rs11903757, rs6061231]


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def small_flipped_scorefile(small_scorefile):
# simulate a scorefile on the wrong strand
return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele'])
.drop(['effect_allele', 'other_allele'])
.rename({'effect_allele_FLIP': 'effect_allele', 'other_allele_FLIP': 'other_allele'})
.pipe(complement_valid_alleles, ['effect_allele', 'other_allele']))
return (
complement_valid_alleles(small_scorefile, ["effect_allele", "other_allele"])
.drop(["effect_allele", "other_allele"])
.rename(
{"effect_allele_FLIP": "effect_allele", "other_allele_FLIP": "other_allele"}
)
.pipe(complement_valid_alleles, ["effect_allele", "other_allele"])
)


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def small_target():
return pl.DataFrame({"#CHROM": [1, 2, 3],
"POS": [1, 2, 3],
"REF": ["A", "T", "T"],
"ALT": ["C", "A", "G"],
"ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"],
"is_multiallelic": [False, False, False]})
return pl.DataFrame(
{
"#CHROM": [1, 2, 3],
"POS": [1, 2, 3],
"REF": ["A", "T", "T"],
"ALT": ["C", "A", "G"],
"ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"],
"is_multiallelic": [False, False, False],
}
)


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def small_scorefile():
df = pl.DataFrame({"accession": ["test", "test", "test"],
"row_nr": [1, 2, 3],
"chr_name": [1, 2, 3],
"chr_position": [1, 2, 3],
"effect_allele": ["A", "A", "G"],
"other_allele": ["C", "T", "T"],
"effect_weight": [1, 2, 3],
"effect_type": ["additive", "additive", "additive"]})
df = pl.DataFrame(
{
"accession": ["test", "test", "test"],
"row_nr": [1, 2, 3],
"chr_name": [1, 2, 3],
"chr_position": [1, 2, 3],
"effect_allele": ["A", "A", "G"],
"other_allele": ["C", "T", "T"],
"effect_weight": [1, 2, 3],
"effect_type": ["additive", "additive", "additive"],
}
)

return complement_valid_alleles(df, ["effect_allele", "other_allele"])


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def small_scorefile_no_oa(small_scorefile):
return small_scorefile.with_column(pl.lit(None).alias('other_allele'))
return small_scorefile.with_column(pl.lit(None).alias("other_allele"))


def _get_timeout(url):
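
For illustration, a minimal sketch of how a test could consume one of the session-scoped fixtures defined in this conftest.py; the test name and assertion are hypothetical and not part of this commit.

# Hypothetical test: pytest injects the mini_scorefile fixture by name.
# The fixture patches sys.argv, runs the combine_scorefiles entry point once
# per session, and returns the resolved output path as a string.
import os


def test_mini_scorefile_is_written(mini_scorefile):
    # combine_scorefiles should have written the combined score file to this path
    assert os.path.exists(mini_scorefile)
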
25 changes: 22 additions & 3 deletions pgscatalog_utils/download/GenomeBuild.py
@@ -1,6 +1,25 @@
from enum import Enum, auto
from enum import Enum


class GenomeBuild(Enum):
GRCh37 = auto()
GRCh38 = auto()
GRCh37 = "GRCh37"
GRCh38 = "GRCh38"
# just included to handle older files, incompatible unless harmonised:
NCBI36 = "NCBI36" # ew

def __str__(self):
return str(self.value)

@classmethod
def from_string(cls, build):
match build:
case "GRCh37" | "hg19":
return cls(GenomeBuild.GRCh37)
case "GRCh38" | "hg38":
return cls(GenomeBuild.GRCh38)
case "NR":
return None
case "NCBI36" | "hg18":
return cls(GenomeBuild.NCBI36)
case _:
raise Exception(f"Can't match {build=}")
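
As a quick illustration of the refactored string-valued enum above, a usage sketch (assuming only that the import path matches this file's location; not part of the diff):

# Usage sketch for GenomeBuild.from_string and __str__.
from pgscatalog_utils.download.GenomeBuild import GenomeBuild

assert GenomeBuild.from_string("hg19") is GenomeBuild.GRCh37
assert GenomeBuild.from_string("GRCh38") is GenomeBuild.GRCh38
assert GenomeBuild.from_string("NR") is None  # "NR" (not reported) maps to no build
assert str(GenomeBuild.GRCh38) == "GRCh38"  # __str__ returns the enum value
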
