Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor combine_scorefiles #62

Merged
merged 41 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
52a4eaa
draft streaming with generators
nebfield Oct 30, 2023
65b2b79
set up effect types
nebfield Oct 30, 2023
227f32f
profiling improvements
nebfield Oct 31, 2023
1349d0a
fix output
nebfield Oct 31, 2023
669eb8d
check for duplicates
nebfield Nov 1, 2023
bbfffbc
add liftover
nebfield Nov 1, 2023
43046c7
update dependencies and set up pre-commit
nebfield Nov 1, 2023
a947a39
complain when linting fails
nebfield Nov 1, 2023
32ef39c
fix linting
nebfield Nov 1, 2023
eb9d362
support wide files
nebfield Nov 2, 2023
1774e01
add log
nebfield Nov 2, 2023
7cf9562
fix tests and liftover
nebfield Nov 3, 2023
032b1f7
fix test
nebfield Nov 3, 2023
7864b7d
sqlite support and add log data
nebfield Nov 3, 2023
afdc0b1
fix tests
nebfield Nov 6, 2023
1302d7a
fix tests
nebfield Nov 6, 2023
953497a
fixes to make old and new output consistent
nebfield Nov 6, 2023
cf3fc8b
update tests
nebfield Nov 6, 2023
d0fcb8d
drop parallel gzip and --threads
nebfield Nov 7, 2023
e9e06e1
create ScoreVariant and EffectType classes
nebfield Nov 16, 2023
eef2da6
review comments
nebfield Nov 16, 2023
ee16684
add type hints
nebfield Nov 16, 2023
6be1dd3
remove coordinates from mandatory fields
nebfield Nov 16, 2023
ea16f4e
fix old scoring files
nebfield Nov 20, 2023
7c4c848
check effect alleles and complex scoring files
nebfield Nov 20, 2023
51aa0f2
don't access __annotations__ directly
nebfield Nov 21, 2023
9938bbd
remove logger
nebfield Nov 21, 2023
af1eef2
warn about complex files and variant mismatch
nebfield Nov 22, 2023
54fd6ea
refactor scorevariant from userdict to class with __slots__
nebfield Dec 6, 2023
42a580f
fix __repr__ and type hints
nebfield Dec 7, 2023
3c04445
add pyarrow support
nebfield Dec 11, 2023
980940f
add license data to log
nebfield Dec 13, 2023
7d39e08
add custom exceptions
nebfield Dec 14, 2023
b926150
add custom exit code
nebfield Dec 14, 2023
499ef55
move class definitions
nebfield Dec 14, 2023
064813a
rename
nebfield Dec 14, 2023
2187365
update effect allele class
nebfield Dec 15, 2023
60e150b
tidy up docstring
nebfield Dec 15, 2023
9d6e258
add docstrings to pytest
nebfield Dec 15, 2023
207ecd4
fix pyproject
nebfield Dec 15, 2023
f2766d2
Merge pull request #73 from PGScatalog/exit_codes
nebfield Dec 15, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -157,4 +157,5 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
.idea/
.DS_Store
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.1.3
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
- id: ruff-format
180 changes: 83 additions & 97 deletions conftest.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import glob
import importlib.resources
import os
import pathlib
import shutil
from unittest.mock import patch

import pandas as pd
import polars as pl
import pytest
import requests as req
Expand All @@ -14,20 +12,51 @@
from pgscatalog_utils.match.preprocess import complement_valid_alleles
from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles

from tests.data import combine

pl.toggle_string_cache(True)


@pytest.fixture(scope="session")
def pgs_accessions():
    """Return the accession IDs that the ``scorefiles`` fixture downloads."""
    return ["PGS001229", "PGS000922"]


@pytest.fixture(scope="session")
def mini_score_path(tmp_path_factory):
    """Return, as a string, the path of ``PGS001229_22.txt`` in ``tests.data.combine``.

    ``tmp_path_factory`` is accepted but not used by this fixture.
    """
    package_root = importlib.resources.files(combine)
    return str(package_root / "PGS001229_22.txt")


@pytest.fixture(scope="session")
def mini_scorefile(mini_score_path, tmp_path_factory):
    """Run ``combine_scorefiles`` on the mini scoring file; return the output path.

    The mini scorefile overlaps well with cineca synthetic subset.
    """
    destination = tmp_path_factory.mktemp("scores") / "mini_score.txt"
    argv: list[str] = [
        "combine_scorefiles",
        "-t",
        "GRCh37",
        "-s",
        mini_score_path,
        "-o",
        str(destination.resolve()),
    ]

    # combine_scorefiles() is called with no arguments, so the command line
    # is supplied by patching sys.argv for the duration of the call.
    with patch("sys.argv", argv):
        combine_scorefiles()

    return str(destination.resolve())


@pytest.fixture(scope="session")
def scorefiles(tmp_path_factory, pgs_accessions):
    """Download the scoring files for ``pgs_accessions``; return their paths.

    Returns:
        A list of the ``*.txt.gz`` paths found in the temporary download
        directory after ``download_scorefile`` has run.
    """
    fn = tmp_path_factory.mktemp("scorefiles")
    args: list[str] = [
        "download_scorefiles",
        "-b",
        "GRCh37",
        "-o",
        str(fn.resolve()),
        "-i",
        *pgs_accessions,
    ]

    # download_scorefile() is called with no arguments, so the command line
    # is supplied by patching sys.argv for the duration of the call.
    with patch("sys.argv", args):
        download_scorefile()

    return glob.glob(os.path.join(fn.resolve(), "*.txt.gz"))
Expand All @@ -37,138 +66,95 @@ def scorefiles(tmp_path_factory, pgs_accessions):
def target_path(tmp_path_factory):
    """Fetch the cineca synthetic subset ``.bim`` file and return its local path.

    The test is skipped when the file cannot be fetched: either the request
    raises a connection error or timeout, or the response object is falsy.
    """
    try:
        bim = req.get(
            "https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim",
            timeout=5,
        )
    except (req.exceptions.ConnectionError, req.Timeout):
        # Empty list is a falsy stand-in for "no response".
        bim = []

    if not bim:
        # pytest.skip() raises, so nothing below runs without a response.
        pytest.skip("Couldn't get test data from network")

    fn = tmp_path_factory.mktemp("target") / "data.bim"
    with open(fn, "wb") as f:
        f.write(bim.content)

    return str(fn.resolve())


# NOTE(review): this looks like the removed (pre-change) side of the diff; a
# fixture with the same name that reads packaged test data appears earlier in
# this diff. Confirm before relying on this version.
@pytest.fixture(scope="session")
def mini_score_path(tmp_path_factory):
# Fetch PGS001229_22.txt over the network with a 5 second timeout.
try:
score = req.get('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/PGS001229_22.txt',
timeout=5)
except (req.exceptions.ConnectionError, req.Timeout):
# Empty list is a falsy stand-in for "no response".
score = []

if not score:
pytest.skip("Couldn't get test data from network")
else:
# Write the response body to a temporary file and return its path.
fn = tmp_path_factory.mktemp("score") / "PGS001229_22.txt"
with open(fn, 'wb') as f:
f.write(score.content)

return str(fn.resolve())


# NOTE(review): this looks like the removed (pre-change) side of the diff; a
# reformatted fixture with the same name appears earlier in this diff.
@pytest.fixture(scope="session")
def mini_scorefile(mini_score_path, tmp_path_factory):
# The mini scorefile overlaps well with cineca synthetic subset
out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt"
args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())]

# combine_scorefiles() takes no arguments here; sys.argv is patched instead.
with patch('sys.argv', args):
combine_scorefiles()

return str(out_path.resolve())


# NOTE(review): only a single-quoted version of this fixture is visible, so it
# is presumably on the removed side of the diff. Confirm against the merged file.
@pytest.fixture(scope="session")
def combined_scorefile(scorefiles, tmp_path_factory):
# The combined scorefile overlaps poorly with cineca synthetic subset
out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
# Every path from the scorefiles fixture is passed after -s.
args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + scorefiles + ['-o', str(out_path.resolve())]

# combine_scorefiles() takes no arguments here; sys.argv is patched instead.
with patch('sys.argv', args):
combine_scorefiles()

return str(out_path.resolve())


@pytest.fixture(scope="session")
def chain_files(tmp_path_factory):
    """Copy the two liftover chain files to a temp directory; return its path."""
    chain_dir = tmp_path_factory.mktemp("chain_dir")

    # Source paths are relative to the working directory.
    for chain_file in (
        "tests/data/hg19ToHg38.over.chain.gz",
        "tests/data/hg38ToHg19.over.chain.gz",
    ):
        shutil.copy2(chain_file, chain_dir)

    return str(chain_dir.resolve())


# NOTE(review): only a single-quoted version of this fixture is visible, so it
# is presumably on the removed side of the diff. Confirm against the merged file.
@pytest.fixture(scope="session")
def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory):
out_path = tmp_path_factory.mktemp("scores") / "lifted.txt"
# Passes --liftover with chain directory (-c), target build GRCh38 (-t), -m 0.8.
args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t',
'GRCh38',
'-m', '0.8'] + ['-o', str(out_path.resolve())]

# combine_scorefiles() takes no arguments here; sys.argv is patched instead.
with patch('sys.argv', args):
combine_scorefiles()

return str(out_path.resolve())
return str(chain_dir.resolve())


@pytest.fixture(scope="session")
def hg38_coords():
    """Return two variant records (rsid, chr_name, chr_position) as dicts.

    NOTE(review): the positions are presumably GRCh38 coordinates, going by
    the fixture name; confirm against dbSNP.
    """
    rs11903757 = {"rsid": "rs11903757", "chr_name": "2", "chr_position": 191722478}
    rs6061231 = {"rsid": "rs6061231", "chr_name": "20", "chr_position": 62381861}
    return [rs11903757, rs6061231]


@pytest.fixture(scope="session")
def hg19_coords():
    """Return two variant records (rsid, chr_name, chr_position) as dicts.

    The records describe the same rsIDs as ``hg38_coords``.
    """
    # hg38_coords in GRCh37, from dbSNP
    rs11903757 = {"rsid": "rs11903757", "chr_name": "2", "chr_position": 192587204}
    rs6061231 = {"rsid": "rs6061231", "chr_name": "20", "chr_position": 60956917}
    return [rs11903757, rs6061231]


@pytest.fixture(scope="session")
def small_flipped_scorefile(small_scorefile):
    """Return ``small_scorefile`` with alleles swapped for their ``*_FLIP`` columns."""
    allele_columns = ["effect_allele", "other_allele"]

    # simulate a scorefile on the wrong strand
    return (
        complement_valid_alleles(small_scorefile, allele_columns)
        # Drop the original alleles, then promote the *_FLIP columns.
        .drop(allele_columns)
        .rename(
            {"effect_allele_FLIP": "effect_allele", "other_allele_FLIP": "other_allele"}
        )
        # Run the helper again on the promoted allele columns.
        .pipe(complement_valid_alleles, allele_columns)
    )


@pytest.fixture(scope="session")
def small_target():
    """Return a three-row polars DataFrame of target variants.

    Columns: ``#CHROM``, ``POS``, ``REF``, ``ALT``, ``ID`` and
    ``is_multiallelic`` (False for every row).
    """
    return pl.DataFrame(
        {
            "#CHROM": [1, 2, 3],
            "POS": [1, 2, 3],
            "REF": ["A", "T", "T"],
            "ALT": ["C", "A", "G"],
            "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"],
            "is_multiallelic": [False, False, False],
        }
    )


@pytest.fixture(scope="session")
def small_scorefile():
    """Return a three-variant scorefile passed through ``complement_valid_alleles``.

    All rows use accession ``"test"`` and the ``"additive"`` effect type.
    """
    df = pl.DataFrame(
        {
            "accession": ["test", "test", "test"],
            "row_nr": [1, 2, 3],
            "chr_name": [1, 2, 3],
            "chr_position": [1, 2, 3],
            "effect_allele": ["A", "A", "G"],
            "other_allele": ["C", "T", "T"],
            "effect_weight": [1, 2, 3],
            "effect_type": ["additive", "additive", "additive"],
        }
    )

    return complement_valid_alleles(df, ["effect_allele", "other_allele"])


@pytest.fixture(scope="session")
def small_scorefile_no_oa(small_scorefile):
    """Return ``small_scorefile`` with ``other_allele`` replaced by nulls."""
    # with_columns replaces the deprecated DataFrame.with_column; the
    # expression is wrapped in a list, the long-standing accepted form.
    return small_scorefile.with_columns([pl.lit(None).alias("other_allele")])


def _get_timeout(url):
Expand Down
25 changes: 22 additions & 3 deletions pgscatalog_utils/download/GenomeBuild.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,25 @@
from enum import Enum, auto
from enum import Enum


class GenomeBuild(Enum):
GRCh37 = auto()
GRCh38 = auto()
GRCh37 = "GRCh37"
GRCh38 = "GRCh38"
# just included to handle older files, incompatible unless harmonised:
NCBI36 = "NCBI36" # ew

def __str__(self):
return str(self.value)

@classmethod
def from_string(cls, build):
match build:
case "GRCh37" | "hg19":
return cls(GenomeBuild.GRCh37)
case "GRCh38" | "hg38":
return cls(GenomeBuild.GRCh38)
case "NR":
return None
case "NCBI36" | "hg18":
return cls(GenomeBuild.NCBI36)
case _:
raise Exception(f"Can't match {build=}")
Loading