Skip to content

Commit

Permalink
v0.5.0 (#78)
Browse files Browse the repository at this point in the history
* draft streaming with generators

* set up effect types

* profiling improvements

* fix output

* check for duplicates

* add liftover

* update dependencies and set up pre-commit

* complain when linting fails

* fix linting

* support wide files

* add log

* fix tests and liftover

* fix test

* sqlite support and add log data

* fix tests

* fix tests

* fixes to make old and new output consistent

* update tests

* drop parallel gzip and --threads

* create ScoreVariant and EffectType classes

* review comments

* add type hints

* remove coordinates from mandatory fields

* fix old scoring files

* check effect alleles and complex scoring files

* don't access __annotations__ directly

* remove logger

* warn about complex files and variant mismatch

* refactor scorevariant from userdict to class with __slots__

* fix __repr__ and type hints

* add pyarrow support

* add license data to log

* add custom exceptions

* add custom exit code

* move class definitions

* rename

* update effect allele class

* tidy up docstring

* add docstrings to pytest

* fix pyproject

* Make sure that IID isn't converted to numeric during aggregation

Signed-off-by: smlmbrt <[email protected]>

* bump minor version

* dynamically set is_snp

* remove samplesheet package

* delete samplesheet tests

* fix liftover

* set up local venv

* fix liftover test

* improve comment

---------

Signed-off-by: smlmbrt <[email protected]>
Co-authored-by: smlmbrt <[email protected]>
  • Loading branch information
nebfield and smlmbrt authored Feb 19, 2024
1 parent 6da7eb0 commit c672be7
Show file tree
Hide file tree
Showing 32 changed files with 4,200 additions and 1,182 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -157,4 +157,5 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
.idea/
.DS_Store
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.1.3
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
- id: ruff-format
221 changes: 124 additions & 97 deletions conftest.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,63 @@
import glob
import importlib.resources
import os
import pathlib
import shutil
from unittest.mock import patch

import pandas as pd
import polars as pl
import pytest
import requests as req

from pgscatalog_utils.download.download_scorefile import download_scorefile
from pgscatalog_utils.match.preprocess import complement_valid_alleles
from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles
from pgscatalog_utils.scorefile.scorevariant import ScoreVariant

from tests.data import combine

pl.toggle_string_cache(True)


@pytest.fixture(scope="session")
def pgs_accessions():
return ['PGS001229', 'PGS000922']
return ["PGS001229", "PGS000922"]


@pytest.fixture(scope="session")
def mini_score_path(tmp_path_factory):
path = importlib.resources.files(combine) / "PGS001229_22.txt"
return str(path)


@pytest.fixture(scope="session")
def mini_scorefile(mini_score_path, tmp_path_factory):
# The mini scorefile overlaps well with cineca synthetic subset
out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt"
args: list[str] = (
["combine_scorefiles", "-t", "GRCh37", "-s"]
+ [mini_score_path]
+ ["-o", str(out_path.resolve())]
)

with patch("sys.argv", args):
combine_scorefiles()

return str(out_path.resolve())


@pytest.fixture(scope="session")
def scorefiles(tmp_path_factory, pgs_accessions):
fn = tmp_path_factory.mktemp("scorefiles")
args: list[str] = ['download_scorefiles', '-b', 'GRCh37', '-o', str(fn.resolve()), '-i'] + pgs_accessions

with patch('sys.argv', args):
args: list[str] = [
"download_scorefiles",
"-b",
"GRCh37",
"-o",
str(fn.resolve()),
"-i",
] + pgs_accessions

with patch("sys.argv", args):
download_scorefile()

return glob.glob(os.path.join(fn.resolve(), "*.txt.gz"))
Expand All @@ -37,138 +67,135 @@ def scorefiles(tmp_path_factory, pgs_accessions):
def target_path(tmp_path_factory):
try:
bim = req.get(
'https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim',
timeout=5)
"https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim",
timeout=5,
)
except (req.exceptions.ConnectionError, req.Timeout):
bim = []

if not bim:
pytest.skip("Couldn't get test data from network")
else:
fn = tmp_path_factory.mktemp("target") / "data.bim"
with open(fn, 'wb') as f:
with open(fn, "wb") as f:
f.write(bim.content)

return str(fn.resolve())


@pytest.fixture(scope="session")
def mini_score_path(tmp_path_factory):
try:
score = req.get('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/PGS001229_22.txt',
timeout=5)
except (req.exceptions.ConnectionError, req.Timeout):
score = []

if not score:
pytest.skip("Couldn't get test data from network")
else:
fn = tmp_path_factory.mktemp("score") / "PGS001229_22.txt"
with open(fn, 'wb') as f:
f.write(score.content)

return str(fn.resolve())


@pytest.fixture(scope="session")
def mini_scorefile(mini_score_path, tmp_path_factory):
# The mini scorefile overlaps well with cineca synthetic subset
out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt"
args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()

return str(out_path.resolve())


@pytest.fixture(scope="session")
def combined_scorefile(scorefiles, tmp_path_factory):
# The combined scorefile overlaps poorly with cineca synthetic subset
out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + scorefiles + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()

return str(out_path.resolve())


@pytest.fixture(scope="session")
def chain_files(tmp_path_factory):
chain_dir = tmp_path_factory.mktemp('chain_dir')
chain_dir = tmp_path_factory.mktemp("chain_dir")

shutil.copy2("tests/data/hg19ToHg38.over.chain.gz", chain_dir)
shutil.copy2("tests/data/hg38ToHg19.over.chain.gz", chain_dir)

return str(chain_dir.resolve())


@pytest.fixture(scope="session")
def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory):
out_path = tmp_path_factory.mktemp("scores") / "lifted.txt"
args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t',
'GRCh38',
'-m', '0.8'] + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()

return str(out_path.resolve())
return str(chain_dir.resolve())


@pytest.fixture(scope="session")
def hg38_coords():
d = {'rsid': ['rs11903757', 'rs6061231'], 'chr_name': ['2', '20'], 'chr_position': [191722478, 62381861]}
df = pd.DataFrame(d)
df['accession'] = 'dummy'
df['genome_build'] = 'GRCh38'
return df
rs11903757 = ScoreVariant(
**{
"rsid": "rs11903757",
"chr_name": "2",
"chr_position": 191722478,
"row_nr": 0,
"effect_weight": 1,
"accession": "test",
"effect_allele": "A",
}
)
rs6061231 = ScoreVariant(
**{
"rsid": "rs6061231",
"chr_name": "20",
"chr_position": 62381861,
"row_nr": 1,
"effect_weight": 1,
"accession": "test",
"effect_allele": "A",
}
)
return (x for x in [rs11903757, rs6061231])


@pytest.fixture(scope="session")
def hg19_coords(hg38_coords):
def hg19_coords():
# hg38_coords in GRCh37, from dbSNP
d = {'lifted_chr': ['2', '20'], 'lifted_pos': [192587204, 60956917], 'liftover': [True, True]}
return pd.DataFrame(d)
rs11903757 = ScoreVariant(
**{
"rsid": "rs11903757",
"chr_name": "2",
"chr_position": 192587204,
"row_nr": 0,
"effect_weight": 1,
"accession": "test",
"effect_allele": "A",
}
)
rs6061231 = ScoreVariant(
**{
"rsid": "rs6061231",
"chr_name": "20",
"chr_position": 60956917,
"row_nr": 1,
"effect_weight": 1,
"accession": "test",
"effect_allele": "A",
}
)
return (x for x in [rs11903757, rs6061231])


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def small_flipped_scorefile(small_scorefile):
# simulate a scorefile on the wrong strand
return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele'])
.drop(['effect_allele', 'other_allele'])
.rename({'effect_allele_FLIP': 'effect_allele', 'other_allele_FLIP': 'other_allele'})
.pipe(complement_valid_alleles, ['effect_allele', 'other_allele']))
return (
complement_valid_alleles(small_scorefile, ["effect_allele", "other_allele"])
.drop(["effect_allele", "other_allele"])
.rename(
{"effect_allele_FLIP": "effect_allele", "other_allele_FLIP": "other_allele"}
)
.pipe(complement_valid_alleles, ["effect_allele", "other_allele"])
)


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def small_target():
return pl.DataFrame({"#CHROM": [1, 2, 3],
"POS": [1, 2, 3],
"REF": ["A", "T", "T"],
"ALT": ["C", "A", "G"],
"ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"],
"is_multiallelic": [False, False, False]})
return pl.DataFrame(
{
"#CHROM": [1, 2, 3],
"POS": [1, 2, 3],
"REF": ["A", "T", "T"],
"ALT": ["C", "A", "G"],
"ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"],
"is_multiallelic": [False, False, False],
}
)


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def small_scorefile():
df = pl.DataFrame({"accession": ["test", "test", "test"],
"row_nr": [1, 2, 3],
"chr_name": [1, 2, 3],
"chr_position": [1, 2, 3],
"effect_allele": ["A", "A", "G"],
"other_allele": ["C", "T", "T"],
"effect_weight": [1, 2, 3],
"effect_type": ["additive", "additive", "additive"]})
df = pl.DataFrame(
{
"accession": ["test", "test", "test"],
"row_nr": [1, 2, 3],
"chr_name": [1, 2, 3],
"chr_position": [1, 2, 3],
"effect_allele": ["A", "A", "G"],
"other_allele": ["C", "T", "T"],
"effect_weight": [1, 2, 3],
"effect_type": ["additive", "additive", "additive"],
}
)

return complement_valid_alleles(df, ["effect_allele", "other_allele"])


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def small_scorefile_no_oa(small_scorefile):
return small_scorefile.with_column(pl.lit(None).alias('other_allele'))
return small_scorefile.with_column(pl.lit(None).alias("other_allele"))


def _get_timeout(url):
Expand Down
2 changes: 1 addition & 1 deletion pgscatalog_utils/aggregate/aggregate_scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def aggregate(scorefiles: list[str]):
for i, path in enumerate(scorefiles):
logger.debug(f"Reading {path}")
# pandas can automatically detect zst compression, neat!
df = (pd.read_table(path)
df = (pd.read_table(path, converters={"#IID": str}, header=0)
.assign(sampleset=path.split('_')[0])
.set_index(['sampleset', '#IID']))

Expand Down
25 changes: 22 additions & 3 deletions pgscatalog_utils/download/GenomeBuild.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,25 @@
from enum import Enum, auto
from enum import Enum


class GenomeBuild(Enum):
GRCh37 = auto()
GRCh38 = auto()
GRCh37 = "GRCh37"
GRCh38 = "GRCh38"
# just included to handle older files, incompatible unless harmonised:
NCBI36 = "NCBI36" # ew

def __str__(self):
return str(self.value)

@classmethod
def from_string(cls, build):
match build:
case "GRCh37" | "hg19":
return cls(GenomeBuild.GRCh37)
case "GRCh38" | "hg38":
return cls(GenomeBuild.GRCh38)
case "NR":
return None
case "NCBI36" | "hg18":
return cls(GenomeBuild.NCBI36)
case _:
raise Exception(f"Can't match {build=}")
Loading

0 comments on commit c672be7

Please sign in to comment.