Numpy 2 testing #1237

Merged
merged 7 commits into from
Sep 2, 2024
38 changes: 38 additions & 0 deletions .github/workflows/build-numpy-2.yml
@@ -0,0 +1,38 @@
name: Build NumPy 2

on:
  push:
  pull_request:

jobs:
  build:
    # Scheduled runs only on the origin org
    if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule')
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.11"]

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt -r requirements-dev.txt
          pip install -U 'numpy<2.1'
      - name: Run pre-commit
        uses: pre-commit/[email protected]
      - name: Test with pytest (numba jit disabled)
        env:
          NUMBA_DISABLE_JIT: 1
        run: |
          # avoid guvectorized functions #1194
          pytest -v sgkit/tests/test_pedigree.py
          pytest -v sgkit/tests/io/vcf/test_vcf_writer_utils.py
      - name: Test with pytest and coverage
        run: |
          pytest -v --cov=sgkit --cov-report=term-missing
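As an aside (not part of this PR), the "pip install -U 'numpy<2.1'" step is what pins the job to a NumPy 2.0.x build; a minimal sanity check that could be run after that step to confirm pip did not fall back to 1.x, with the version parsing being an illustrative assumption rather than sgkit code:

import numpy as np

# Fail fast if the resolver picked anything outside the intended 2.0.x range.
major, minor = (int(part) for part in np.__version__.split(".")[:2])
assert (2, 0) <= (major, minor) < (2, 1), f"unexpected NumPy version: {np.__version__}"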
1 change: 0 additions & 1 deletion sgkit/tests/io/vcf/test_vcf_writer_utils.py
@@ -66,7 +66,6 @@ def test_itoa_out_of_range():
    [
        (0.0, "0"),
        (0.0001, "0"),
        (0.0005, "0.001"),
        (0.3, "0.3"),
        (0.32, "0.32"),
        (0.329, "0.329"),
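The dropped (0.0005, "0.001") case sits exactly on the 3-decimal rounding boundary, which makes its formatted value sensitive to the precise arithmetic path taken and therefore to the NumPy version in use. A standalone illustration of why such half-way inputs are fragile; this is not sgkit's float-formatting code, and the exact mechanism of the behaviour change is not spelled out in the PR:

import numpy as np

# 0.0005 has no exact binary representation; the stored double is only
# close to the half-way point, so the formatted result depends on the
# rounding rule and the intermediate precision used.
print(f"{0.0005:.20f}")          # the double actually stored
print(round(0.0005, 3))          # Python's built-in rounding of that double
print(np.format_float_positional(np.float64(0.0005), precision=3))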
58 changes: 10 additions & 48 deletions sgkit/tests/test_ld.py
@@ -1,12 +1,9 @@
from typing import Optional

import allel
import dask.array as da
import numpy as np
import numpy.testing as npt
import pytest
from dask.dataframe import DataFrame
from hypothesis import Phase, example, given, settings
from hypothesis import strategies as st
from hypothesis.extra.numpy import arrays

@@ -27,40 +24,27 @@ def test_rogers_huff_r_between():
    gnb = np.array([[0, 1, 2]])
    npt.assert_allclose(rogers_huff_r_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
    npt.assert_allclose(rogers_huff_r2_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
    npt.assert_allclose(
        allel.rogers_huff_r_between(gna, gnb),
        rogers_huff_r_between(gna[0], gnb[0]),
        rtol=1e-06,
    )

    gna = np.array([[0, 1, 2]])
    gnb = np.array([[2, 1, 0]])
    npt.assert_allclose(rogers_huff_r_between(gna[0], gnb[0]), -1.0, rtol=1e-06)
    npt.assert_allclose(rogers_huff_r2_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
    npt.assert_allclose(
        allel.rogers_huff_r_between(gna, gnb),
        rogers_huff_r_between(gna[0], gnb[0]),
        rtol=1e-06,
    )

    gna = np.array([[0, 0, 0]])
    gnb = np.array([[1, 1, 1]])
    assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
    assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))

    gna = np.array([[1, 1, 1]])
    gnb = np.array([[1, 1, 1]])
    assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
    assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))

    # a case which fails if fastmath=True is enabled for rogers_huff_r_between
    gna = np.full((1, 49), 2)
    gnb = np.full((1, 49), 2)
    assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
    assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))


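The constant-input case above exercises a 0/0 path: when either dosage vector has zero variance the denominator of the estimator vanishes and the result must propagate as NaN, which a numba fastmath=True build is allowed to assume never happens. A rough standalone sketch of the quantity involved (not sgkit's numba implementation; Rogers-Huff r is essentially the Pearson correlation of the two dosage vectors):

import numpy as np

def rogers_huff_r_sketch(gn0, gn1):
    # Pearson correlation of two genotype-dosage vectors; a zero-variance
    # input makes this 0/0, which should come out as NaN.
    gn0 = np.asarray(gn0, dtype=np.float64)
    gn1 = np.asarray(gn1, dtype=np.float64)
    cov = np.mean((gn0 - gn0.mean()) * (gn1 - gn1.mean()))
    denom = np.sqrt(gn0.var() * gn1.var())
    with np.errstate(invalid="ignore", divide="ignore"):
        return cov / denom

print(rogers_huff_r_sketch(np.full(49, 2), np.full(49, 2)))  # nan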
def ldm_df(
@@ -115,7 +99,16 @@ def test_threshold():

@pytest.mark.parametrize(
    "dtype",
    [dtype for k, v in np.sctypes.items() for dtype in v if k in ["int", "uint"]],  # type: ignore
    [
        np.int8,
        np.int16,
        np.int32,
        np.int64,
        np.uint8,
        np.uint16,
        np.uint32,
        np.uint64,
    ],
)
def test_dtypes(dtype):
    # Input matrices should work regardless of integer type
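For context, np.sctypes was removed in NumPy 2.0, which is why the comprehension above is replaced by an explicit list. If a programmatic equivalent were preferred, one NumPy-2-compatible option (an illustration, not what the PR does) would be:

import numpy as np

# Build the same signed and unsigned integer dtype lists without np.sctypes.
int_dtypes = [np.dtype(f"int{bits}") for bits in (8, 16, 32, 64)]
uint_dtypes = [np.dtype(f"uint{bits}") for bits in (8, 16, 32, 64)]
all_int_dtypes = int_dtypes + uint_dtypes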
@@ -148,37 +141,6 @@ def ld_prune_args(draw):
    return x, window, step, threshold, chunks


# Phases setting without shrinking for complex, conditional draws in
# which shrinking wastes time and adds little information
# (see https://hypothesis.readthedocs.io/en/latest/settings.html#hypothesis.settings.phases)
PHASES_NO_SHRINK = (Phase.explicit, Phase.reuse, Phase.generate, Phase.target)


@given(args=ld_prune_args()) # pylint: disable=no-value-for-parameter
@settings(max_examples=50, deadline=None, phases=PHASES_NO_SHRINK)
@example(args=(np.array([[1, 1], [1, 1]], dtype="uint8"), 1, 1, 0.0, -1))
@pytest.mark.skip(
    reason="Hypothesis generates failures that need investigation: https://github.com/sgkit-dev/sgkit/issues/864"
)
def test_vs_skallel(args):
    x, size, step, threshold, chunks = args

    ds = simulate_genotype_call_dataset(n_variant=x.shape[0], n_sample=x.shape[1])
    ds["call_dosage"] = (["variants", "samples"], da.asarray(x).rechunk({0: chunks}))
    ds = window_by_variant(ds, size=size, step=step)

    ldm = ld_matrix(ds, threshold=threshold)
    has_duplicates = ldm.compute().duplicated(subset=["i", "j"]).any()
    assert not has_duplicates
    idx_drop_ds = maximal_independent_set(ldm)

    idx_drop = np.sort(idx_drop_ds.ld_prune_index_to_drop.data)
    m = allel.locate_unlinked(x, size=size, step=step, threshold=threshold)
    idx_drop_ska = np.sort(np.argwhere(~m).squeeze(axis=1))

    npt.assert_equal(idx_drop_ska, idx_drop)

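The deleted property-based test compared sgkit's pruning pipeline against scikit-allel's locate_unlinked; it was already skipped (see issue 864) and the comparison needs scikit-allel, which this PR removes from the module. For orientation, the sgkit side of that pipeline can still be exercised on its own; a hedged sketch that assumes the same sgkit names imported at the top of this test module, with arbitrary dimensions and threshold:

import dask.array as da
import numpy as np
from sgkit import (
    ld_matrix,
    maximal_independent_set,
    simulate_genotype_call_dataset,
    window_by_variant,
)

# Mirrors the deleted test's setup, minus the scikit-allel comparison.
rng = np.random.default_rng(0)
x = rng.integers(0, 3, size=(50, 10), dtype="uint8")
ds = simulate_genotype_call_dataset(n_variant=x.shape[0], n_sample=x.shape[1])
ds["call_dosage"] = (["variants", "samples"], da.asarray(x))
ds = window_by_variant(ds, size=10, step=5)
ldm = ld_matrix(ds, threshold=0.2)
idx_drop = np.sort(maximal_independent_set(ldm).ld_prune_index_to_drop.data)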

def test_scores():
    # Create zero row vectors except for 1st and 11th
    # (make them have non-zero variance)
3 changes: 3 additions & 0 deletions sgkit/tests/test_popgen.py
@@ -712,6 +712,9 @@ def test_hash_array(n_rows, n_cols):
    _, expected_inverse, expected_counts = np.unique(
        x, axis=0, return_inverse=True, return_counts=True
    )
    # following is needed due to https://github.com/numpy/numpy/issues/26738
    # (workaround from https://github.com/lmcinnes/umap/issues/1138)
    expected_inverse = expected_inverse.reshape(-1)

    # hash columns, then find unique column counts using the hash values
    h = hash_array(x)
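The reshape works around a change in the shape of the inverse index returned by np.unique when an axis is given: it differs between NumPy 1.x and 2.0 (see the linked numpy and umap issues), and flattening it makes the test version-independent. A small illustration with arbitrary array contents:

import numpy as np

x = np.array([[0, 1], [0, 1], [1, 1]])
_, inverse = np.unique(x, axis=0, return_inverse=True)
# Depending on the NumPy version the inverse index may not come back 1-D;
# reshape(-1) normalises it to a flat index with one entry per row of x.
inverse = inverse.reshape(-1)
assert inverse.shape == (3,)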
2 changes: 2 additions & 0 deletions sgkit/utils.py
@@ -362,6 +362,8 @@ def split_array_chunks(n: int, blocks: int) -> Tuple[int, ...]:
    if blocks <= 0:
        raise ValueError(f"Number of blocks ({blocks}) must be >= 0")
    n_div, n_mod = np.divmod(n, blocks)
    n_div = int(n_div)
    n_mod = int(n_mod)
    chunks = n_mod * (n_div + 1,) + (blocks - n_mod) * (n_div,)
    return chunks  # type: ignore[no-any-return]
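np.divmod returns NumPy integer scalars even for plain Python int inputs, and NumPy 2 changed their repr to the np.int64(...) form, so the int() casts keep the returned chunk tuple made of ordinary Python integers whose repr is identical under NumPy 1.x and 2.x. A quick illustration (not from the PR):

import numpy as np

n_div, n_mod = np.divmod(10, 3)
print(type(n_div))               # <class 'numpy.int64'>
print(repr(n_div))               # np.int64(3) under NumPy 2, 3 under NumPy 1.x
print((int(n_div), int(n_mod)))  # (3, 1) as plain Python ints on either version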
