Numpy 2 testing #1237

Merged
merged 7 commits into from
Sep 2, 2024
38 changes: 38 additions & 0 deletions .github/workflows/build-numpy-2.yml
@@ -0,0 +1,38 @@
name: Build NumPy 2

on:
  push:
  pull_request:

jobs:
  build:
    # Scheduled runs only on the origin org
    if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule')
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.11"]

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt -r requirements-dev.txt
          pip install -U 'numpy<2.1'
      - name: Run pre-commit
        uses: pre-commit/[email protected]
      - name: Test with pytest (numba jit disabled)
        env:
          NUMBA_DISABLE_JIT: 1
        run: |
          # avoid guvectorized functions #1194
          pytest -v sgkit/tests/test_pedigree.py
          pytest -v sgkit/tests/io/vcf/test_vcf_writer_utils.py
      - name: Test with pytest and coverage
        run: |
          pytest -v --cov=sgkit --cov-report=term-missing
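As an aside (not part of this PR), the "pip install -U 'numpy<2.1'" step is what pins the job to a NumPy 2.0.x build; a minimal sanity check that could be run after that step to confirm pip did not fall back to 1.x, with the version parsing being an illustrative assumption rather than sgkit code:

import numpy as np

# Fail fast if the resolver picked anything outside the intended 2.0.x range.
major, minor = (int(part) for part in np.__version__.split(".")[:2])
assert (2, 0) <= (major, minor) < (2, 1), f"unexpected NumPy version: {np.__version__}"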
1 change: 0 additions & 1 deletion sgkit/tests/io/vcf/test_vcf_writer_utils.py
@@ -66,7 +66,6 @@ def test_itoa_out_of_range():
    [
        (0.0, "0"),
        (0.0001, "0"),
        (0.0005, "0.001"),
        (0.3, "0.3"),
        (0.32, "0.32"),
        (0.329, "0.329"),
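The dropped (0.0005, "0.001") case sits exactly on the 3-decimal rounding boundary, which makes its formatted value sensitive to the precise arithmetic path taken and therefore to the NumPy version in use. A standalone illustration of why such half-way inputs are fragile; this is not sgkit's float-formatting code, and the exact mechanism of the behaviour change is not spelled out in the PR:

import numpy as np

# 0.0005 has no exact binary representation; the stored double is only
# close to the half-way point, so the formatted result depends on the
# rounding rule and the intermediate precision used.
print(f"{0.0005:.20f}")          # the double actually stored
print(round(0.0005, 3))          # Python's built-in rounding of that double
print(np.format_float_positional(np.float64(0.0005), precision=3))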
58 changes: 10 additions & 48 deletions sgkit/tests/test_ld.py
@@ -1,12 +1,9 @@
from typing import Optional

import allel
import dask.array as da
import numpy as np
import numpy.testing as npt
import pytest
from dask.dataframe import DataFrame
from hypothesis import Phase, example, given, settings
from hypothesis import strategies as st
from hypothesis.extra.numpy import arrays

@@ -27,40 +24,27 @@ def test_rogers_huff_r_between():
    gnb = np.array([[0, 1, 2]])
    npt.assert_allclose(rogers_huff_r_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
    npt.assert_allclose(rogers_huff_r2_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
    npt.assert_allclose(
        allel.rogers_huff_r_between(gna, gnb),
        rogers_huff_r_between(gna[0], gnb[0]),
        rtol=1e-06,
    )

    gna = np.array([[0, 1, 2]])
    gnb = np.array([[2, 1, 0]])
    npt.assert_allclose(rogers_huff_r_between(gna[0], gnb[0]), -1.0, rtol=1e-06)
    npt.assert_allclose(rogers_huff_r2_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
    npt.assert_allclose(
        allel.rogers_huff_r_between(gna, gnb),
        rogers_huff_r_between(gna[0], gnb[0]),
        rtol=1e-06,
    )

    gna = np.array([[0, 0, 0]])
    gnb = np.array([[1, 1, 1]])
    assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
    assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))

    gna = np.array([[1, 1, 1]])
    gnb = np.array([[1, 1, 1]])
    assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
    assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))

    # a case which fails if fastmath=True is enabled for rogers_huff_r_between
    gna = np.full((1, 49), 2)
    gnb = np.full((1, 49), 2)
    assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
    assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))


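The constant-input case above exercises a 0/0 path: when either dosage vector has zero variance the denominator of the estimator vanishes and the result must propagate as NaN, which a numba fastmath=True build is allowed to assume never happens. A rough standalone sketch of the quantity involved (not sgkit's numba implementation; Rogers-Huff r is essentially the Pearson correlation of the two dosage vectors):

import numpy as np

def rogers_huff_r_sketch(gn0, gn1):
    # Pearson correlation of two genotype-dosage vectors; a zero-variance
    # input makes this 0/0, which should come out as NaN.
    gn0 = np.asarray(gn0, dtype=np.float64)
    gn1 = np.asarray(gn1, dtype=np.float64)
    cov = np.mean((gn0 - gn0.mean()) * (gn1 - gn1.mean()))
    denom = np.sqrt(gn0.var() * gn1.var())
    with np.errstate(invalid="ignore", divide="ignore"):
        return cov / denom

print(rogers_huff_r_sketch(np.full(49, 2), np.full(49, 2)))  # nan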
def ldm_df(
@@ -115,7 +99,16 @@ def test_threshold():

@pytest.mark.parametrize(
    "dtype",
    [dtype for k, v in np.sctypes.items() for dtype in v if k in ["int", "uint"]],  # type: ignore
    [
        np.int8,
        np.int16,
        np.int32,
        np.int64,
        np.uint8,
        np.uint16,
        np.uint32,
        np.uint64,
    ],
)
def test_dtypes(dtype):
    # Input matrices should work regardless of integer type
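For context, np.sctypes was removed in NumPy 2.0, which is why the comprehension above is replaced by an explicit list. If a programmatic equivalent were preferred, one NumPy-2-compatible option (an illustration, not what the PR does) would be:

import numpy as np

# Build the same signed and unsigned integer dtype lists without np.sctypes.
int_dtypes = [np.dtype(f"int{bits}") for bits in (8, 16, 32, 64)]
uint_dtypes = [np.dtype(f"uint{bits}") for bits in (8, 16, 32, 64)]
all_int_dtypes = int_dtypes + uint_dtypes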
@@ -148,37 +141,6 @@ def ld_prune_args(draw):
    return x, window, step, threshold, chunks


# Phases setting without shrinking for complex, conditional draws in
# which shrinking wastes time and adds little information
# (see https://hypothesis.readthedocs.io/en/latest/settings.html#hypothesis.settings.phases)
PHASES_NO_SHRINK = (Phase.explicit, Phase.reuse, Phase.generate, Phase.target)


@given(args=ld_prune_args()) # pylint: disable=no-value-for-parameter
@settings(max_examples=50, deadline=None, phases=PHASES_NO_SHRINK)
@example(args=(np.array([[1, 1], [1, 1]], dtype="uint8"), 1, 1, 0.0, -1))
@pytest.mark.skip(
    reason="Hypothesis generates failures that need investigation: https://github.com/sgkit-dev/sgkit/issues/864"
)
def test_vs_skallel(args):
    x, size, step, threshold, chunks = args

    ds = simulate_genotype_call_dataset(n_variant=x.shape[0], n_sample=x.shape[1])
    ds["call_dosage"] = (["variants", "samples"], da.asarray(x).rechunk({0: chunks}))
    ds = window_by_variant(ds, size=size, step=step)

    ldm = ld_matrix(ds, threshold=threshold)
    has_duplicates = ldm.compute().duplicated(subset=["i", "j"]).any()
    assert not has_duplicates
    idx_drop_ds = maximal_independent_set(ldm)

    idx_drop = np.sort(idx_drop_ds.ld_prune_index_to_drop.data)
    m = allel.locate_unlinked(x, size=size, step=step, threshold=threshold)
    idx_drop_ska = np.sort(np.argwhere(~m).squeeze(axis=1))

    npt.assert_equal(idx_drop_ska, idx_drop)

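The deleted property-based test compared sgkit's pruning pipeline against scikit-allel's locate_unlinked; it was already skipped (see issue 864) and the comparison needs scikit-allel, which this PR removes from the module. For orientation, the sgkit side of that pipeline can still be exercised on its own; a hedged sketch that assumes the same sgkit names imported at the top of this test module, with arbitrary dimensions and threshold:

import dask.array as da
import numpy as np
from sgkit import (
    ld_matrix,
    maximal_independent_set,
    simulate_genotype_call_dataset,
    window_by_variant,
)

# Mirrors the deleted test's setup, minus the scikit-allel comparison.
rng = np.random.default_rng(0)
x = rng.integers(0, 3, size=(50, 10), dtype="uint8")
ds = simulate_genotype_call_dataset(n_variant=x.shape[0], n_sample=x.shape[1])
ds["call_dosage"] = (["variants", "samples"], da.asarray(x))
ds = window_by_variant(ds, size=10, step=5)
ldm = ld_matrix(ds, threshold=0.2)
idx_drop = np.sort(maximal_independent_set(ldm).ld_prune_index_to_drop.data)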

def test_scores():
    # Create zero row vectors except for 1st and 11th
    # (make them have non-zero variance)
3 changes: 3 additions & 0 deletions sgkit/tests/test_popgen.py
@@ -712,6 +712,9 @@ def test_hash_array(n_rows, n_cols):
    _, expected_inverse, expected_counts = np.unique(
        x, axis=0, return_inverse=True, return_counts=True
    )
    # following is needed due to https://github.com/numpy/numpy/issues/26738
    # (workaround from https://github.com/lmcinnes/umap/issues/1138)
    expected_inverse = expected_inverse.reshape(-1)

    # hash columns, then find unique column counts using the hash values
    h = hash_array(x)
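The reshape works around a change in the shape of the inverse index returned by np.unique when an axis is given: it differs between NumPy 1.x and 2.0 (see the linked numpy and umap issues), and flattening it makes the test version-independent. A small illustration with arbitrary array contents:

import numpy as np

x = np.array([[0, 1], [0, 1], [1, 1]])
_, inverse = np.unique(x, axis=0, return_inverse=True)
# Depending on the NumPy version the inverse index may not come back 1-D;
# reshape(-1) normalises it to a flat index with one entry per row of x.
inverse = inverse.reshape(-1)
assert inverse.shape == (3,)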
2 changes: 2 additions & 0 deletions sgkit/utils.py
@@ -362,6 +362,8 @@ def split_array_chunks(n: int, blocks: int) -> Tuple[int, ...]:
    if blocks <= 0:
        raise ValueError(f"Number of blocks ({blocks}) must be >= 0")
    n_div, n_mod = np.divmod(n, blocks)
    n_div = int(n_div)
    n_mod = int(n_mod)
    chunks = n_mod * (n_div + 1,) + (blocks - n_mod) * (n_div,)
    return chunks  # type: ignore[no-any-return]
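np.divmod returns NumPy integer scalars even for plain Python int inputs, and NumPy 2 changed their repr to the np.int64(...) form, so the int() casts keep the returned chunk tuple made of ordinary Python integers whose repr is identical under NumPy 1.x and 2.x. A quick illustration (not from the PR):

import numpy as np

n_div, n_mod = np.divmod(10, 3)
print(type(n_div))               # <class 'numpy.int64'>
print(repr(n_div))               # np.int64(3) under NumPy 2, 3 under NumPy 1.x
print((int(n_div), int(n_mod)))  # (3, 1) as plain Python ints on either version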
