Merge branch 'main' into gpuci-query-planning
rjzamora authored May 2, 2024
2 parents 394a00b + 814ed3b commit 83c8cb4
Showing 31 changed files with 153 additions and 47 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/additional.yml
@@ -13,10 +13,10 @@ jobs:
timeout-minutes: 90
steps:
- name: Checkout source
-        uses: actions/checkout@v4.1.3
+        uses: actions/checkout@v4.1.4

- name: Setup Conda Environment
-        uses: conda-incubator/setup-miniconda@v3.0.3
+        uses: conda-incubator/setup-miniconda@v3.0.4
with:
miniforge-variant: Mambaforge
miniforge-version: latest
@@ -44,10 +44,10 @@ jobs:
python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- name: Checkout source
-        uses: actions/checkout@v4.1.3
+        uses: actions/checkout@v4.1.4

- name: Setup Conda
-        uses: conda-incubator/setup-miniconda@v3.0.3
+        uses: conda-incubator/setup-miniconda@v3.0.4
with:
miniforge-variant: Mambaforge
miniforge-version: latest
4 changes: 2 additions & 2 deletions .github/workflows/conda.yml
@@ -28,11 +28,11 @@ jobs:
name: Build (and upload)
runs-on: ubuntu-latest
steps:
-      - uses: actions/checkout@v4.1.3
+      - uses: actions/checkout@v4.1.4
with:
fetch-depth: 0
- name: Set up Python
-        uses: conda-incubator/setup-miniconda@v3.0.3
+        uses: conda-incubator/setup-miniconda@v3.0.4
with:
miniforge-variant: Mambaforge
use-mamba: true
2 changes: 1 addition & 1 deletion .github/workflows/pre-commit.yml
@@ -11,7 +11,7 @@ jobs:
name: pre-commit hooks
runs-on: ubuntu-latest
steps:
-      - uses: actions/checkout@v4.1.3
+      - uses: actions/checkout@v4.1.4
- uses: actions/setup-python@v5
with:
python-version: '3.9'
4 changes: 2 additions & 2 deletions .github/workflows/test-report.yaml
@@ -21,12 +21,12 @@ jobs:
run:
shell: bash -l {0}
steps:
-      - uses: actions/checkout@v4.1.3
+      - uses: actions/checkout@v4.1.4
with:
repository: dask/distributed

- name: Setup Conda Environment
-        uses: conda-incubator/setup-miniconda@v3.0.3
+        uses: conda-incubator/setup-miniconda@v3.0.4
with:
miniforge-variant: Mambaforge
miniforge-version: latest
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
@@ -59,7 +59,7 @@ jobs:

steps:
- name: Checkout source
-        uses: actions/checkout@v4.1.3
+        uses: actions/checkout@v4.1.4
with:
fetch-depth: 0 # Needed by codecov.io

@@ -71,7 +71,7 @@ jobs:
java-version: "11"

- name: Setup Conda Environment
-        uses: conda-incubator/setup-miniconda@v3.0.3
+        uses: conda-incubator/setup-miniconda@v3.0.4
with:
miniforge-variant: Mambaforge
miniforge-version: latest
2 changes: 1 addition & 1 deletion .github/workflows/update-gpuci.yml
@@ -11,7 +11,7 @@ jobs:
if: github.repository == 'dask/dask'

steps:
-      - uses: actions/checkout@v4.1.3
+      - uses: actions/checkout@v4.1.4

- name: Parse current axis YAML
id: rapids_current
6 changes: 3 additions & 3 deletions .github/workflows/upstream.yml
@@ -20,7 +20,7 @@ jobs:
outputs:
test-upstream: ${{ steps.detect-trigger.outputs.trigger-found }}
steps:
-      - uses: actions/checkout@v4.1.3
+      - uses: actions/checkout@v4.1.4
with:
fetch-depth: 2
- uses: xarray-contrib/ci-trigger@v1
@@ -47,10 +47,10 @@ jobs:

steps:
- name: Checkout source
-        uses: actions/checkout@v4.1.3
+        uses: actions/checkout@v4.1.4

- name: Setup Conda Environment
-        uses: conda-incubator/setup-miniconda@v3.0.3
+        uses: conda-incubator/setup-miniconda@v3.0.4
with:
miniforge-variant: Mambaforge
miniforge-version: latest
7 changes: 4 additions & 3 deletions dask/array/tests/test_array_core.py
@@ -430,8 +430,8 @@ def test_stack_rechunk():


def test_stack_unknown_chunksizes():
-    dd = pytest.importorskip("dask.dataframe")
    pd = pytest.importorskip("pandas")
+    dd = pytest.importorskip("dask.dataframe")

a_df = pd.DataFrame({"x": np.arange(12)})
b_df = pd.DataFrame({"y": np.arange(12) * 10})
@@ -546,8 +546,8 @@ def test_concatenate_types(dtypes):


def test_concatenate_unknown_axes():
-    dd = pytest.importorskip("dask.dataframe")
    pd = pytest.importorskip("pandas")
+    dd = pytest.importorskip("dask.dataframe")

a_df = pd.DataFrame({"x": np.arange(12)})
b_df = pd.DataFrame({"y": np.arange(12) * 10})
@@ -2211,6 +2211,7 @@ def test_to_hdf5():


def test_to_dask_dataframe():
+    pytest.importorskip("pandas")
dd = pytest.importorskip("dask.dataframe")
a = da.ones((4,), chunks=(2,))
d = a.to_dask_dataframe()
@@ -2770,8 +2771,8 @@ def test_asarray(asarray):
@pytest.mark.parametrize("asarray", [da.asarray, da.asanyarray])
def test_asarray_dask_dataframe(asarray):
# https://github.com/dask/dask/issues/3885
+    pd = pytest.importorskip("pandas")
    dd = pytest.importorskip("dask.dataframe")
-    import pandas as pd

s = dd.from_pandas(pd.Series([1, 2, 3, 4]), 2)
result = asarray(s)
11 changes: 10 additions & 1 deletion dask/array/tests/test_rechunk.py
@@ -286,6 +286,7 @@ def test_rechunk_same():


def test_rechunk_same_fully_unknown():
+    pytest.importorskip("pandas")
dd = pytest.importorskip("dask.dataframe")
x = da.ones(shape=(10, 10), chunks=(5, 10))
y = dd.from_array(x).values
@@ -299,6 +300,7 @@ def test_rechunk_same_fully_unknown_floats():
"""Similar to test_rechunk_same_fully_unknown but testing the behavior if
``float("nan")`` is used instead of the recommended ``np.nan``
"""
+    pytest.importorskip("pandas")
dd = pytest.importorskip("dask.dataframe")
x = da.ones(shape=(10, 10), chunks=(5, 10))
y = dd.from_array(x).values
@@ -308,6 +310,7 @@ def test_rechunk_same_partially_unknown():


def test_rechunk_same_partially_unknown():
+    pytest.importorskip("pandas")
dd = pytest.importorskip("dask.dataframe")
x = da.ones(shape=(10, 10), chunks=(5, 10))
y = dd.from_array(x).values
@@ -592,8 +595,8 @@ def test_intersect_nan_long():


def test_rechunk_unknown_from_pandas():
-    dd = pytest.importorskip("dask.dataframe")
    pd = pytest.importorskip("pandas")
+    dd = pytest.importorskip("dask.dataframe")

arr = np.random.default_rng().standard_normal((50, 10))
x = dd.from_pandas(pd.DataFrame(arr), 2).values
@@ -606,6 +609,7 @@ def test_rechunk_unknown_from_pandas():


def test_rechunk_unknown_from_array():
+    pytest.importorskip("pandas")
dd = pytest.importorskip("dask.dataframe")
# pd = pytest.importorskip('pandas')
x = dd.from_array(da.ones(shape=(4, 4), chunks=(2, 2))).values
@@ -635,6 +639,7 @@ def test_rechunk_unknown_from_array():
],
)
def test_rechunk_with_fully_unknown_dimension(x, chunks):
+    pytest.importorskip("pandas")
dd = pytest.importorskip("dask.dataframe")
y = dd.from_array(x).values
result = y.rechunk(chunks)
@@ -661,6 +666,7 @@ def test_rechunk_with_fully_unknown_dimension(x, chunks):
],
)
def test_rechunk_with_partially_unknown_dimension(x, chunks):
+    pytest.importorskip("pandas")
dd = pytest.importorskip("dask.dataframe")
y = dd.from_array(x).values
z = da.concatenate([x, y])
@@ -680,6 +686,7 @@ def test_rechunk_with_partially_unknown_dimension(x, chunks):
],
)
def test_rechunk_with_fully_unknown_dimension_explicit(new_chunks):
+    pytest.importorskip("pandas")
dd = pytest.importorskip("dask.dataframe")
x = da.ones(shape=(10, 10), chunks=(5, 2))
y = dd.from_array(x).values
@@ -698,6 +705,7 @@ def test_rechunk_with_fully_unknown_dimension_explicit(new_chunks):
],
)
def test_rechunk_with_partially_unknown_dimension_explicit(new_chunks):
+    pytest.importorskip("pandas")
dd = pytest.importorskip("dask.dataframe")
x = da.ones(shape=(10, 10), chunks=(5, 2))
y = dd.from_array(x).values
@@ -715,6 +723,7 @@ def assert_chunks_match(left, right):


def test_rechunk_unknown_raises():
+    pytest.importorskip("pandas")
dd = pytest.importorskip("dask.dataframe")

x = da.ones(shape=(10, 10), chunks=(5, 5))
6 changes: 5 additions & 1 deletion dask/bag/tests/test_bag.py
@@ -775,6 +775,7 @@ def test_from_long_sequence():


def test_from_empty_sequence():
+    pytest.importorskip("pandas")
pytest.importorskip("dask.dataframe")
b = db.from_sequence([])
assert b.npartitions == 1
@@ -878,8 +879,8 @@ def test_args():


def test_to_dataframe():
-    dd = pytest.importorskip("dask.dataframe")
    pd = pytest.importorskip("pandas")
+    dd = pytest.importorskip("dask.dataframe")

def check_parts(df, sol):
assert all(
@@ -1608,6 +1609,7 @@ def f_drop(o):


def test_bagged_array_delayed():
+    pytest.importorskip("numpy")
da = pytest.importorskip("dask.array")

obj = da.ones(10, chunks=5).to_delayed()[0]
@@ -1631,6 +1633,7 @@ def test_dask_layers():
def test_dask_layers_to_delayed(optimize):
# `da.Array.to_delayed` causes the layer name to not match the key.
# Ensure the layer name is propagated between `Delayed` and `Item`.
+    pytest.importorskip("numpy")
da = pytest.importorskip("dask.array")
i = db.Item.from_delayed(da.ones(1).to_delayed()[0])
name = i.key[0]
@@ -1660,6 +1663,7 @@ def test_dask_layers_to_delayed(optimize):


def test_to_dataframe_optimize_graph():
+    pytest.importorskip("pandas")
dd = pytest.importorskip("dask.dataframe")

from dask.dataframe.utils import assert_eq as assert_eq_df
6 changes: 4 additions & 2 deletions dask/dataframe/io/tests/test_io.py
@@ -844,15 +844,17 @@ def test_to_delayed_optimize_graph():
d = ddf2.to_delayed()[0]
assert len(d.dask) < 20
d2 = ddf2.to_delayed(optimize_graph=False)[0]
-    assert sorted(d2.dask) == sorted(ddf2.dask)
+    if not dd._dask_expr_enabled():
+        assert sorted(d2.dask) == sorted(ddf2.dask)
assert_eq(ddf2.get_partition(0), d.compute())
assert_eq(ddf2.get_partition(0), d2.compute())

# Scalar
x = ddf2.x.sum()
dx = x.to_delayed()
dx2 = x.to_delayed(optimize_graph=False)
-    assert len(dx.dask) < len(dx2.dask)
+    if not dd._dask_expr_enabled():
+        assert len(dx.dask) < len(dx2.dask)
assert_eq(dx.compute(), dx2.compute())


3 changes: 2 additions & 1 deletion dask/dataframe/io/tests/test_orc.py
@@ -94,6 +94,7 @@ def test_orc_roundtrip(tmpdir, index, columns):
),
}
)
+    data.iloc[0, 0] = 100
if index:
data = data.set_index(index)
df = dd.from_pandas(data, chunksize=500)
@@ -105,7 +106,7 @@

# Read
df2 = dd.read_orc(tmp, index=index, columns=columns)
-    assert_eq(data, df2, check_index=bool(index))
+    assert_eq(data, df2, check_index=False)


@pytest.mark.parametrize("split_stripes", [True, False, 2, 4])
8 changes: 1 addition & 7 deletions dask/dataframe/io/tests/test_parquet.py
@@ -20,12 +20,7 @@
import dask.multiprocessing
from dask.array.numpy_compat import NUMPY_GE_124
from dask.blockwise import Blockwise, optimize_blockwise
-from dask.dataframe._compat import (
-    PANDAS_GE_150,
-    PANDAS_GE_200,
-    PANDAS_GE_202,
-    PANDAS_GE_300,
-)
+from dask.dataframe._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_GE_202
from dask.dataframe.io.parquet.core import get_engine
from dask.dataframe.io.parquet.utils import _parse_pandas_metadata
from dask.dataframe.optimize import optimize_dataframe_getitem
@@ -480,7 +475,6 @@ def test_calculate_divisions_no_index(tmpdir, write_engine, read_engine):
assert not df.known_divisions


-@pytest.mark.xfail(PANDAS_GE_300, reason="KeyError")
def test_columns_index_with_multi_index(tmpdir, engine):
fn = os.path.join(str(tmpdir), "test.parquet")
index = pd.MultiIndex.from_arrays(
2 changes: 2 additions & 0 deletions dask/dataframe/io/tests/test_sql.py
@@ -6,6 +6,7 @@

import pytest

+from dask.dataframe._compat import PANDAS_GE_300
from dask.dataframe.io.sql import read_sql, read_sql_query, read_sql_table
from dask.dataframe.utils import assert_eq, get_string_dtype
from dask.utils import tmpfile
@@ -276,6 +277,7 @@ def test_divisions(db):
assert_eq(data, df[["name"]][df.index <= 4])


+@pytest.mark.xfail(PANDAS_GE_300, reason="memory doesn't match")
def test_division_or_partition(db):
with pytest.raises(TypeError, match="either 'divisions' or 'npartitions'"):
read_sql_table(
4 changes: 3 additions & 1 deletion dask/dataframe/tests/test_dataframe.py
@@ -4613,7 +4613,9 @@ def test_idxmaxmin_empty_partitions():
)

if PANDAS_GE_300:
-        ctx = pytest.raises(ValueError, match="Encountered all NA values")
+        ctx = pytest.raises(
+            ValueError, match="Encountered all NA values|Encountered an NA value with"
+        )
elif PANDAS_GE_210:
ctx = pytest.warns(FutureWarning, match="all-NA values")
else:
1 change: 1 addition & 0 deletions dask/diagnostics/tests/test_progress.py
@@ -22,6 +22,7 @@ def check_bar_completed(capsys, width=40):


def test_array_compute(capsys):
+    pytest.importorskip("numpy")
da = pytest.importorskip("dask.array")

data = da.ones((100, 100), dtype="f4", chunks=(100, 100))