Skip to content

Commit

Permalink
generalize
Browse files (browse the repository at this point in the history)
  • Loading branch information
rjzamora committed Oct 9, 2023
1 parent 69ab999 commit 32113b5
Showing 1 changed file with 18 additions and 10 deletions.
28 changes: 18 additions & 10 deletions dask/dataframe/partitionquantiles.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -78,6 +78,7 @@
from tlz import merge, merge_sorted, take

from dask.base import tokenize
from dask.dataframe._compat import PANDAS_GE_150
from dask.dataframe.core import Series
from dask.dataframe.dispatch import tolist_dispatch
from dask.utils import is_cupy_type, random_state_data
Expand Down Expand Up @@ -413,6 +414,9 @@ def percentiles_summary(df, num_old, num_new, upsample, state):
Scale factor to increase the number of percentiles calculated in
each partition. Use to improve accuracy.
"""
from dask.array.dispatch import percentile_lookup as _percentile
from dask.array.utils import array_safe

length = len(df)
if length == 0:
return ()
Expand All @@ -432,17 +436,21 @@ def percentiles_summary(df, num_old, num_new, upsample, state):
try:
vals = data.quantile(q=qs / 100, interpolation=interpolation).values
except (TypeError, NotImplementedError):
interpolation = "nearest"
vals = (
data.to_frame()
.quantile(
q=qs / 100,
interpolation=interpolation,
numeric_only=False,
method="table",
if PANDAS_GE_150:
# NOTE: Required when data is a string column in cudf
interpolation = "nearest"
vals = (
data.to_frame()
.quantile(
q=qs / 100,
interpolation=interpolation,
numeric_only=False,
method="table",
)
.iloc[:, 0]
)
.iloc[:, 0]
)
else:
vals, _ = _percentile(array_safe(data, like=data.values), qs, interpolation)

if (
is_cupy_type(data)
Expand Down

0 comments on commit 32113b5

Please sign in to comment.