Skip to content

Commit

Permalink
align and simplify percentiles_summary logic
Browse files Browse the repository at this point in the history
  • Loading branch information
rjzamora committed May 3, 2024
1 parent efb4a62 commit 4f29798
Showing 1 changed file with 19 additions and 8 deletions.
27 changes: 19 additions & 8 deletions dask/dataframe/partitionquantiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,10 @@
from tlz import merge, merge_sorted, take

from dask.base import tokenize
from dask.dataframe._compat import PANDAS_GE_150
from dask.dataframe.core import Series
from dask.dataframe.dispatch import tolist_dispatch
from dask.dataframe.utils import is_series_like
from dask.utils import is_cupy_type, random_state_data


Expand Down Expand Up @@ -430,16 +432,13 @@ def percentiles_summary(df, num_old, num_new, upsample, state):
elif is_datetime64_dtype(data.dtype) or is_integer_dtype(data.dtype):
interpolation = "nearest"

# FIXME: pandas quantile doesn't work with some data types (e.g. strings).
# We fall back to an ndarray as a workaround.
try:
vals = data.quantile(q=qs / 100, interpolation=interpolation).values
# Plan A: Use `Series.quantile`
vals = data.quantile(q=qs / 100, interpolation=interpolation)
except (TypeError, NotImplementedError):
try:
vals, _ = _percentile(array_safe(data, like=data.values), qs, interpolation)
except (TypeError, NotImplementedError):
# `data.values` doesn't work for cudf, so we need to
# use `quantile(..., method="table")` as a fallback
# Series.quantile doesn't work with some data types (e.g. strings)
if PANDAS_GE_150:
# Fall back to DataFrame.quantile with "nearest" interpolation
interpolation = "nearest"
vals = (
data.to_frame()
Expand All @@ -451,6 +450,18 @@ def percentiles_summary(df, num_old, num_new, upsample, state):
)
.iloc[:, 0]
)
else:
# Fall back to ndarray (Not supported for cuDF)
vals, _ = _percentile(array_safe(data, like=data.values), qs, interpolation)

# Convert to array if necessary (and possible)
if is_series_like(vals):
try:
vals = vals.values
except (ValueError, TypeError):
# cudf->cupy won't work if nulls are present,
# or if this is a string dtype
pass

if (
is_cupy_type(data)
Expand Down

0 comments on commit 4f29798

Please sign in to comment.