Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support empty arrays, improve ibis.array() API #9473

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion ibis/backends/pandas/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@ def visit(cls, op: ops.Node, **kwargs):

@classmethod
def visit(cls, op: ops.Literal, value, dtype):
if dtype.is_interval():
if value is None:
value = None
elif dtype.is_interval():
value = pd.Timedelta(value, dtype.unit.short)
elif dtype.is_array():
value = np.array(value)
Expand Down Expand Up @@ -219,6 +221,11 @@ def visit(cls, op: ops.FindInSet, needle, values):
result = np.select(condlist, choicelist, default=-1)
return pd.Series(result, name=op.name)

@classmethod
def visit(cls, op: ops.EmptyArray, dtype):
pdt = PandasType.from_ibis(dtype)
return np.array([], dtype=pdt)

@classmethod
def visit(cls, op: ops.Array, exprs):
return cls.rowwise(lambda row: np.array(row, dtype=object), exprs)
Expand Down
15 changes: 8 additions & 7 deletions ibis/backends/polars/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,7 @@ def literal(op, **_):
value = op.value
dtype = op.dtype

if dtype.is_array():
value = pl.Series("", value)
typ = PolarsType.from_ibis(dtype)
val = pl.lit(value, dtype=typ)
return val.implode()
elif dtype.is_struct():
if dtype.is_struct():
values = [
pl.lit(v, dtype=PolarsType.from_ibis(dtype[k])).alias(k)
for k, v in value.items()
Expand All @@ -106,7 +101,7 @@ def literal(op, **_):
return pl.lit(value)
else:
typ = PolarsType.from_ibis(dtype)
return pl.lit(op.value, dtype=typ)
return pl.lit(value, dtype=typ)


_TIMESTAMP_SCALE_TO_UNITS = {
Expand Down Expand Up @@ -973,6 +968,12 @@ def array_concat(op, **kw):
return result


@translate.register(ops.EmptyArray)
def empty_array(op, **kw):
pdt = PolarsType.from_ibis(op.dtype)
return pl.lit([], dtype=pdt)


@translate.register(ops.Array)
def array_column(op, **kw):
cols = [translate(col, **kw) for col in op.exprs]
Expand Down
3 changes: 3 additions & 0 deletions ibis/backends/sql/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,9 @@ def visit_InSubquery(self, op, *, rel, needle):
query = sg.select(STAR).from_(query)
return needle.isin(query=query)

def visit_EmptyArray(self, op, *, dtype):
return self.cast(self.f.array(), dtype)

def visit_Array(self, op, *, exprs):
return self.f.array(*exprs)

Expand Down
68 changes: 67 additions & 1 deletion ibis/backends/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
PySparkAnalysisException,
TrinoUserError,
)
from ibis.common.annotations import ValidationError
from ibis.common.collections import frozendict

pytestmark = [
Expand Down Expand Up @@ -72,6 +73,71 @@
# list.


def test_array_factory(con):
a = ibis.array([1, 2, 3])
assert a.type() == dt.Array(value_type=dt.Int8)
assert con.execute(a) == [1, 2, 3]

a2 = ibis.array(a)
assert a.type() == dt.Array(value_type=dt.Int8)
assert con.execute(a2) == [1, 2, 3]


def test_array_factory_typed(con):
typed = ibis.array([1, 2, 3], type="array<string>")
assert con.execute(typed) == ["1", "2", "3"]

typed2 = ibis.array(ibis.array([1, 2, 3]), type="array<string>")
assert con.execute(typed2) == ["1", "2", "3"]


@pytest.mark.notimpl("flink", raises=Py4JJavaError)
def test_array_factory_empty(con):
with pytest.raises(ValidationError):
ibis.array([])

empty_typed = ibis.array([], type="array<string>")
assert empty_typed.type() == dt.Array(value_type=dt.string)
assert con.execute(empty_typed) == []


@pytest.mark.notyet(
"clickhouse", raises=ClickHouseDatabaseError, reason="nested types can't be NULL"
)
def test_array_factory_null(con):
with pytest.raises(ValidationError):
ibis.array(None)
with pytest.raises(ValidationError):
ibis.array(None, type="int64")
none_typed = ibis.array(None, type="array<string>")
assert none_typed.type() == dt.Array(value_type=dt.string)
assert con.execute(none_typed) is None

nones = ibis.array([None, None], type="array<string>")
assert nones.type() == dt.Array(value_type=dt.string)
assert con.execute(nones) == [None, None]

# Execute a real value here, so the backends that don't support arrays
# actually xfail as we expect them to.
# Otherwise would have to @mark.xfail every test in this file besides this one.
assert con.execute(ibis.array([1, 2])) == [1, 2]


@pytest.mark.broken(
["datafusion", "flink", "polars"],
raises=AssertionError,
reason="[None, 1] executes to [np.nan, 1.0]",
)
def test_array_factory_null_mixed(con):
none_and_val = ibis.array([None, 1])
assert none_and_val.type() == dt.Array(value_type=dt.Int8)
assert con.execute(none_and_val) == [None, 1]

none_and_val_typed = ibis.array([None, 1], type="array<string>")
assert none_and_val_typed.type() == dt.Array(value_type=dt.String)
assert con.execute(none_and_val_typed) == [None, "1"]


def test_array_column(backend, alltypes, df):
expr = ibis.array(
[alltypes["double_col"], alltypes["double_col"], 5.0, ibis.literal(6.0)]
Expand Down Expand Up @@ -1356,7 +1422,7 @@ def test_unnest_range(con):
pytest.mark.notyet(["bigquery"], raises=GoogleBadRequest),
pytest.mark.broken(
["polars"],
reason="expression input not supported with nested arrays",
reason="upstream polars bug: https://github.com/pola-rs/polars/issues/17294",
raises=TypeError,
),
],
Expand Down
7 changes: 3 additions & 4 deletions ibis/backends/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1431,13 +1431,12 @@ def query(t, group_cols):
snapshot.assert_match(str(ibis.to_sql(t3, dialect=con.name)), "out.sql")


@pytest.mark.notimpl(["oracle", "exasol"], raises=com.OperationNotDefinedError)
@pytest.mark.notimpl(["druid"], raises=AssertionError)
@pytest.mark.notyet(
["datafusion", "impala", "mssql", "mysql", "sqlite"],
["datafusion", "exasol", "impala", "mssql", "mysql", "oracle", "sqlite"],
reason="backend doesn't support arrays and we don't implement pivot_longer with unions yet",
raises=com.OperationNotDefinedError,
raises=(com.OperationNotDefinedError, com.UnsupportedBackendType),
)
@pytest.mark.notimpl(["druid"], raises=AssertionError)
@pytest.mark.broken(
["trino"],
reason="invalid code generated for unnesting a struct",
Expand Down
16 changes: 13 additions & 3 deletions ibis/backends/tests/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,18 @@
ibis.struct(dict(abc=432)),
marks=[
pytest.mark.never(
["impala", "mysql", "sqlite", "mssql", "exasol"],
raises=(NotImplementedError, exc.UnsupportedBackendType),
[
"exasol",
"impala",
"mysql",
"sqlite",
"mssql",
],
raises=(
exc.OperationNotDefinedError,
NotImplementedError,
exc.UnsupportedBackendType,
),
reason="structs not supported in the backend",
),
pytest.mark.notimpl(
Expand Down Expand Up @@ -104,7 +114,7 @@ def test_isin_bug(con, snapshot):
@pytest.mark.notyet(
["datafusion", "exasol", "oracle", "flink", "risingwave"],
reason="no unnest support",
raises=exc.OperationNotDefinedError,
raises=(exc.OperationNotDefinedError, exc.UnsupportedBackendType),
)
@pytest.mark.notyet(
["sqlite", "mysql", "druid", "impala", "mssql"], reason="no unnest support upstream"
Expand Down
10 changes: 5 additions & 5 deletions ibis/backends/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,18 +835,18 @@ def test_capitalize(con, inp, expected):
assert pd.isnull(result)


@pytest.mark.never(
["exasol", "impala", "mssql", "mysql", "sqlite"],
reason="Backend doesn't support arrays",
raises=(com.OperationNotDefinedError, com.UnsupportedBackendType),
)
@pytest.mark.notimpl(
[
"dask",
"pandas",
"polars",
"oracle",
"flink",
"sqlite",
"mssql",
"mysql",
"exasol",
"impala",
],
raises=com.OperationNotDefinedError,
)
Expand Down
19 changes: 13 additions & 6 deletions ibis/expr/operations/arrays.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,34 @@

from __future__ import annotations

from typing import Optional
from typing import Annotated, Optional

from public import public

import ibis.expr.datashape as ds
import ibis.expr.datatypes as dt
import ibis.expr.rules as rlz
from ibis.common.annotations import attribute
from ibis.common.patterns import Length # noqa: TCH001
from ibis.common.typing import VarTuple # noqa: TCH001
from ibis.expr.operations.core import Unary, Value


@public
class EmptyArray(Value):
"""Construct an array with 0 elements."""

dtype: dt.Array
shape = ds.scalar


@public
class Array(Value):
"""Construct an array."""
"""Construct an array with 1+ elements. Use `EmptyArray` for empty arrays."""

exprs: VarTuple[Value]
exprs: Annotated[VarTuple[Value], Length(at_least=1)]

@attribute
def shape(self):
return rlz.highest_precedence_shape(self.exprs)
shape = rlz.shape_like("exprs")

@attribute
def dtype(self):
Expand Down
62 changes: 52 additions & 10 deletions ibis/expr/types/arrays.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,17 @@

from public import public

import ibis
import ibis.expr.datatypes as dt
import ibis.expr.operations as ops
import ibis.expr.types as ir
from ibis.common.annotations import ValidationError
from ibis.common.deferred import Deferred, deferrable
from ibis.expr.types.generic import Column, Scalar, Value

if TYPE_CHECKING:
from collections.abc import Callable, Iterable

import ibis.expr.types as ir
from ibis.expr.types.typing import V

import ibis.common.exceptions as com


Expand Down Expand Up @@ -1067,7 +1068,11 @@

@public
@deferrable
def array(values: Iterable[V]) -> ArrayValue:
def array(
values: ArrayValue | Iterable | ir.NullValue | None,
*,
type: str | dt.DataType | None = None,
) -> ArrayValue:
"""Create an array expression.

If any values are [column expressions](../concepts/datatypes.qmd) the
Expand All @@ -1078,6 +1083,9 @@
----------
values
An iterable of Ibis expressions or Python literals
type
An instance of `ibis.expr.datatypes.DataType` or a string indicating
the Ibis type of `value`. eg `array<float>`.

Returns
-------
Expand Down Expand Up @@ -1108,15 +1116,49 @@
│ [3, 42, ... +1] │
└──────────────────────┘

>>> ibis.array([t.a, 42 + ibis.literal(5)])
>>> ibis.array([t.a, 42 + ibis.literal(5)], type="array<float>")
┏━━━━━━━━━━━━━━━━━━━━━━┓
┃ Array() ┃
┡━━━━━━━━━━━━━━━━━━━━━━┩
│ array<int64>
│ array<float64>
├──────────────────────┤
│ [1, 47]
│ [2, 47]
│ [3, 47]
│ [1.0, 47.0]
│ [2.0, 47.0]
│ [3.0, 47.0]
└──────────────────────┘
"""
return ops.Array(tuple(values)).to_expr()
type = dt.dtype(type) if type is not None else None
if type is not None and not isinstance(type, dt.Array):
raise ValidationError(f"type must be an array, got {type}")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we just let this fall through and bubble up as it would? This function is already getting stuffed with a bunch of branching, and this doesn't seem to add much value.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm guarding against the footgun of someone passing in "float" instead of "array" (I literally accidentally did this as I adjusted the tests this last round), which would silently give the wrong result for some inputs. I agree not needed, I can remove, just a nicety


if isinstance(values, ir.Value):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we're going to accept inputs like this. Can you get rid of this code path?

if type is not None:
return values.cast(type)
elif isinstance(values, ArrayValue):
return values
else:
raise ValidationError(

Check warning on line 1140 in ibis/expr/types/arrays.py

View check run for this annotation

Codecov / codecov/patch

ibis/expr/types/arrays.py#L1140

Added line #L1140 was not covered by tests
f"If no type passed, values must be an array, got {values.type()}"
)

if values is None:
if type is None:
raise ValidationError("If values is None/NULL, type must be provided")
return ir.null(type)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like this. It's unnecessary to have to support another way to construct a null value.

If someone wants a NULL with an array type, they should use the null function.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a thought here but I better do it at my computer

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See these tests in mismo:

I need to construct an array<int> with a variety of values:

  • NULL
  • []
  • [1,2,3]

There are three different ibis APIs for doing this:

  • ibis.null(type="array<int>"): this only works for the NULL case. I would need some if/else branching in my test setup to use this for the NULL case, but would need something else for the other cases.
  • ibis.literal(val, type="array<int>"). This is currently what I do, and it works for all 3 inputs here, no if/else branching required. But the docstring for literal() says that it is going to get deprecated soon for constructing complex types, and I should use array()/map()/struct() instead.
  • ibis.array(): If we keep the functionality the way I propose here, then I can do ibis.array(val, type="array<int>") and it works for all three input cases. If we don't allow passing in None, then I would need to do some if/else branching to use ibis.null() in that case.

I think it can be summarized as SOMEONE is gonna need to do this if values is None: branching, and I would rather have it be down here in one place in library code, than in many places in user code.


values = tuple(values)
if len(values) == 0:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use standard Python idioms for this.

Suggested change
if len(values) == 0:
if not values:

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops sorry will do.

if type is None:
raise ValidationError("If values is empty, type must be provided")
return ops.EmptyArray(type).to_expr()
else:
value_type = type.value_type if type is not None else None
values = [_value(v, value_type) for v in values]
return ops.Array(values).to_expr()


def _value(x, type) -> ir.Value:
if isinstance(x, (ir.Value, Deferred)):
return x.cast(type) if type is not None else x
else:
return ibis.literal(x, type=type)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the purpose of literal here?

Copy link
Contributor Author

@NickCrews NickCrews Jun 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you do ibis.array([1, 2], type="array<float>"), we somehow need to get those ints to floats. My original implementation stored them in the array as ints, and convert to float during compilation. But that required storing the eventual type in ops.Array. so here we pre convert all the elements to floats, and now ops.Array can just use rlz.highest_precedence_dtype(). In order to do that pre casting, we need this literal.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm still not sure why this is required. Is there some API that fails to execute if you don't do these casts?

What if the elements themselves are arrays? Doesn't this then need to be recursive in the casting/call to _value?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shoot, github escaped the <> I posted, I meant ibis.array([1, 2], type="array<float>") not ibis.array([1, 2], type="array")

Now does the concern make sense? Two ways of storing this:

  1. store the inputs as is, and keep track of the dtype separately op.Array(values=[1,2], value_type="float")
  2. cast the inputs early, then we don't need to store the dtype: op.Array(values=[op.Literal(1, "float"), op.Literal(2, "float")])

I am trying to go for option 2.

I'm still not sure why this is required

Specificaly, if I remove the .literal() here, then we pass a plain python 1 into the op.Array constructor, and then using pattern matching it tries to coerce that to an ibis expression, which yields a op.Literal(1, int). But we wanted it to be a float, so we have to do the casting up front here.

What if the elements themselves are arrays

you mean what if x were a python list of ibis values? ie someone did array_of_arrays = ibis.array([[ibis.literal(1)], [ibis.literal(2)]])? Yes, this would fail, good catch. not sure what to do about it yet....