Skip to content

Commit

Permalink
feat: support type kwarg in array() and map()
Browse files Browse the repository at this point in the history
fixes ibis-project#8289

This commit makes a lot of changes. It was hard for me to separate them out as I implemented them, but now that it's all hashed out, I can try to split it into separate commits if you want, though that might be somewhat hard in
some cases.

Several of the backends were always broken here; they just weren't getting caught. I marked them as broken,
and we can fix them in a follow-up.

You can test this locally with, e.g.:
`pytest -m duckdb -k factory ibis/backends/tests/test_array.py ibis/backends/tests/test_map.py ibis/backends/tests/test_struct.py`

Also, fix a typing bug: map() can accept ArrayValues, not just ArrayColumns

Also, fix executing NULL arrays on pandas.

Also, fix casting structs on pandas.
See ibis-project#8687

Also, support passing in None.

Also, raise an error when the value type can't be inferred from empty Python literals
(e.g., what is the value type for the elements of `[]`?)

Also, make the type argument for struct() always have an effect, not just when passing in python literals.
So basically it can act like a cast.

Also, make these constructors idempotent.
  • Loading branch information
NickCrews committed Mar 18, 2024
1 parent 220085e commit b83016c
Show file tree
Hide file tree
Showing 8 changed files with 236 additions and 49 deletions.
8 changes: 8 additions & 0 deletions ibis/backends/pandas/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,11 @@ def convert_String(cls, s, dtype, pandas_type):
# TODO(kszucs): should switch to the new pandas string type and convert
# object columns using s.convert_dtypes() method
return s.map(str, na_action="ignore").astype(object)

@classmethod
def convert_Struct(cls, s, dtype, pandas_type):
    """Convert a series of struct values into plain ``dict`` objects.

    Each non-null element of ``s`` is normalized against ``dtype`` and
    wrapped in a regular ``dict``; the mapped series is then cast to
    ``pandas_type``.
    """

    # dt.normalize returns a frozendict, which doesn't play well
    # with the pandas executor
    # https://github.com/ibis-project/ibis/issues/8687
    def _as_plain_dict(value):
        if value is pd.NA:
            return value
        return dict(dt.normalize(dtype, value))

    mapped = s.map(_as_plain_dict, na_action="ignore")
    return mapped.astype(pandas_type)
34 changes: 34 additions & 0 deletions ibis/backends/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,40 @@
# list.


def test_array_factory(con):
    """Check ibis.array() on literals and on existing array expressions."""
    ints = ibis.array([1, 2, 3])
    assert con.execute(ints) == [1, 2, 3]

    # Feeding an array expression back into the factory.
    rewrapped = ibis.array(ints)
    assert con.execute(rewrapped) == [1, 2, 3]

    # An explicit type applied to Python literals.
    floats = ibis.array([1, 2, 3], type="array<float64>")
    assert con.execute(floats) == [1.0, 2.0, 3.0]

    # An explicit type applied to an existing array expression.
    recast = ibis.array(ints, type="array<float64>")
    assert con.execute(recast) == [1.0, 2.0, 3.0]


@pytest.mark.notimpl("postgres", raises=PsycoPg2IndeterminateDatatype)
def test_array_factory_empty(con):
    """Check that an empty array requires an explicit type."""
    # Without a type there is nothing to infer the element type from.
    with pytest.raises(TypeError):
        ibis.array([])

    typed_empty = ibis.array([], type="array<float64>")
    assert str(typed_empty.type()) == "array<float64>"
    assert con.execute(typed_empty) == []


@pytest.mark.broken("polars", raises=AssertionError)
@pytest.mark.broken("pandas", raises=TypeError)
@pytest.mark.notyet(
    "clickhouse", raises=ClickHouseDatabaseError, reason="nested types can't be NULL"
)
def test_array_factory_null(con):
    """Check that ibis.array(None) requires a type and executes to None."""
    with pytest.raises(TypeError):
        ibis.array(None)

    typed_null = ibis.array(None, type="array<float64>")
    assert str(typed_null.type()) == "array<float64>"
    assert con.execute(typed_null) is None


def test_array_column(backend, alltypes, df):
expr = ibis.array(
[alltypes["double_col"], alltypes["double_col"], 5.0, ibis.literal(6.0)]
Expand Down
48 changes: 47 additions & 1 deletion ibis/backends/tests/test_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@
import ibis
import ibis.common.exceptions as exc
import ibis.expr.datatypes as dt
from ibis.backends.tests.errors import PsycoPg2InternalError, Py4JJavaError
from ibis.backends.tests.errors import (
ClickHouseDatabaseError,
PsycoPg2InternalError,
Py4JJavaError,
)

pytestmark = [
pytest.mark.never(
Expand All @@ -26,6 +30,48 @@
]


@pytest.mark.notimpl(
    ["risingwave"],
    raises=PsycoPg2InternalError,
    reason="function hstore(character varying[], character varying[]) does not exist",
)
@pytest.mark.notyet("postgres", reason="only support maps of string -> string")
def test_map_factory(con):
    """Check ibis.map() on Python mappings and on existing map expressions."""
    untyped = ibis.map({"a": 1, "b": 2})
    assert con.execute(untyped) == {"a": 1, "b": 2}

    # Feeding a map expression back into the factory.
    rewrapped = ibis.map(untyped)
    assert con.execute(rewrapped) == {"a": 1, "b": 2}

    # An explicit type applied to a Python mapping.
    from_literal = ibis.map({"a": 1, "b": 2}, type="map<string, float>")
    assert con.execute(from_literal) == {"a": 1.0, "b": 2.0}

    # An explicit type applied to an existing map expression.
    from_expr = ibis.map(untyped, type="map<string, float>")
    assert con.execute(from_expr) == {"a": 1.0, "b": 2.0}


@pytest.mark.notimpl(
    ["risingwave"],
    raises=PsycoPg2InternalError,
    reason="function hstore(character varying[], character varying[]) does not exist",
)
@pytest.mark.notyet("postgres", reason="only support maps of string -> string")
def test_map_factory_empty(con):
    """Check that an empty map requires an explicit type."""
    with pytest.raises(TypeError):
        ibis.map({})

    typed_empty = ibis.map({}, type="map<string, string>")
    assert str(typed_empty.type()) == "map<string, string>"
    assert con.execute(typed_empty) == {}


@pytest.mark.notyet(
    "clickhouse", raises=ClickHouseDatabaseError, reason="nested types can't be NULL"
)
def test_map_factory_null(con):
    """Check that ibis.map(None) requires a type and executes to None."""
    with pytest.raises(TypeError):
        ibis.map(None)

    typed_null = ibis.map(None, type="map<string, string>")
    assert str(typed_null.type()) == "map<string, string>"
    assert con.execute(typed_null) is None


@pytest.mark.notimpl(["pandas", "dask"])
def test_map_table(backend):
table = backend.map
Expand Down
33 changes: 33 additions & 0 deletions ibis/backends/tests/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import ibis
import ibis.expr.datatypes as dt
from ibis.backends.tests.errors import (
ClickHouseDatabaseError,
PsycoPg2InternalError,
PsycoPg2SyntaxError,
Py4JJavaError,
Expand All @@ -23,6 +24,38 @@
]


@pytest.mark.notimpl(["postgres"])
def test_struct_factory(con):
    """Check ibis.struct() on Python mappings and on existing struct expressions."""
    untyped = ibis.struct({"a": 1, "b": 2})
    assert con.execute(untyped) == {"a": 1, "b": 2}

    # Feeding a struct expression back into the factory.
    rewrapped = ibis.struct(untyped)
    assert con.execute(rewrapped) == {"a": 1, "b": 2}

    # An explicit type applied to a Python mapping.
    from_literal = ibis.struct(
        {"a": 1, "b": 2}, type="struct<a: float64, b: float64>"
    )
    assert con.execute(from_literal) == {"a": 1.0, "b": 2.0}

    # An explicit type applied to an existing struct expression.
    from_expr = ibis.struct(untyped, type="struct<a: float64, b: float64>")
    assert con.execute(from_expr) == {"a": 1.0, "b": 2.0}


def test_struct_factory_empty(con):
    """Check that an empty struct is rejected, with or without a type."""
    with pytest.raises(TypeError):
        ibis.struct({})

    # Supplying a type does not make an empty struct literal acceptable.
    with pytest.raises(TypeError):
        ibis.struct({}, type="struct<a: float64, b: float64>")


@pytest.mark.broken("polars", raises=AttributeError)
@pytest.mark.notyet(
    "clickhouse", raises=ClickHouseDatabaseError, reason="nested types can't be NULL"
)
def test_struct_factory_null(con):
    """Check that ibis.struct(None) requires a type and executes to None."""
    with pytest.raises(TypeError):
        ibis.struct(None)

    typed_null = ibis.struct(None, type="struct<a: float64, b: float>")
    # The reported type is expected to spell both fields as float64.
    assert str(typed_null.type()) == "struct<a: float64, b: float64>"
    assert con.execute(typed_null) is None


@pytest.mark.notimpl(["dask"])
@pytest.mark.parametrize(
("field", "expected"),
Expand Down
51 changes: 38 additions & 13 deletions ibis/expr/types/arrays.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@

from public import public

import ibis
import ibis.expr.operations as ops
import ibis.expr.types as ir
from ibis.common.deferred import Deferred, deferrable
from ibis.expr.types.generic import Column, Scalar, Value

if TYPE_CHECKING:
from collections.abc import Iterable

import ibis.expr.types as ir
from ibis.expr.types import dt
from ibis.expr.types.typing import V

import ibis.common.exceptions as com
Expand Down Expand Up @@ -1081,7 +1083,10 @@ def __getitem__(self, index: int | ir.IntegerValue | slice) -> ir.Column:

@public
@deferrable
def array(values: Iterable[V]) -> ArrayValue:
def array(
values: ArrayValue | Iterable[V] | None,
type: str | dt.DataType | None = None,
) -> ArrayValue:
"""Create an array expression.
If any values are [column expressions](../concepts/datatypes.qmd) the
Expand All @@ -1092,6 +1097,9 @@ def array(values: Iterable[V]) -> ArrayValue:
----------
values
An iterable of Ibis expressions or Python literals
type
An instance of `ibis.expr.datatypes.DataType` or a string indicating
the Ibis type of the resulting array, e.g. `array<float>`.
Returns
-------
Expand Down Expand Up @@ -1120,15 +1128,32 @@ def array(values: Iterable[V]) -> ArrayValue:
│ [3, 42, ... +1] │
└──────────────────────┘
>>> ibis.array([t.a, 42 + ibis.literal(5)])
┏━━━━━━━━━━━━━━━━━━━━━━┓
┃ Array()
┡━━━━━━━━━━━━━━━━━━━━━━┩
│ array<int64>
├──────────────────────┤
│ [1, 47]
│ [2, 47]
│ [3, 47]
└──────────────────────┘
>>> ibis.array([t.a, 42 + ibis.literal(5)], type="array<float>")
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Cast(Array(), array<float64>)
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
│ array<float64>
├───────────────────────────────
│ [1.0, 47.0]
│ [2.0, 47.0]
│ [3.0, 47.0]
└───────────────────────────────
"""
return ops.Array(tuple(values)).to_expr()
if values is None:
if type is None:
raise TypeError("type must be specified when values is None")
return ibis.literal(None, type=type)

if isinstance(values, ir.ArrayValue):
result = values
else:
values = tuple(values)
if len(values) == 0:
if type is None:
raise TypeError("type must be specified when values empty")
result = ibis.literal([], type=type)
else:
result = ops.Array(values).to_expr()
if type is not None:
result = result.cast(type)
return result
57 changes: 41 additions & 16 deletions ibis/expr/types/maps.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@

from public import public

import ibis
import ibis.expr.operations as ops
import ibis.expr.types as ir
from ibis.common.deferred import deferrable
from ibis.expr.types.generic import Column, Scalar, Value

if TYPE_CHECKING:
from collections.abc import Iterable, Mapping

import ibis.expr.types as ir
from ibis.expr.types.arrays import ArrayColumn
from ibis.expr import datatypes as dt


@public
Expand Down Expand Up @@ -435,8 +436,10 @@ def __getitem__(self, key: ir.Value) -> ir.Column:
@public
@deferrable
def map(
keys: Iterable[Any] | Mapping[Any, Any] | ArrayColumn,
values: Iterable[Any] | ArrayColumn | None = None,
keys: Iterable[Any] | Mapping[Any, Any] | ir.ArrayValue | MapValue | None,
values: Iterable[Any] | ir.ArrayValue | None = None,
*,
type: str | dt.DataType | None = None,
) -> MapValue:
"""Create a MapValue.
Expand All @@ -449,6 +452,9 @@ def map(
Keys of the map or `Mapping`. If `keys` is a `Mapping`, `values` must be `None`.
values
Values of the map or `None`. If `None`, the `keys` argument must be a `Mapping`.
type
An instance of `ibis.expr.datatypes.DataType` or a string indicating
the Ibis type of the resulting map, e.g. `map<string, float>`.
Returns
-------
Expand Down Expand Up @@ -476,16 +482,35 @@ def map(
│ ['a', 'b'] │ [1, 2] │
│ ['b'] │ [3] │
└──────────────────────┴──────────────────────┘
>>> ibis.map(t.keys, t.values)
┏━━━━━━━━━━━━━━━━━━━━━━┓
┃ Map(keys, values)
┡━━━━━━━━━━━━━━━━━━━━━━┩
│ map<string, int64>
├──────────────────────┤
│ {'a': 1, 'b': 2}
│ {'b': 3}
└──────────────────────┘
>>> ibis.map(t.keys, t.values, type="map<string, float>")
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
┃ Map(keys, Cast(values, array<float64>))
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
│ map<string, float64>
├─────────────────────────────────────────
│ {'a': 1.0, 'b': 2.0}
│ {'b': 3.0}
└─────────────────────────────────────────
"""
if values is None:
keys, values = tuple(keys.keys()), tuple(keys.values())
return ops.Map(keys, values).to_expr()
from ibis.expr import datatypes as dt

if keys is None:
if type is None:
raise TypeError("Must specify a type when keys is None")
return ibis.literal(None, type=type)

if isinstance(keys, MapValue):
result = keys
else:
if values is None:
keys, values = tuple(keys.keys()), tuple(keys.values())
type = dt.dtype(type) if type is not None else None
key_type = dt.Array(value_type=type.key_type) if type is not None else None
value_type = dt.Array(value_type=type.value_type) if type is not None else None
keys = ibis.array(keys, type=key_type)
values = ibis.array(values, type=value_type)
result = ops.Map(keys, values).to_expr()

if type is not None:
result = result.cast(type)
return result
Loading

0 comments on commit b83016c

Please sign in to comment.