ibis-project · NickCrews · Jun 29, 2024 · Jun 29, 2024 · cpcloud · Jun 30, 2024
diff --git a/ibis/backends/pandas/executor.py b/ibis/backends/pandas/executor.py
@@ -49,7 +49,9 @@ def visit(cls, op: ops.Node, **kwargs):
 
     @classmethod
     def visit(cls, op: ops.Literal, value, dtype):
-        if dtype.is_interval():
+        if value is None:
+            value = None
+        elif dtype.is_interval():
             value = pd.Timedelta(value, dtype.unit.short)
         elif dtype.is_array():
             value = np.array(value)
@@ -219,6 +221,11 @@ def visit(cls, op: ops.FindInSet, needle, values):
         result = np.select(condlist, choicelist, default=-1)
         return pd.Series(result, name=op.name)
 
+    @classmethod
+    def visit(cls, op: ops.EmptyArray, dtype):
+        pdt = PandasType.from_ibis(dtype)
+        return np.array([], dtype=pdt)
+
     @classmethod
     def visit(cls, op: ops.Array, exprs):
         return cls.rowwise(lambda row: np.array(row, dtype=object), exprs)

diff --git a/ibis/backends/polars/compiler.py b/ibis/backends/polars/compiler.py
@@ -87,12 +87,7 @@ def literal(op, **_):
     value = op.value
     dtype = op.dtype
 
-    if dtype.is_array():
-        value = pl.Series("", value)
-        typ = PolarsType.from_ibis(dtype)
-        val = pl.lit(value, dtype=typ)
-        return val.implode()
-    elif dtype.is_struct():
+    if dtype.is_struct():
         values = [
             pl.lit(v, dtype=PolarsType.from_ibis(dtype[k])).alias(k)
             for k, v in value.items()
@@ -106,7 +101,7 @@ def literal(op, **_):
         return pl.lit(value)
     else:
         typ = PolarsType.from_ibis(dtype)
-        return pl.lit(op.value, dtype=typ)
+        return pl.lit(value, dtype=typ)
 
 
 _TIMESTAMP_SCALE_TO_UNITS = {
@@ -973,6 +968,12 @@ def array_concat(op, **kw):
     return result
 
 
+@translate.register(ops.EmptyArray)
+def empty_array(op, **kw):
+    pdt = PolarsType.from_ibis(op.dtype)
+    return pl.lit([], dtype=pdt)
+
+
 @translate.register(ops.Array)
 def array_column(op, **kw):
     cols = [translate(col, **kw) for col in op.exprs]

diff --git a/ibis/backends/sql/compiler.py b/ibis/backends/sql/compiler.py
@@ -1019,6 +1019,9 @@ def visit_InSubquery(self, op, *, rel, needle):
             query = sg.select(STAR).from_(query)
         return needle.isin(query=query)
 
+    def visit_EmptyArray(self, op, *, dtype):
+        return self.cast(self.f.array(), dtype)
+
     def visit_Array(self, op, *, exprs):
         return self.f.array(*exprs)
 

diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py
@@ -31,6 +31,7 @@
     PySparkAnalysisException,
     TrinoUserError,
 )
+from ibis.common.annotations import ValidationError
 from ibis.common.collections import frozendict
 
 pytestmark = [
@@ -72,6 +73,71 @@
 # list.
 
 
+def test_array_factory(con):
+    a = ibis.array([1, 2, 3])
+    assert a.type() == dt.Array(value_type=dt.Int8)
+    assert con.execute(a) == [1, 2, 3]
+
+    a2 = ibis.array(a)
+    assert a.type() == dt.Array(value_type=dt.Int8)
+    assert con.execute(a2) == [1, 2, 3]
+
+
+def test_array_factory_typed(con):
+    typed = ibis.array([1, 2, 3], type="array<string>")
+    assert con.execute(typed) == ["1", "2", "3"]
+
+    typed2 = ibis.array(ibis.array([1, 2, 3]), type="array<string>")
+    assert con.execute(typed2) == ["1", "2", "3"]
+
+
+@pytest.mark.notimpl("flink", raises=Py4JJavaError)
+def test_array_factory_empty(con):
+    with pytest.raises(ValidationError):
+        ibis.array([])
+
+    empty_typed = ibis.array([], type="array<string>")
+    assert empty_typed.type() == dt.Array(value_type=dt.string)
+    assert con.execute(empty_typed) == []
+
+
+@pytest.mark.notyet(
+    "clickhouse", raises=ClickHouseDatabaseError, reason="nested types can't be NULL"
+)
+def test_array_factory_null(con):
+    with pytest.raises(ValidationError):
+        ibis.array(None)
+    with pytest.raises(ValidationError):
+        ibis.array(None, type="int64")
+    none_typed = ibis.array(None, type="array<string>")
+    assert none_typed.type() == dt.Array(value_type=dt.string)
+    assert con.execute(none_typed) is None
+
+    nones = ibis.array([None, None], type="array<string>")
+    assert nones.type() == dt.Array(value_type=dt.string)
+    assert con.execute(nones) == [None, None]
+
+    # Execute a real value here, so the backends that don't support arrays
+    # actually xfail as we expect them to.
+    # Otherwise would have to @mark.xfail every test in this file besides this one.
+    assert con.execute(ibis.array([1, 2])) == [1, 2]
+
+
+@pytest.mark.broken(
+    ["datafusion", "flink", "polars"],
+    raises=AssertionError,
+    reason="[None, 1] executes to [np.nan, 1.0]",
+)
+def test_array_factory_null_mixed(con):
+    none_and_val = ibis.array([None, 1])
+    assert none_and_val.type() == dt.Array(value_type=dt.Int8)
+    assert con.execute(none_and_val) == [None, 1]
+
+    none_and_val_typed = ibis.array([None, 1], type="array<string>")
+    assert none_and_val_typed.type() == dt.Array(value_type=dt.String)
+    assert con.execute(none_and_val_typed) == [None, "1"]
+
+
 def test_array_column(backend, alltypes, df):
     expr = ibis.array(
         [alltypes["double_col"], alltypes["double_col"], 5.0, ibis.literal(6.0)]
@@ -1356,7 +1422,7 @@ def test_unnest_range(con):
                 pytest.mark.notyet(["bigquery"], raises=GoogleBadRequest),
                 pytest.mark.broken(
                     ["polars"],
-                    reason="expression input not supported with nested arrays",
+                    reason="upstream polars bug: https://github.com/pola-rs/polars/issues/17294",
                     raises=TypeError,
                 ),
             ],

diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py
@@ -1431,13 +1431,12 @@ def query(t, group_cols):
     snapshot.assert_match(str(ibis.to_sql(t3, dialect=con.name)), "out.sql")
 
 
-@pytest.mark.notimpl(["oracle", "exasol"], raises=com.OperationNotDefinedError)
-@pytest.mark.notimpl(["druid"], raises=AssertionError)
 @pytest.mark.notyet(
-    ["datafusion", "impala", "mssql", "mysql", "sqlite"],
+    ["datafusion", "exasol", "impala", "mssql", "mysql", "oracle", "sqlite"],
     reason="backend doesn't support arrays and we don't implement pivot_longer with unions yet",
-    raises=com.OperationNotDefinedError,
+    raises=(com.OperationNotDefinedError, com.UnsupportedBackendType),
 )
+@pytest.mark.notimpl(["druid"], raises=AssertionError)
 @pytest.mark.broken(
     ["trino"],
     reason="invalid code generated for unnesting a struct",

diff --git a/ibis/backends/tests/test_sql.py b/ibis/backends/tests/test_sql.py
@@ -30,8 +30,18 @@
             ibis.struct(dict(abc=432)),
             marks=[
                 pytest.mark.never(
-                    ["impala", "mysql", "sqlite", "mssql", "exasol"],
-                    raises=(NotImplementedError, exc.UnsupportedBackendType),
+                    [
+                        "exasol",
+                        "impala",
+                        "mysql",
+                        "sqlite",
+                        "mssql",
+                    ],
+                    raises=(
+                        exc.OperationNotDefinedError,
+                        NotImplementedError,
+                        exc.UnsupportedBackendType,
+                    ),
                     reason="structs not supported in the backend",
                 ),
                 pytest.mark.notimpl(
@@ -104,7 +114,7 @@ def test_isin_bug(con, snapshot):
 @pytest.mark.notyet(
     ["datafusion", "exasol", "oracle", "flink", "risingwave"],
     reason="no unnest support",
-    raises=exc.OperationNotDefinedError,
+    raises=(exc.OperationNotDefinedError, exc.UnsupportedBackendType),
 )
 @pytest.mark.notyet(
     ["sqlite", "mysql", "druid", "impala", "mssql"], reason="no unnest support upstream"

diff --git a/ibis/backends/tests/test_string.py b/ibis/backends/tests/test_string.py
@@ -835,18 +835,18 @@ def test_capitalize(con, inp, expected):
         assert pd.isnull(result)
 
 
+@pytest.mark.never(
+    ["exasol", "impala", "mssql", "mysql", "sqlite"],
+    reason="Backend doesn't support arrays",
+    raises=(com.OperationNotDefinedError, com.UnsupportedBackendType),
+)
 @pytest.mark.notimpl(
     [
         "dask",
         "pandas",
         "polars",
         "oracle",
         "flink",
-        "sqlite",
-        "mssql",
-        "mysql",
-        "exasol",
-        "impala",
     ],
     raises=com.OperationNotDefinedError,
 )

diff --git a/ibis/expr/operations/arrays.py b/ibis/expr/operations/arrays.py
@@ -2,27 +2,34 @@
 
 from __future__ import annotations
 
-from typing import Optional
+from typing import Annotated, Optional
 
 from public import public
 
 import ibis.expr.datashape as ds
 import ibis.expr.datatypes as dt
 import ibis.expr.rules as rlz
 from ibis.common.annotations import attribute
+from ibis.common.patterns import Length  # noqa: TCH001
 from ibis.common.typing import VarTuple  # noqa: TCH001
 from ibis.expr.operations.core import Unary, Value
 
 
+@public
+class EmptyArray(Value):
+    """Construct an array with 0 elements."""
+
+    dtype: dt.Array
+    shape = ds.scalar
+
+
 @public
 class Array(Value):
-    """Construct an array."""
+    """Construct an array with 1+ elements. Use `EmptyArray` for empty arrays."""
 
-    exprs: VarTuple[Value]
+    exprs: Annotated[VarTuple[Value], Length(at_least=1)]
 
-    @attribute
-    def shape(self):
-        return rlz.highest_precedence_shape(self.exprs)
+    shape = rlz.shape_like("exprs")
 
     @attribute
     def dtype(self):

diff --git a/ibis/expr/types/arrays.py b/ibis/expr/types/arrays.py
@@ -5,16 +5,17 @@
 
 from public import public
 
+import ibis
+import ibis.expr.datatypes as dt
 import ibis.expr.operations as ops
+import ibis.expr.types as ir
+from ibis.common.annotations import ValidationError
 from ibis.common.deferred import Deferred, deferrable
 from ibis.expr.types.generic import Column, Scalar, Value
 
 if TYPE_CHECKING:
     from collections.abc import Callable, Iterable
 
-    import ibis.expr.types as ir
-    from ibis.expr.types.typing import V
-
 import ibis.common.exceptions as com
 
 
@@ -1067,7 +1068,11 @@
 
 @public
 @deferrable
-def array(values: Iterable[V]) -> ArrayValue:
+def array(
+    values: ArrayValue | Iterable | ir.NullValue | None,
+    *,
+    type: str | dt.DataType | None = None,
+) -> ArrayValue:
     """Create an array expression.
 
     If any values are [column expressions](../concepts/datatypes.qmd) the
@@ -1078,6 +1083,9 @@
     ----------
     values
         An iterable of Ibis expressions or Python literals
+    type
+        An instance of `ibis.expr.datatypes.DataType` or a string indicating
+        the Ibis type of `value`. eg `array<float>`.
 
     Returns
     -------
@@ -1108,15 +1116,49 @@
     │ [3, 42, ... +1]      │
     └──────────────────────┘
 
-    >>> ibis.array([t.a, 42 + ibis.literal(5)])
+    >>> ibis.array([t.a, 42 + ibis.literal(5)], type="array<float>")
     ┏━━━━━━━━━━━━━━━━━━━━━━┓
     ┃ Array()              ┃
     ┡━━━━━━━━━━━━━━━━━━━━━━┩
-    │ array<int64>         │
+    │ array<float64>       │
     ├──────────────────────┤
-    │ [1, 47]              │
-    │ [2, 47]              │
-    │ [3, 47]              │
+    │ [1.0, 47.0]          │
+    │ [2.0, 47.0]          │
+    │ [3.0, 47.0]          │
     └──────────────────────┘
     """
-    return ops.Array(tuple(values)).to_expr()
+    type = dt.dtype(type) if type is not None else None
+    if type is not None and not isinstance(type, dt.Array):
+        raise ValidationError(f"type must be an array, got {type}")
+
+    if isinstance(values, ir.Value):
+        if type is not None:
+            return values.cast(type)
+        elif isinstance(values, ArrayValue):
+            return values
+        else:
+            raise ValidationError(
+                f"If no type passed, values must be an array, got {values.type()}"
+            )
+
+    if values is None:
+        if type is None:
+            raise ValidationError("If values is None/NULL, type must be provided")
+        return ir.null(type)
+
+    values = tuple(values)
+    if len(values) == 0:
-    if len(values) == 0:
+    if not values:
-    if len(values) == 0:
+    if not values:
+        if type is None:
+            raise ValidationError("If values is empty, type must be provided")
+        return ops.EmptyArray(type).to_expr()
+    else:
+        value_type = type.value_type if type is not None else None
+        values = [_value(v, value_type) for v in values]
+        return ops.Array(values).to_expr()
+
+
+def _value(x, type) -> ir.Value:
+    if isinstance(x, (ir.Value, Deferred)):
+        return x.cast(type) if type is not None else x
+    else:
+        return ibis.literal(x, type=type)