[EXPRESSIONS] Implement Expression.float.is_inf (#2371)

Resolves #2316. Signed-off-by: Tai Le Manh <[email protected]>
Eventual-Inc · Jun 13, 2024 · 1b2973f · 1b2973f
1 parent 647ec43
commit 1b2973f
Show file tree

Hide file tree

Showing 15 changed files with 195 additions and 0 deletions.
diff --git a/daft/daft.pyi b/daft/daft.pyi
@@ -1042,6 +1042,7 @@ class PyExpr:
     def __hash__(self) -> int: ...
     def __reduce__(self) -> tuple: ...
     def is_nan(self) -> PyExpr: ...
+    def is_inf(self) -> PyExpr: ...
     def dt_date(self) -> PyExpr: ...
     def dt_day(self) -> PyExpr: ...
     def dt_hour(self) -> PyExpr: ...
@@ -1207,6 +1208,7 @@ class PySeries:
     def utf8_ilike(self, pattern: PySeries) -> PySeries: ...
     def utf8_substr(self, start: PySeries, length: PySeries | None = None) -> PySeries: ...
     def is_nan(self) -> PySeries: ...
+    def is_inf(self) -> PySeries: ...
     def dt_date(self) -> PySeries: ...
     def dt_day(self) -> PySeries: ...
     def dt_hour(self) -> PySeries: ...

diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py
@@ -820,6 +820,21 @@ def is_nan(self) -> Expression:
         """
         return Expression._from_pyexpr(self._expr.is_nan())
 
+    def is_inf(self) -> Expression:
+        """Checks if values in the Expression are positive or negative infinity.
+
+        .. NOTE::
+            Nulls will be propagated! I.e. this operation will return a null for null values.
+
+        Example:
+            >>> # [-float("inf"), 0., float("inf"), None] -> [True, False, True, None]
+            >>> col("x").float.is_inf()
+
+        Returns:
+            Expression: Boolean Expression indicating whether values are positive or negative infinity.
+        """
+        return Expression._from_pyexpr(self._expr.is_inf())
+
 
 class ExpressionDatetimeNamespace(ExpressionNamespace):
     def date(self) -> Expression:

diff --git a/daft/series.py b/daft/series.py
@@ -635,6 +635,9 @@ class SeriesFloatNamespace(SeriesNamespace):
     def is_nan(self) -> Series:
         return Series._from_pyseries(self._series.is_nan())
 
+    def is_inf(self) -> Series:
+        """Returns a Series built from the underlying series' ``is_inf`` result (infinity check)."""
+        return Series._from_pyseries(self._series.is_inf())
+
 
 class SeriesStringNamespace(SeriesNamespace):
     def endswith(self, suffix: Series) -> Series:

diff --git a/docs/source/api_docs/expressions.rst b/docs/source/api_docs/expressions.rst
@@ -146,6 +146,20 @@ The following methods are available under the ``expr.str`` attribute.
    Expression.str.ilike
    Expression.str.substr
 
+.. _api-float-expression-operations:
+
+Floats
+#######
+
+The following methods are available under the ``expr.float`` attribute.
+
+.. autosummary::
+   :nosignatures:
+   :toctree: doc_gen/expression_methods
+   :template: autosummary/accessor_method.rst
+
+   Expression.float.is_inf
+
 .. _api-expressions-temporal:
 
 Temporal

diff --git a/src/daft-core/src/array/ops/float.rs b/src/daft-core/src/array/ops/float.rs
@@ -5,6 +5,7 @@ use crate::{
 use common_error::DaftResult;
 use num_traits::Float;
 
+use super::DaftIsInf;
 use super::DaftIsNan;
 
 use super::as_arrow::AsArrow;
@@ -38,3 +39,32 @@ impl DaftIsNan for DataArray<NullType> {
         )))
     }
 }
+
+/// Infinity check for floating-point arrays.
+///
+/// Every value is tested with `is_infinite`, and the source array's validity
+/// bitmap is cloned onto the boolean result.
+impl<T> DaftIsInf for DataArray<T>
+where
+    T: DaftFloatType,
+    <T as DaftNumericType>::Native: Float,
+{
+    type Output = DaftResult<DataArray<BooleanType>>;
+
+    fn is_inf(&self) -> Self::Output {
+        let arrow_array = self.as_arrow();
+        // Compute the flags over the values first; validity is attached afterwards.
+        let result_arrow_array = arrow2::array::BooleanArray::from_trusted_len_values_iter(
+            arrow_array.values_iter().map(|v| v.is_infinite()),
+        )
+        .with_validity(arrow_array.validity().cloned());
+        // The result keeps the name of the input array.
+        Ok(BooleanArray::from((self.name(), result_arrow_array)))
+    }
+}
+
+/// Infinity check for null-typed arrays: the result has the same length and
+/// name as the input, and every slot is null.
+impl DaftIsInf for DataArray<NullType> {
+    type Output = DaftResult<DataArray<BooleanType>>;
+
+    fn is_inf(&self) -> Self::Output {
+        // `new_null` creates zeroed value and validity bitmaps directly, so no
+        // temporary `Vec<bool>` buffers have to be allocated and bit-packed.
+        Ok(BooleanArray::from((
+            self.name(),
+            arrow2::array::BooleanArray::new_null(
+                arrow2::datatypes::DataType::Boolean,
+                self.len(),
+            ),
+        )))
+    }
+}
diff --git a/src/daft-core/src/array/ops/mod.rs b/src/daft-core/src/array/ops/mod.rs
@@ -120,6 +120,11 @@ pub trait DaftIsNan {
     fn is_nan(&self) -> Self::Output;
 }
 
+/// Trait for containers that can report which of their values are infinite.
+pub trait DaftIsInf {
+    /// Type returned by [`DaftIsInf::is_inf`]; chosen by each implementation.
+    type Output;
+    /// Runs the infinity check on `self`.
+    fn is_inf(&self) -> Self::Output;
+}
+
 pub type VecIndices = Vec<u64>;
 pub type GroupIndices = Vec<VecIndices>;
 pub type GroupIndicesPair = (VecIndices, GroupIndices);

diff --git a/src/daft-core/src/python/series.rs b/src/daft-core/src/python/series.rs
@@ -461,6 +461,10 @@ impl PySeries {
         Ok(self.series.is_nan()?.into())
     }
 
+    /// Calls `is_inf` on the wrapped series and converts the result back into
+    /// a `PySeries`; a failure is propagated through `?` as the `PyResult` error.
+    pub fn is_inf(&self) -> PyResult<Self> {
+        Ok(self.series.is_inf()?.into())
+    }
+
     pub fn dt_date(&self) -> PyResult<Self> {
         Ok(self.series.dt_date()?.into())
     }

diff --git a/src/daft-core/src/series/ops/float.rs b/src/daft-core/src/series/ops/float.rs
@@ -11,4 +11,11 @@ impl Series {
             Ok(DaftIsNan::is_nan(self.downcast::<<$T as DaftDataType>::ArrayType>()?)?.into_series())
         })
     }
+
+    /// Runs the infinity check on this series and returns the result as a new series.
+    ///
+    /// Dispatches on `self.data_type()` through
+    /// `with_match_float_and_null_daft_types!`, downcasts to the matching array
+    /// type and delegates to that array's `DaftIsInf` implementation.
+    pub fn is_inf(&self) -> DaftResult<Series> {
+        use crate::array::ops::DaftIsInf;
+        with_match_float_and_null_daft_types!(self.data_type(), |$T| {
+            Ok(DaftIsInf::is_inf(self.downcast::<<$T as DaftDataType>::ArrayType>()?)?.into_series())
+        })
+    }
 }
diff --git a/src/daft-dsl/src/functions/float/is_inf.rs b/src/daft-dsl/src/functions/float/is_inf.rs
@@ -0,0 +1,51 @@
+use daft_core::{
+    datatypes::{DataType, Field},
+    schema::Schema,
+    series::Series,
+};
+
+use crate::ExprRef;
+
+use crate::functions::FunctionExpr;
+use common_error::{DaftError, DaftResult};
+
+use super::super::FunctionEvaluator;
+
+/// Evaluator backing the float `is_inf` function expression.
+pub(super) struct IsInfEvaluator {}
+
+impl FunctionEvaluator for IsInfEvaluator {
+    fn fn_name(&self) -> &'static str {
+        "is_inf"
+    }
+
+    /// Resolves the output field: a `Boolean` field carrying the input field's name.
+    ///
+    /// # Errors
+    /// - `DaftError::SchemaMismatch` unless exactly one input is supplied.
+    /// - Whatever error resolving the input against `schema` produces.
+    /// - `DaftError::TypeError` when the input is neither `Float32` nor `Float64`.
+    fn to_field(&self, inputs: &[ExprRef], schema: &Schema, _: &FunctionExpr) -> DaftResult<Field> {
+        match inputs {
+            [data] => {
+                // Propagate resolution failures directly instead of re-wrapping them.
+                let data_field = data.to_field(schema)?;
+                match &data_field.dtype {
+                    // NOTE(review): Float16 is currently left out of the accepted
+                    // types; confirm whether it should be enabled.
+                    DataType::Float32 | DataType::Float64 => {
+                        Ok(Field::new(data_field.name, DataType::Boolean))
+                    }
+                    _ => Err(DaftError::TypeError(format!(
+                        "Expects input to is_inf to be float, but received {data_field}",
+                    ))),
+                }
+            }
+            _ => Err(DaftError::SchemaMismatch(format!(
+                "Expected 1 input args, got {}",
+                inputs.len()
+            ))),
+        }
+    }
+
+    /// Evaluates `is_inf` on the single input series.
+    ///
+    /// # Errors
+    /// `DaftError::ValueError` unless exactly one input is supplied; otherwise
+    /// whatever `Series::is_inf` returns.
+    fn evaluate(&self, inputs: &[Series], _: &FunctionExpr) -> DaftResult<Series> {
+        match inputs {
+            [data] => data.is_inf(),
+            _ => Err(DaftError::ValueError(format!(
+                "Expected 1 input args, got {}",
+                inputs.len()
+            ))),
+        }
+    }
+}
diff --git a/src/daft-dsl/src/functions/float/mod.rs b/src/daft-dsl/src/functions/float/mod.rs
@@ -1,5 +1,7 @@
+mod is_inf;
 mod is_nan;
 
+use is_inf::IsInfEvaluator;
 use is_nan::IsNanEvaluator;
 use serde::{Deserialize, Serialize};
 
@@ -10,6 +12,7 @@ use super::FunctionEvaluator;
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
 pub enum FloatExpr {
     IsNan,
+    IsInf,
 }
 
 impl FloatExpr {
@@ -18,6 +21,7 @@ impl FloatExpr {
         use FloatExpr::*;
         match self {
             IsNan => &IsNanEvaluator {},
+            IsInf => &IsInfEvaluator {},
         }
     }
 }
@@ -29,3 +33,11 @@ pub fn is_nan(data: ExprRef) -> ExprRef {
     }
     .into()
 }
+
+/// Builds a function expression that applies `FloatExpr::IsInf` to `data`.
+pub fn is_inf(data: ExprRef) -> ExprRef {
+    Expr::Function {
+        func: super::FunctionExpr::Float(FloatExpr::IsInf),
+        inputs: vec![data],
+    }
+    .into()
+}
diff --git a/src/daft-dsl/src/python.rs b/src/daft-dsl/src/python.rs
@@ -475,6 +475,11 @@ impl PyExpr {
         Ok(is_nan(self.into()).into())
     }
 
+    /// Wraps this expression in the float `is_inf` function and returns the
+    /// resulting expression.
+    pub fn is_inf(&self) -> PyResult<Self> {
+        use functions::float::is_inf;
+        Ok(is_inf(self.into()).into())
+    }
+
     pub fn dt_date(&self) -> PyResult<Self> {
         use functions::temporal::date;
         Ok(date(self.into()).into())

diff --git a/tests/expressions/test_expressions.py b/tests/expressions/test_expressions.py
@@ -264,6 +264,13 @@ def test_float_is_nan() -> None:
     assert output == "is_nan(col(a))"
 
 
+def test_float_is_inf() -> None:
+    # The repr of a float.is_inf() expression should read as a call on the column.
+    a = col("a")
+    c = a.float.is_inf()
+    output = repr(c)
+    assert output == "is_inf(col(a))"
+
+
 def test_date_lit_post_epoch() -> None:
     d = lit(date(2022, 1, 1))
     output = repr(d)

diff --git a/tests/expressions/typing/test_float.py b/tests/expressions/typing/test_float.py
@@ -12,3 +12,12 @@ def test_float_is_nan(unary_data_fixture):
         run_kernel=unary_data_fixture.float.is_nan,
         resolvable=unary_data_fixture.datatype() in (DataType.float32(), DataType.float64()),
     )
+
+
+def test_float_is_inf(unary_data_fixture):
+    # is_inf is expected to resolve only for float32 / float64 inputs; the helper
+    # is given both the expression and the series kernel for the same fixture.
+    assert_typing_resolve_vs_runtime_behavior(
+        data=[unary_data_fixture],
+        expr=col(unary_data_fixture.name()).float.is_inf(),
+        run_kernel=unary_data_fixture.float.is_inf,
+        resolvable=unary_data_fixture.datatype() in (DataType.float32(), DataType.float64()),
+    )
diff --git a/tests/series/test_float.py b/tests/series/test_float.py
@@ -28,3 +28,27 @@ def test_float_is_nan_all_null() -> None:
     s = Series.from_arrow(pa.array([None, None, None]))
     result = s.float.is_nan()
     assert result.to_pylist() == [None, None, None]
+
+
+def test_float_is_inf() -> None:
+    # Both negative and positive infinity are flagged; a finite value is not.
+    s = Series.from_arrow(pa.array([-float("inf"), 0.0, np.inf]))
+    result = s.float.is_inf()
+    assert result.to_pylist() == [True, False, True]
+
+
+def test_float_is_inf_with_nulls() -> None:
+    # Null inputs come back as None; the remaining slots are evaluated normally.
+    s = Series.from_arrow(pa.array([-np.inf, None, 1.0, None, float("inf")]))
+    result = s.float.is_inf()
+    assert result.to_pylist() == [True, None, False, None, True]
+
+
+def test_float_is_inf_empty() -> None:
+    # An empty float64 array yields an empty result.
+    s = Series.from_arrow(pa.array([], type=pa.float64()))
+    result = s.float.is_inf()
+    assert result.to_pylist() == []
+
+
+def test_float_is_inf_all_null() -> None:
+    # No explicit type is given, so pyarrow infers the null type for this array;
+    # this covers the non-float, all-null input.
+    s = Series.from_arrow(pa.array([None, None, None]))
+    result = s.float.is_inf()
+    assert result.to_pylist() == [None, None, None]
diff --git a/tests/table/test_filter.py b/tests/table/test_filter.py
@@ -212,6 +212,13 @@ def test_table_float_is_nan() -> None:
     assert result_table.to_pydict() == {"a": [False, True, False, None, True]}
 
 
+def test_table_float_is_inf() -> None:
+    table = MicroPartition.from_pydict({"a": [-np.inf, 0.0, None, float("inf")]})
+    result_table = table.eval_expression_list([col("a").float.is_inf()])
+    # Note that null entries are propagated as nulls rather than reported as True or False.
+    assert result_table.to_pydict() == {"a": [True, False, None, True]}
+
+
 def test_table_if_else() -> None:
     table = MicroPartition.from_arrow(
         pa.Table.from_pydict({"ones": [1, 1, 1], "zeros": [0, 0, 0], "pred": [True, False, None]})