Skip to content

Commit

Permalink
[FEAT]: add coalesce to dataframe and SQL (#3482)
Browse files Browse the repository at this point in the history
  • Loading branch information
universalmind303 authored Dec 5, 2024
1 parent 3eeba09 commit 78c738f
Show file tree
Hide file tree
Showing 13 changed files with 399 additions and 7 deletions.
3 changes: 2 additions & 1 deletion daft/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def refresh_logger() -> None:
from daft.dataframe import DataFrame
from daft.logical.schema import Schema
from daft.datatype import DataType, TimeUnit
from daft.expressions import Expression, col, lit, interval
from daft.expressions import Expression, col, lit, interval, coalesce
from daft.io import (
DataCatalogTable,
DataCatalogType,
Expand Down Expand Up @@ -135,4 +135,5 @@ def refresh_logger() -> None:
"sql",
"sql_expr",
"to_struct",
"coalesce",
]
1 change: 1 addition & 0 deletions daft/daft/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1176,6 +1176,7 @@ def minhash(
seed: int = 1,
hash_function: Literal["murmurhash3", "xxhash", "sha1"] = "murmurhash3",
) -> PyExpr: ...
def coalesce(exprs: list[PyExpr]) -> PyExpr: ...

# -----
# SQL functions
Expand Down
4 changes: 2 additions & 2 deletions daft/expressions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from __future__ import annotations

from .expressions import Expression, ExpressionsProjection, col, lit, interval
from .expressions import Expression, ExpressionsProjection, col, lit, interval, coalesce

__all__ = ["Expression", "ExpressionsProjection", "col", "lit", "interval"]
__all__ = ["Expression", "ExpressionsProjection", "col", "lit", "interval", "coalesce"]
31 changes: 31 additions & 0 deletions daft/expressions/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,37 @@ def interval(
return Expression._from_pyexpr(lit_value)


def coalesce(*args: Expression) -> Expression:
"""Returns the first non-null value in a list of expressions. If all inputs are null, returns null.
Example:
>>> import daft
>>> df = daft.from_pydict({"x": [1, None, 3], "y": [None, 2, None]})
>>> df = df.with_column("first_valid", daft.coalesce(df["x"], df["y"]))
>>> df.show()
╭───────┬───────┬─────────────╮
│ x ┆ y ┆ first_valid │
│ --- ┆ --- ┆ --- │
│ Int64 ┆ Int64 ┆ Int64 │
╞═══════╪═══════╪═════════════╡
│ 1 ┆ None ┆ 1 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ None ┆ 2 ┆ 2 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ None ┆ 3 │
╰───────┴───────┴─────────────╯
<BLANKLINE>
(Showing first 3 of 3 rows)
Args:
*args: Two or more expressions to coalesce
Returns:
Expression: Expression containing first non-null value encountered when evaluating arguments in order
"""
return Expression._from_pyexpr(native.coalesce([arg._expr for arg in args]))


class Expression:
_expr: _PyExpr = None # type: ignore

Expand Down
1 change: 0 additions & 1 deletion src/daft-core/src/array/from_iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@ impl BinaryArray {
.unwrap()
}
}

impl FixedSizeBinaryArray {
pub fn from_iter<S: AsRef<[u8]>>(
name: &str,
Expand Down
297 changes: 297 additions & 0 deletions src/daft-functions/src/coalesce.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
use common_error::{DaftError, DaftResult};
use daft_core::{
prelude::{BooleanArray, DaftLogical, Field, Schema},
series::{IntoSeries, Series},
utils::supertype::try_get_supertype,
};
use daft_dsl::{
functions::{ScalarFunction, ScalarUDF},
ExprRef,
};
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct Coalesce {}

#[typetag::serde]
impl ScalarUDF for Coalesce {
fn as_any(&self) -> &dyn std::any::Any {
self
}

fn name(&self) -> &'static str {
"coalesce"
}

fn to_field(&self, inputs: &[ExprRef], schema: &Schema) -> DaftResult<Field> {
match inputs {
[] => Err(DaftError::SchemaMismatch(
"Expected at least 1 input args, got 0".to_string(),
)),
[input] => {
let input_field = input.to_field(schema)?;
Ok(input_field)
}
_ => {
let first_field = inputs[0].to_field(schema)?;
let mut output_dtype = first_field.dtype.clone();

for input in inputs {
let lhs = input.to_field(schema)?.dtype;
let rhs = &first_field.dtype;
output_dtype = try_get_supertype(&lhs, rhs)?;

if try_get_supertype(&lhs, rhs).is_err() {
return Err(DaftError::SchemaMismatch(format!(
"All input fields must have the same data type. Got {lhs} and {rhs}"
)));
}
}
Ok(Field::new(first_field.name, output_dtype))
}
}
}

fn evaluate(&self, inputs: &[Series]) -> DaftResult<Series> {
match inputs.len() {
0 => Err(DaftError::ComputeError("No inputs provided".to_string())),
1 => Ok(inputs[0].clone()),
_ => {
let name = inputs[0].name();
let dtype = inputs[0].data_type();
let len = inputs[0].len();
// the first input is not null, so no work to do
if inputs[0].validity().is_none() {
return Ok(inputs[0].clone());
}

let mut current_value = Series::full_null(name, dtype, len);
let remainder = BooleanArray::from_values(name, vec![true; len].into_iter());
let all_false = BooleanArray::from_values(name, vec![false; len].into_iter());
let mut remainder = remainder.into_series();

for input in inputs {
let to_apply = remainder.and(&input.not_null()?)?;
current_value = input.if_else(&current_value, &to_apply)?;

remainder = remainder.and(&input.is_null()?)?;

// exit early if all values are filled
if remainder.bool().unwrap() == &all_false {
break;
}
}

Ok(current_value.rename(name))
}
}
}
}

#[must_use]
/// Coalesce returns the first non-null value in a list of expressions.
/// Returns the first non-null value from a sequence of expressions.
///
/// # Arguments
/// * `inputs` - A vector of expressions to evaluate in order
pub fn coalesce(inputs: Vec<ExprRef>) -> ExprRef {
ScalarFunction::new(Coalesce {}, inputs).into()
}

#[cfg(test)]
mod tests {
use common_error::DaftError;
use daft_core::{
prelude::{DataType, Field, FullNull, Int8Array, Schema, Utf8Array},
series::{IntoSeries, Series},
};
use daft_dsl::{col, functions::ScalarUDF, lit, null_lit};

#[test]
fn test_coalesce_0() {
let s0 = Int8Array::from_iter(
Field::new("s0", DataType::Int8),
vec![None, None, Some(10), Some(11), None].into_iter(),
)
.into_series();
let s1 = Int8Array::from_iter(
Field::new("s1", DataType::Int8),
vec![None, Some(2), Some(3), None, None].into_iter(),
)
.into_series();
let s2 = Int8Array::from_iter(
Field::new("s2", DataType::Int8),
vec![None, Some(1), Some(4), Some(4), Some(10)].into_iter(),
)
.into_series();

let coalesce = super::Coalesce {};
let output = coalesce.evaluate(&[s0, s1, s2]).unwrap();
let actual = output.i8().unwrap();
let expected = Int8Array::from_iter(
Field::new("s0", DataType::Int8),
vec![None, Some(2), Some(10), Some(11), Some(10)].into_iter(),
);

assert_eq!(actual, &expected);
}

#[test]
fn test_coalesce_1() {
let s0 = Int8Array::from_iter(
Field::new("s0", DataType::Int8),
vec![None, None, Some(10), Some(11), None].into_iter(),
)
.into_series();

let s1 = Int8Array::from_iter(
Field::new("s1", DataType::Int8),
vec![None, Some(2), Some(3), None, None].into_iter(),
)
.into_series();

let coalesce = super::Coalesce {};
let output = coalesce.evaluate(&[s0, s1]).unwrap();
let actual = output.i8().unwrap();
let expected = Int8Array::from_iter(
Field::new("s0", DataType::Int8),
vec![None, Some(2), Some(10), Some(11), None].into_iter(),
);

assert_eq!(actual, &expected);
}

#[test]
fn test_coalesce_no_args() {
let coalesce = super::Coalesce {};
let output = coalesce.evaluate(&[]);

assert!(output.is_err());
}

#[test]
fn test_coalesce_one_arg() {
let s0 = Int8Array::from_iter(
Field::new("s0", DataType::Int8),
vec![None, None, Some(10), Some(11), None].into_iter(),
)
.into_series();

let coalesce = super::Coalesce {};
let output = coalesce.evaluate(&[s0.clone()]).unwrap();
// can't directly compare as null != null
let output = output.i8().unwrap();
let s0 = s0.i8().unwrap();
assert_eq!(output, s0);
}

#[test]
fn test_coalesce_full_nulls() {
let s0 = Series::full_null("s0", &DataType::Utf8, 100);
let s1 = Series::full_null("s1", &DataType::Utf8, 100);
let s2 = Series::full_null("s2", &DataType::Utf8, 100);

let coalesce = super::Coalesce {};
let output = coalesce.evaluate(&[s0, s1, s2]).unwrap();
let actual = output.utf8().unwrap();
let expected = Utf8Array::full_null("s0", &DataType::Utf8, 100);

assert_eq!(actual, &expected);
}

#[test]
fn test_coalesce_with_mismatched_types() {
let s0 = Int8Array::from_iter(
Field::new("s0", DataType::Int8),
vec![None, None, Some(10), Some(11), None].into_iter(),
)
.into_series();
let s1 = Int8Array::from_iter(
Field::new("s1", DataType::Int8),
vec![None, Some(2), Some(3), None, None].into_iter(),
)
.into_series();
let s2 = Utf8Array::from_iter(
"s2",
vec![
None,
Some("hello"),
Some("world"),
Some("hello"),
Some("world"),
]
.into_iter(),
)
.into_series();

let coalesce = super::Coalesce {};
let output = coalesce.evaluate(&[s0, s1, s2]);

let expected = Utf8Array::from_iter(
"s2",
vec![None, Some("2"), Some("10"), Some("11"), Some("world")].into_iter(),
);
assert_eq!(output.unwrap().utf8().unwrap(), &expected);
}

#[test]
fn test_to_field() {
let col_0 = null_lit().alias("s0");
let fallback = lit(0);

let schema = Schema::new(vec![
Field::new("s0", DataType::Int32),
Field::new("s1", DataType::Int32),
])
.unwrap();
let expected = Field::new("s0", DataType::Int32);

let coalesce = super::Coalesce {};
let output = coalesce.to_field(&[col_0, fallback], &schema).unwrap();
assert_eq!(output, expected);
}

#[test]
fn test_to_field_with_mismatched_types() {
let col_0 = col("s0");
let col_1 = col("s1");
let fallback = lit("not found");

let schema = Schema::new(vec![
Field::new("s0", DataType::Int8),
Field::new("s1", DataType::Int8),
Field::new("s2", DataType::Utf8),
])
.unwrap();
let expected = Field::new("s0", DataType::Utf8);

let coalesce = super::Coalesce {};
let output = coalesce
.to_field(&[col_0, col_1, fallback], &schema)
.unwrap();
assert_eq!(output, expected);
}

#[test]
fn test_to_field_with_incompatible_types() {
let col_0 = col("s0");
let col_1 = col("s1");
let col_2 = lit(1u32);

let schema = Schema::new(vec![
Field::new("s0", DataType::Date),
Field::new("s1", DataType::Boolean),
Field::new("s2", DataType::UInt32),
]);
let expected = "could not determine supertype of Boolean and Date".to_string();
let coalesce = super::Coalesce {};
let DaftError::TypeError(e) = coalesce
.to_field(&[col_0, col_1, col_2], &schema.unwrap())
.unwrap_err()
else {
panic!("Expected error")
};

assert_eq!(e, expected);
}
}
1 change: 1 addition & 0 deletions src/daft-functions/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#![feature(async_closure)]
pub mod coalesce;
pub mod count_matches;
pub mod distance;
pub mod float;
Expand Down
7 changes: 7 additions & 0 deletions src/daft-functions/src/python/coalesce.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
use daft_dsl::python::PyExpr;
use pyo3::pyfunction;

#[pyfunction]
pub fn coalesce(exprs: Vec<PyExpr>) -> PyExpr {
crate::coalesce::coalesce(exprs.into_iter().map(|expr| expr.into()).collect()).into()
}
Loading

0 comments on commit 78c738f

Please sign in to comment.