Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEAT] add list.value_counts() #2902

Merged
merged 25 commits into from
Oct 7, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[env]
PYO3_PYTHON = "./.venv/bin/python"
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions daft/daft/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1273,6 +1273,7 @@ def dt_truncate(expr: PyExpr, interval: str, relative_to: PyExpr) -> PyExpr: ...
# ---
def explode(expr: PyExpr) -> PyExpr: ...
def list_sort(expr: PyExpr, desc: PyExpr) -> PyExpr: ...
def list_value_counts(expr: PyExpr) -> PyExpr: ...
def list_join(expr: PyExpr, delimiter: PyExpr) -> PyExpr: ...
def list_count(expr: PyExpr, mode: CountMode) -> PyExpr: ...
def list_get(expr: PyExpr, idx: PyExpr, default: PyExpr) -> PyExpr: ...
Expand Down
61 changes: 46 additions & 15 deletions daft/expressions/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2922,6 +2922,37 @@ def join(self, delimiter: str | Expression) -> Expression:
delimiter_expr = Expression._to_expression(delimiter)
return Expression._from_pyexpr(native.list_join(self._expr, delimiter_expr._expr))

# todo: do we want type to be a Map expression? how should we do this?
andrewgazelka marked this conversation as resolved.
Show resolved Hide resolved
def value_counts(self) -> Expression:
"""Counts the occurrences of each unique value in the list.

desmondcheongzx marked this conversation as resolved.
Show resolved Hide resolved
Returns:
Expression: A Map<X, UInt64> expression where the keys are unique elements from the
original list of type X, and the values are UInt64 counts representing
the number of times each element appears in the list.

Example:
>>> import daft
>>> df = daft.from_pydict({"letters": [["a", "b", "a"], ["b", "c", "b", "c"]]})
>>> df.with_column("value_counts", df["letters"].list.value_counts()).collect()
╭──────────────┬───────────────────╮
│ letters ┆ value_counts │
│ --- ┆ --- │
│ List[Utf8] ┆ Map[Utf8: UInt64] │
╞══════════════╪═══════════════════╡
│ [a, b, a] ┆ [{key: a, │
│ ┆ value: 2, │
│ ┆ }, {key: … │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ [b, c, b, c] ┆ [{key: b, │
│ ┆ value: 2, │
│ ┆ }, {key: … │
╰──────────────┴───────────────────╯
<BLANKLINE>
(Showing first 2 of 2 rows)
"""
return Expression._from_pyexpr(native.list_value_counts(self._expr))

def count(self, mode: CountMode = CountMode.Valid) -> Expression:
"""Counts the number of elements in each list

Expand Down Expand Up @@ -3069,21 +3100,21 @@ def get(self, key: Expression) -> Expression:
>>> df = daft.from_arrow(pa.table({"map_col": pa_array}))
>>> df = df.with_column("a", df["map_col"].map.get("a"))
>>> df.show()
╭──────────────────────────────────────┬───────╮
│ map_col ┆ a │
│ --- ┆ --- │
│ Map[Struct[key: Utf8, value: Int64]] ┆ Int64 │
╞══════════════════════════════════════╪═══════╡
│ [{key: a, ┆ 1 │
│ value: 1, ┆ │
│ }] ┆ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ [] ┆ None │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ [{key: b, ┆ None │
│ value: 2, ┆ │
│ }] ┆ │
╰──────────────────────────────────────┴───────╯
╭──────────────────┬───────╮
│ map_col ┆ a │
│ --- ┆ --- │
│ Map[Utf8: Int64] ┆ Int64 │
╞══════════════════╪═══════╡
│ [{key: a, ┆ 1 │
│ value: 1, ┆ │
│ }] ┆ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ [] ┆ None │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ [{key: b, ┆ None │
│ value: 2, ┆ │
│ }] ┆ │
╰──────────────────┴───────╯
<BLANKLINE>
(Showing first 3 of 3 rows)

Expand Down
10 changes: 8 additions & 2 deletions src/arrow2/src/array/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,12 +209,18 @@ impl<O: Offset> ListArray<O> {
if O::IS_LARGE {
match data_type.to_logical_type() {
DataType::LargeList(child) => Ok(child.as_ref()),
_ => Err(Error::oos("ListArray<i64> expects DataType::LargeList")),
got => {
let msg = format!("ListArray<i64> expects DataType::LargeList, but got {got:?}");
Err(Error::oos(msg))
},
}
} else {
match data_type.to_logical_type() {
DataType::List(child) => Ok(child.as_ref()),
_ => Err(Error::oos("ListArray<i32> expects DataType::List")),
got => {
let msg = format!("ListArray<i32> expects DataType::List, but got {got:?}");
Err(Error::oos(msg))
},
}
}
}
Expand Down
67 changes: 55 additions & 12 deletions src/arrow2/src/array/map/mod.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
use super::{new_empty_array, specification::try_check_offsets_bounds, Array, ListArray};
use crate::{
bitmap::Bitmap,
datatypes::{DataType, Field},
error::Error,
offset::OffsetsBuffer,
};

use super::{new_empty_array, specification::try_check_offsets_bounds, Array};

mod ffi;
pub(super) mod fmt;
mod iterator;
#[allow(unused)]
pub use iterator::*;


/// An array representing a (key, value), both of arbitrary logical types.
#[derive(Clone)]
pub struct MapArray {
Expand Down Expand Up @@ -41,20 +41,27 @@ impl MapArray {
try_check_offsets_bounds(&offsets, field.len())?;

let inner_field = Self::try_get_field(&data_type)?;
if let DataType::Struct(inner) = inner_field.data_type() {
if inner.len() != 2 {
return Err(Error::InvalidArgumentError(
"MapArray's inner `Struct` must have 2 fields (keys and maps)".to_string(),
));
}
} else {

let inner_data_type = inner_field.data_type();
let DataType::Struct(inner) = inner_data_type else {
return Err(Error::InvalidArgumentError(
"MapArray expects `DataType::Struct` as its inner logical type".to_string(),
format!("MapArray expects `DataType::Struct` as its inner logical type, but found {inner_data_type:?}"),
));
};

let inner_len = inner.len();
if inner_len != 2 {
let msg = format!(
"MapArray's inner `Struct` must have 2 fields (keys and maps), but found {} fields",
inner_len
);
return Err(Error::InvalidArgumentError(msg));
}
if field.data_type() != inner_field.data_type() {

let field_data_type = field.data_type();
if field_data_type != inner_field.data_type() {
return Err(Error::InvalidArgumentError(
"MapArray expects `field.data_type` to match its inner DataType".to_string(),
format!("MapArray expects `field.data_type` to match its inner DataType, but found \n{field_data_type:?}\nvs\n\n\n{inner_field:?}"),
desmondcheongzx marked this conversation as resolved.
Show resolved Hide resolved
));
}

Expand Down Expand Up @@ -195,6 +202,42 @@ impl MapArray {
impl Array for MapArray {
impl_common_array!();

fn convert_logical_type(&self, target: DataType) -> Box<dyn Array> {
let outer_is_map = matches!(target, DataType::Map { .. });

if outer_is_map {
// we can do simple conversion
andrewgazelka marked this conversation as resolved.
Show resolved Hide resolved
let mut new = self.to_boxed();
new.change_type(target);
return new;
}

let DataType::LargeList(target_inner) = &target else {
panic!("MapArray can only be converted to Map or LargeList");
};

let DataType::Map(current_inner, _) = self.data_type() else {
unreachable!("Somehow DataType is not Map for a MapArray");
};

let current_inner_physical = current_inner.data_type.to_physical_type();
let target_inner_physical = target_inner.data_type.to_physical_type();

if current_inner_physical != target_inner_physical {
panic!("inner types are not equal");
}

let mut field = self.field.clone();
field.change_type(target_inner.data_type.clone());

let offsets = self.offsets().clone();
let offsets = unsafe { offsets.map_unchecked(|offset| offset as i64) };

let list = ListArray::new(target, offsets, field, self.validity.clone());

Box::new(list)
}

fn validity(&self) -> Option<&Bitmap> {
self.validity.as_ref()
}
Expand Down
Loading
Loading