Skip to content

Commit

Permalink
refine docstring and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
advancedxy committed Dec 19, 2024
1 parent eeb1282 commit 0edf691
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 127 deletions.
74 changes: 66 additions & 8 deletions daft/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2546,28 +2546,86 @@ def intersect(self, other: "DataFrame") -> "DataFrame":
def intersect_all(self, other: "DataFrame") -> "DataFrame":
    """Returns the intersection of two DataFrames, including duplicates.

    Example:
        >>> import daft
        >>> df1 = daft.from_pydict({"a": [1, 2, 2], "b": [4, 6, 6]})
        >>> df2 = daft.from_pydict({"a": [1, 1, 2, 2], "b": [4, 4, 6, 6]})
        >>> df1.intersect_all(df2).collect()
        ╭───────┬───────╮
        │ a     ┆ b     │
        │ ---   ┆ ---   │
        │ Int64 ┆ Int64 │
        ╞═══════╪═══════╡
        │ 1     ┆ 4     │
        ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
        │ 2     ┆ 6     │
        ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
        │ 2     ┆ 6     │
        ╰───────┴───────╯
        <BLANKLINE>
        (Showing first 3 of 3 rows)

    Args:
        other (DataFrame): DataFrame to intersect with

    Returns:
        DataFrame: DataFrame with the intersection of the two DataFrames, including duplicates
    """
    # Delegate to the logical-plan builder and wrap the resulting plan in a new DataFrame.
    return DataFrame(self._builder.intersect_all(other._builder))

@DataframePublicAPI
def except_distinct(self, other: "DataFrame") -> "DataFrame":
    """Returns the set difference of two DataFrames.

    Example:
        >>> import daft
        >>> df1 = daft.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
        >>> df2 = daft.from_pydict({"a": [1, 2, 3], "b": [4, 8, 6]})
        >>> df1.except_distinct(df2).collect()
        ╭───────┬───────╮
        │ a     ┆ b     │
        │ ---   ┆ ---   │
        │ Int64 ┆ Int64 │
        ╞═══════╪═══════╡
        │ 2     ┆ 5     │
        ╰───────┴───────╯
        <BLANKLINE>
        (Showing first 1 of 1 rows)

    Args:
        other (DataFrame): DataFrame to except with

    Returns:
        DataFrame: DataFrame with the set difference of the two DataFrames
    """
    # Delegate to the logical-plan builder and wrap the resulting plan in a new DataFrame.
    builder = self._builder.except_distinct(other._builder)
    return DataFrame(builder)

@DataframePublicAPI
def except_all(self, other: "DataFrame") -> "DataFrame":
    """Returns the set difference of two DataFrames, considering duplicates.

    Example:
        >>> import daft
        >>> df1 = daft.from_pydict({"a": [1, 1, 2, 2], "b": [4, 4, 6, 6]})
        >>> df2 = daft.from_pydict({"a": [1, 2, 2], "b": [4, 6, 6]})
        >>> df1.except_all(df2).collect()
        ╭───────┬───────╮
        │ a     ┆ b     │
        │ ---   ┆ ---   │
        │ Int64 ┆ Int64 │
        ╞═══════╪═══════╡
        │ 1     ┆ 4     │
        ╰───────┴───────╯
        <BLANKLINE>
        (Showing first 1 of 1 rows)

    Args:
        other (DataFrame): DataFrame to except with

    Returns:
        DataFrame: DataFrame with the set difference of the two DataFrames, considering duplicates
    """
    # Delegate to the logical-plan builder and wrap the resulting plan in a new DataFrame.
    builder = self._builder.except_all(other._builder)
    return DataFrame(builder)
Expand Down
12 changes: 11 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import uuid
from typing import Literal
from typing import Literal, Dict, Any

import pandas as pd
import pyarrow as pa
Expand Down Expand Up @@ -183,6 +183,16 @@ def assert_df_equals(
print(f"Failed assertion for col: {col}")
raise

def check_answer(df: daft.DataFrame, expected_answer: Dict[str, Any], is_sorted: bool = False):
    """Assert that ``df`` holds the same data as ``expected_answer``.

    Both sides are converted to pandas and compared with ``assert_df_equals``.

    Args:
        df: DataFrame under test.
        expected_answer: Column name -> values mapping describing the expected data.
        is_sorted: When True, row order must match exactly. Otherwise the comparison
            is made with every column of ``df`` passed as the sort key.
    """
    actual = df.to_pandas()
    expected = daft.from_pydict(expected_answer).to_pandas()
    # When the expected result is empty there is no need to check data types.
    compare_dtypes = not expected.empty

    if is_sorted:
        assert_df_equals(actual, expected, assert_ordering=True, check_dtype=compare_dtypes)
        return

    assert_df_equals(
        actual,
        expected,
        sort_key=df.column_names,
        assert_ordering=False,
        check_dtype=compare_dtypes,
    )

@pytest.fixture(
scope="function",
Expand Down
199 changes: 81 additions & 118 deletions tests/dataframe/test_set_ops.py
Original file line number Diff line number Diff line change
@@ -1,122 +1,85 @@
from __future__ import annotations

import daft
from daft import col


def test_simple_intersect(make_df):
    """Check ``intersect`` and ``intersect_all`` on small single-column inputs."""
    df1 = make_df({"foo": [1, 2, 3]})
    df2 = make_df({"bar": [2, 3, 4]})
    # Sort before comparing so the assertion does not depend on output row order,
    # as the other multi-row assertions in this module do.
    result = df1.intersect(df2).sort(by="foo")
    assert result.to_pydict() == {"foo": [2, 3]}

    df1 = make_df({"foo": [1, 2, 2]})
    df2 = make_df({"bar": [2, 2, 4]})
    # Both expected rows are identical, so ordering cannot affect this comparison.
    result = df1.intersect_all(df2)
    assert result.to_pydict() == {"foo": [2, 2]}


def test_simple_except(make_df):
    """Check ``except_distinct`` and ``except_all`` on small single-column inputs."""
    left = make_df({"foo": [1, 2, 3]})
    right = make_df({"bar": [2, 3, 4]})
    assert left.except_distinct(right).to_pydict() == {"foo": [1]}

    # except_all removes one left occurrence per matching right occurrence.
    left = make_df({"foo": [1, 1, 1, 2, 4, 4]})
    right = make_df({"bar": [1, 2, 2, 4]})
    remaining = left.except_all(right).sort(by="foo")
    assert remaining.to_pydict() == {"foo": [1, 1, 4]}


def test_intersect_with_duplicate(make_df):
    """Check ``intersect`` / ``intersect_all`` when both inputs contain repeated values."""
    df1 = make_df({"foo": [1, 2, 2, 3]})
    df2 = make_df({"bar": [2, 3, 3]})
    # Sort before comparing so the assertions do not depend on output row order,
    # as the other multi-row assertions in this module do.
    result = df1.intersect(df2).sort(by="foo")
    assert result.to_pydict() == {"foo": [2, 3]}
    result = df1.intersect_all(df2).sort(by="foo")
    assert result.to_pydict() == {"foo": [2, 3]}


def test_except_with_duplicate(make_df):
    """Check ``except_distinct`` / ``except_all`` when both inputs contain repeated values."""
    left = make_df({"foo": [1, 2, 2, 3]})
    right = make_df({"bar": [2, 3, 3]})

    distinct_rows = left.except_distinct(right)
    assert distinct_rows.to_pydict() == {"foo": [1]}

    all_rows = left.except_all(right).sort(by="foo")
    assert all_rows.to_pydict() == {"foo": [1, 2]}


def test_self_intersect(make_df):
    """Intersecting a DataFrame with itself should return all of its rows."""
    frame = make_df({"foo": [1, 2, 3]})
    unchanged = {"foo": [1, 2, 3]}
    assert frame.intersect(frame).sort(by="foo").to_pydict() == unchanged
    assert frame.intersect_all(frame).sort(by="foo").to_pydict() == unchanged


def test_self_except(make_df):
    """Subtracting a DataFrame from itself should leave no rows."""
    frame = make_df({"foo": [1, 2, 3]})
    no_rows = {"foo": []}
    assert frame.except_distinct(frame).sort(by="foo").to_pydict() == no_rows
    assert frame.except_all(frame).sort(by="foo").to_pydict() == no_rows

from typing import Any, Dict

def test_intersect_empty(make_df):
    """Intersecting with an empty DataFrame should yield no rows."""
    populated = make_df({"foo": [1, 2, 3]})
    # The empty right-hand column is cast to int64 explicitly.
    empty = make_df({"bar": []}).select(col("bar").cast(daft.DataType.int64()))
    no_rows = {"foo": []}
    assert populated.intersect(empty).to_pydict() == no_rows
    assert populated.intersect_all(empty).to_pydict() == no_rows


def test_except_empty(make_df):
df1 = make_df({"foo": [1, 2, 3]})
import daft
from daft import col, DataFrame
import pytest
from tests.conftest import check_answer

def helper(make_df, op: str, left: Dict[str, Any], right: Dict[str, Any], expected: Dict[str, Any]):
    """Build both inputs with ``make_df`` and check ``op`` applied to them via ``df_helper``.

    Args:
        make_df: Callable that turns a column mapping into a DataFrame.
        op: Name of the set operation to run.
        left: Column data for the left-hand DataFrame.
        right: Column data for the right-hand DataFrame.
        expected: Column data the operation is expected to produce.
    """
    df_helper(op, make_df(left), make_df(right), expected)

def df_helper(op: str, df1: DataFrame, df2: DataFrame, expected: Dict[str, Any]):
    """Apply the set operation named ``op`` to ``df1`` and ``df2`` and check the result.

    Args:
        op: One of "intersect", "except_distinct", "intersect_all" or "except_all".
        df1: Left-hand DataFrame.
        df2: Right-hand DataFrame.
        expected: Column data the operation is expected to produce.

    Raises:
        ValueError: If ``op`` is not one of the supported operation names.
    """
    supported_ops = ("intersect", "except_distinct", "intersect_all", "except_all")
    # Fail loudly on an unknown name instead of silently running except_all,
    # so a typo in a parametrize table cannot test the wrong operation.
    if op not in supported_ops:
        raise ValueError(f"Unsupported set operation {op!r}; expected one of {supported_ops}")
    # Each supported op name matches the DataFrame method of the same name.
    result = getattr(df1, op)(df2)
    check_answer(result, expected)

@pytest.mark.parametrize(
    ("op", "left", "right", "expected"),
    [
        ("intersect", {"foo": [1, 2, 3]}, {"bar": [2, 3, 4]}, {"foo": [2, 3]}),
        ("intersect_all", {"foo": [1, 2, 2]}, {"bar": [2, 2, 4]}, {"foo": [2, 2]}),
        ("except_distinct", {"foo": [1, 2, 3]}, {"bar": [2, 3, 4]}, {"foo": [1]}),
        ("except_all", {"foo": [1, 2, 2]}, {"bar": [2, 4]}, {"foo": [1, 2]}),
    ],
)
def test_simple_intersect_or_except(make_df, op, left, right, expected):
    """Run each set operation on small inputs and compare against the expected rows."""
    helper(make_df, op=op, left=left, right=right, expected=expected)

@pytest.mark.parametrize(
    ("op", "left", "right", "expected"),
    [
        ("intersect", {"foo": [1, 2, 2, 3]}, {"bar": [2, 3, 3]}, {"foo": [2, 3]}),
        ("intersect_all", {"foo": [1, 2, 2, 3]}, {"bar": [2, 3, 3]}, {"foo": [2, 3]}),
        ("except_distinct", {"foo": [1, 2, 2, 3]}, {"bar": [2, 3, 3]}, {"foo": [1]}),
        ("except_all", {"foo": [1, 2, 2, 3]}, {"bar": [2, 3, 3]}, {"foo": [1, 2]}),
    ],
)
def test_with_duplicate(make_df, op, left, right, expected):
    """Run each set operation on inputs where both sides contain repeated values."""
    helper(make_df, op=op, left=left, right=right, expected=expected)

@pytest.mark.parametrize(
    ("op", "df", "expected"),
    [
        ("intersect", {"foo": [1, 2, 3]}, {"foo": [1, 2, 3]}),
        ("intersect_all", {"foo": [1, 2, 3]}, {"foo": [1, 2, 3]}),
        ("except_distinct", {"foo": [1, 2, 3]}, {"foo": []}),
        ("except_all", {"foo": [1, 2, 2]}, {"foo": []}),
    ],
)
def test_with_self(make_df, op, df, expected):
    """Run each set operation with the same DataFrame on both sides."""
    # Keep the built frame under its own name rather than rebinding the parameter.
    frame = make_df(df)
    df_helper(op, frame, frame, expected)

@pytest.mark.parametrize("op, left, expected", [
    ("intersect", {"foo": [1, 2, 3]}, {"foo": []}),
    ("intersect_all", {"foo": [1, 2, 3]}, {"foo": []}),
    ("except_distinct", {"foo": [1, 2, 3]}, {"foo": [1, 2, 3]}),
    ("except_all", {"foo": [1, 2, 2]}, {"foo": [1, 2, 2]}),
])
def test_with_empty(make_df, op, left, expected):
    """Run each set operation against an empty right-hand DataFrame.

    The parametrized ``op`` and ``expected`` drive the check, so every row of the
    table is exercised instead of a fixed pair of except assertions.
    """
    df1 = make_df(left)
    # The empty right-hand column is cast to int64 explicitly
    # (presumably so its type lines up with the integer left column).
    df2 = make_df({"bar": []}).select(col("bar").cast(daft.DataType.int64()))
    df_helper(op, df1, df2, expected)


def test_intersect_with_nulls(make_df):
    """Check ``intersect`` / ``intersect_all`` when one or both inputs contain a null.

    Per the expected values below, a null is kept only when it appears on both sides.
    """
    left_with_null = make_df({"foo": [1, 2, None]})
    left_no_null = make_df({"foo": [1, 2]})
    right_with_null = make_df({"bar": [2, 3, None]})
    right_no_null = make_df({"bar": [2, 3]})

    scenarios = [
        (left_with_null, right_with_null, {"foo": [2, None]}),
        (left_no_null, right_with_null, {"foo": [2]}),
        (left_with_null, right_no_null, {"foo": [2]}),
    ]
    for left, right, expected in scenarios:
        assert left.intersect(right).to_pydict() == expected
        assert left.intersect_all(right).to_pydict() == expected


def test_except_with_nulls(make_df):
    """Check ``except_distinct`` / ``except_all`` when one or both inputs contain a null.

    Per the expected values below, a left-side null survives only when the right
    side has no null.
    """
    left_with_null = make_df({"foo": [1, 2, None]})
    left_no_null = make_df({"foo": [1, 2]})
    right_with_null = make_df({"bar": [2, 3, None]})
    right_no_null = make_df({"bar": [2, 3]})

    scenarios = [
        (left_with_null, right_with_null, {"foo": [1]}),
        (left_no_null, right_with_null, {"foo": [1]}),
        (left_with_null, right_no_null, {"foo": [1, None]}),
    ]
    for left, right, expected in scenarios:
        assert left.except_distinct(right).to_pydict() == expected
        assert left.except_all(right).to_pydict() == expected
df_helper(op, df1, df2, expected)

@pytest.mark.parametrize(
    ("op", "left", "right", "expected"),
    [
        ("intersect", {"foo": [1, 2, None]}, {"foo": [2, 3, None]}, {"foo": [2, None]}),
        ("intersect_all", {"foo": [1, 2, None]}, {"foo": [2, 3, None]}, {"foo": [2, None]}),
        ("intersect", {"foo": [1, 2]}, {"foo": [2, 3, None]}, {"foo": [2]}),
        ("intersect_all", {"foo": [1, 2]}, {"foo": [2, 3, None]}, {"foo": [2]}),
        ("intersect", {"foo": [1, 2, None]}, {"foo": [2, 3]}, {"foo": [2]}),
        ("intersect_all", {"foo": [1, 2, None]}, {"foo": [2, 3]}, {"foo": [2]}),
    ],
)
def test_intersect_with_nulls(make_df, op, left, right, expected):
    """Run the intersect operations on inputs where one or both sides contain a null."""
    helper(make_df, op=op, left=left, right=right, expected=expected)

@pytest.mark.parametrize(
    ("op", "left", "right", "expected"),
    [
        ("except_distinct", {"foo": [1, 2, None]}, {"foo": [2, 3, None]}, {"foo": [1]}),
        ("except_all", {"foo": [1, 2, None]}, {"foo": [2, 3, None]}, {"foo": [1]}),
        ("except_distinct", {"foo": [1, 2]}, {"foo": [2, 3, None]}, {"foo": [1]}),
        ("except_all", {"foo": [1, 2]}, {"foo": [2, 3, None]}, {"foo": [1]}),
        ("except_distinct", {"foo": [1, 2, None]}, {"foo": [2, 3]}, {"foo": [1, None]}),
        ("except_all", {"foo": [1, 2, None]}, {"foo": [2, 3]}, {"foo": [1, None]}),
    ],
)
def test_except_with_nulls(make_df, op, left, right, expected):
    """Run the except operations on inputs where one or both sides contain a null."""
    helper(make_df, op=op, left=left, right=right, expected=expected)

0 comments on commit 0edf691

Please sign in to comment.