Skip to content

Commit

Permalink
[BUG]Fix missing columns after join (#2321)
Browse files Browse the repository at this point in the history
Closes #2300
  • Loading branch information
sherlockbeard authored May 29, 2024
1 parent 49e3234 commit e069c2d
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 1 deletion.
5 changes: 4 additions & 1 deletion src/daft-plan/src/logical_ops/join.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,16 @@ impl Join {
// but contains bug https://github.com/Eventual-Inc/Daft/issues/1294
let output_schema = {
let left_join_keys = left_on.iter().map(|e| e.name()).collect::<HashSet<_>>();
let right_join_keys = right_on.iter().map(|e| e.name()).collect::<HashSet<_>>();
let left_schema = &left.schema().fields;
let fields = left_schema
.iter()
.map(|(_, field)| field)
.cloned()
.chain(right.schema().fields.iter().filter_map(|(rname, rfield)| {
if left_join_keys.contains(rname.as_str()) {
if left_join_keys.contains(rname.as_str())
&& right_join_keys.contains(rname.as_str())
{
right_input_mapping.insert(rname.clone(), rname.clone());
None
} else if left_schema.contains_key(rname) {
Expand Down
17 changes: 17 additions & 0 deletions tests/dataframe/test_joins.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,23 @@ def test_invalid_join_strategies(make_df):
df.join(df, on="A", strategy="broadcast", how="outer")


def test_columns_after_join(make_df):
df1 = make_df(
{
"A": [1, 2, 3],
},
)

df2 = make_df({"A": [1, 2, 3], "B": [1, 2, 3]})

joined_df1 = df1.join(df2, left_on="A", right_on="B")
joined_df2 = df1.join(df2, left_on="A", right_on="A")

assert set(joined_df1.schema().column_names()) == set(["A", "B", "right.A"])

assert set(joined_df2.schema().column_names()) == set(["A", "B"])


@pytest.mark.parametrize("n_partitions", [1, 2, 4])
@pytest.mark.parametrize(
"join_strategy",
Expand Down

0 comments on commit e069c2d

Please sign in to comment.