Skip to content

Commit

Permalink
[BUG] Schema hints not working properly for json reads (#1845)
Browse files Browse the repository at this point in the history
Schema hints were not being propagated, leading to fields being dropped.

This stemmed from an issue when reading large jsons from s3 where the
fields changed only late into the file, so schema inference doesn't pick
it up.
  • Loading branch information
colin-ho authored Feb 9, 2024
1 parent ed8daaf commit 573ea12
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/daft-micropartition/src/micropartition.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ fn materialize_scan_task(
column_names
.as_ref()
.map(|cols| cols.iter().map(|col| col.to_string()).collect()),
None,
Some(scan_task.schema.clone()),
scan_task.pushdowns.filters.clone(),
);
let parse_options = JsonParseOptions::new_internal();
Expand Down
30 changes: 30 additions & 0 deletions tests/dataframe/test_creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,36 @@ def test_create_dataframe_json_schema_hints_ignore_random_hint(valid_data: list[
assert len(pd_df) == len(valid_data)


def test_create_dataframe_json_schema_hints_two_files() -> None:
with create_temp_filename() as fname, create_temp_filename() as fname2:
with open(fname, "w") as f:
f.write(json.dumps({"foo": {"bar": "baz"}}))
f.write("\n")
f.flush()

with open(fname2, "w") as f:
f.write(json.dumps({"foo": {"bar2": "baz2"}}))
f.write("\n")
f.flush()

# Without schema hints, schema inference should not pick up bar2
assert daft.read_json([fname, fname2]).schema()["foo"].dtype == DataType.struct({"bar": DataType.string()})

# With schema hints, bar2 should be included
df = daft.read_json(
[fname, fname2],
schema_hints={"foo": DataType.struct({"bar": DataType.string(), "bar2": DataType.string()})},
)
assert df.schema()["foo"].dtype == DataType.struct({"bar": DataType.string(), "bar2": DataType.string()})

# When dataframe is materialized, the schema hints should be enforced and bar2 should be included
df = df.select(df["foo"].struct.get("bar2"))
df = df.where(df["bar2"].not_null()).collect()

assert len(df) == 1
assert df.to_pydict()["bar2"][0] == "baz2"


@pytest.mark.parametrize(
"input,expected",
[
Expand Down

0 comments on commit 573ea12

Please sign in to comment.