Skip to content

Commit

Permalink
unit test
Browse files Browse the repository at this point in the history
  • Loading branch information
colin-ho committed Feb 7, 2024
1 parent 1d81c4d commit e58d3ef
Showing 1 changed file with 35 additions and 0 deletions.
35 changes: 35 additions & 0 deletions tests/dataframe/test_creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,41 @@ def test_create_dataframe_json_schema_hints_ignore_random_hint(valid_data: list[
assert len(pd_df) == len(valid_data)


def test_create_dataframe_json_schema_hints_large_file() -> None:
    """Check that schema hints cover a struct key found only at the end of a large JSON file.

    The file is built to exceed 1MB (the cap on schema inference), with every row
    sharing one struct key except a final row that uses a different key. The hinted
    struct type must appear in the schema, and the final row's value must be
    readable (non-null) once the dataframe is materialized.
    """
    # Schema inference maxes out at 1MB, so generate just over 1MB worth of identical rows.
    common_row = {"column": {"test_key": "test_value"}}
    row_bytes = len(json.dumps(common_row).encode("utf-8"))
    num_rows = (1 * 1024 * 1024) // row_bytes + 1
    rows = [common_row] * num_rows

    # The last row carries a different key, placed at the bottom of the file so that
    # schema inference doesn't pick it up.
    rows.append({"column": {"TEST_KEY_BOTTOM_OF_FILE": "TEST_VALUE_BOTTOM_OF_FILE"}})

    struct_fields = {
        "test_key": DataType.string(),
        "TEST_KEY_BOTTOM_OF_FILE": DataType.string(),
    }

    with create_temp_filename() as fname:
        with open(fname, "w") as f:
            # One JSON document per line.
            f.writelines(json.dumps(row) + "\n" for row in rows)
            f.flush()

        hinted = daft.read_json(
            fname,
            schema_hints={"column": DataType.struct(struct_fields)},
        )
        assert hinted.schema()["column"].dtype == DataType.struct(struct_fields)

        # Once materialized, the hints should be enforced: exactly one row (the last one)
        # has a non-null value for the bottom-of-file key.
        bottom_key = hinted.select(hinted["column"].struct.get("TEST_KEY_BOTTOM_OF_FILE"))
        non_null = bottom_key.where(bottom_key["TEST_KEY_BOTTOM_OF_FILE"].not_null()).collect()

        assert len(non_null) == 1
        assert non_null.to_pydict()["TEST_KEY_BOTTOM_OF_FILE"][0] == "TEST_VALUE_BOTTOM_OF_FILE"


@pytest.mark.parametrize(
"input,expected",
[
Expand Down

0 comments on commit e58d3ef

Please sign in to comment.