diff --git a/tests/dataframe/test_creation.py b/tests/dataframe/test_creation.py index ed93d18451..c178f01864 100644 --- a/tests/dataframe/test_creation.py +++ b/tests/dataframe/test_creation.py @@ -797,6 +797,41 @@ def test_create_dataframe_json_schema_hints_ignore_random_hint(valid_data: list[ assert len(pd_df) == len(valid_data) +def test_create_dataframe_json_schema_hints_large_file() -> None: + # First assemble data that will be larger than 1MB, because our schema inference will max out at 1MB. + item = {"column": {"test_key": "test_value"}} + item_size = len(json.dumps(item).encode("utf-8")) + entries_needed = (1 * 1024 * 1024) // item_size + 1 + data = [item] * entries_needed + + # Add a row at the end of the file with a different key to ensure that the schema inference doesn't pick it up + data.append({"column": {"TEST_KEY_BOTTOM_OF_FILE": "TEST_VALUE_BOTTOM_OF_FILE"}}) + + with create_temp_filename() as fname: + with open(fname, "w") as f: + for row in data: + f.write(json.dumps(row)) + f.write("\n") + f.flush() + + df = daft.read_json( + fname, + schema_hints={ + "column": DataType.struct({"test_key": DataType.string(), "TEST_KEY_BOTTOM_OF_FILE": DataType.string()}) + }, + ) + assert df.schema()["column"].dtype == DataType.struct( + {"test_key": DataType.string(), "TEST_KEY_BOTTOM_OF_FILE": DataType.string()} + ) + + # When dataframe is materialized, the schema hints should be enforced and the key value pair at the bottom should not be null + df = df.select(df["column"].struct.get("TEST_KEY_BOTTOM_OF_FILE")) + df = df.where(df["TEST_KEY_BOTTOM_OF_FILE"].not_null()).collect() + + assert len(df) == 1 + assert df.to_pydict()["TEST_KEY_BOTTOM_OF_FILE"][0] == "TEST_VALUE_BOTTOM_OF_FILE" + + @pytest.mark.parametrize( "input,expected", [