unit test

Eventual-Inc · Feb 7, 2024 · e58d3ef · e58d3ef
1 parent 1d81c4d
commit e58d3ef
Showing 1 changed file with 35 additions and 0 deletions.
diff --git a/tests/dataframe/test_creation.py b/tests/dataframe/test_creation.py
@@ -797,6 +797,41 @@ def test_create_dataframe_json_schema_hints_ignore_random_hint(valid_data: list[
         assert len(pd_df) == len(valid_data)
 
 
+def test_create_dataframe_json_schema_hints_large_file() -> None:
+    # Checks that schema hints cover rows past the inference window: assemble data larger than 1MB, because our schema inference will max out at 1MB.
+    item = {"column": {"test_key": "test_value"}}
+    item_size = len(json.dumps(item).encode("utf-8"))  # UTF-8 byte length of one serialized row (newline not counted)
+    entries_needed = (1 * 1024 * 1024) // item_size + 1  # floor + 1, so the repeated rows total more than 1 MiB
+    data = [item] * entries_needed
+
+    # Append one final row that uses a different key; it sits past the 1MB mark, so schema inference should not pick it up
+    data.append({"column": {"TEST_KEY_BOTTOM_OF_FILE": "TEST_VALUE_BOTTOM_OF_FILE"}})
+
+    with create_temp_filename() as fname:
+        with open(fname, "w") as f:
+            for row in data:  # written as newline-delimited JSON: one object per line
+                f.write(json.dumps(row))
+                f.write("\n")
+            f.flush()
+
+        df = daft.read_json(
+            fname,
+            schema_hints={  # the hinted struct names both keys, including the one found only in the last row
+                "column": DataType.struct({"test_key": DataType.string(), "TEST_KEY_BOTTOM_OF_FILE": DataType.string()})
+            },
+        )
+        assert df.schema()["column"].dtype == DataType.struct(
+            {"test_key": DataType.string(), "TEST_KEY_BOTTOM_OF_FILE": DataType.string()}
+        )
+
+        # Once the dataframe is materialized, the schema hints should be enforced: the key/value pair from the last row must come back non-null
+        df = df.select(df["column"].struct.get("TEST_KEY_BOTTOM_OF_FILE"))
+        df = df.where(df["TEST_KEY_BOTTOM_OF_FILE"].not_null()).collect()
+
+        assert len(df) == 1  # only the appended last row carries TEST_KEY_BOTTOM_OF_FILE
+        assert df.to_pydict()["TEST_KEY_BOTTOM_OF_FILE"][0] == "TEST_VALUE_BOTTOM_OF_FILE"
+
+
 @pytest.mark.parametrize(
     "input,expected",
     [