From 3693c2285071e7fd8e16e78ce90c02ea359921da Mon Sep 17 00:00:00 2001 From: Clark Zinzow Date: Tue, 5 Dec 2023 18:25:38 -0800 Subject: [PATCH] [FEAT] [JSON Reader] Add native streaming + parallel JSON reader. (#1679) This PR adds a streaming + parallel JSON reader, with full support for most fundamental dtypes (sans decimal and binary types), arbitrary nesting with JSON lists and objects, including nulls at all levels of the JSON object tree. ## TODOs - [x] Add schema inference unit test for dtype coverage (i.e. reading the `dtypes.jsonl` file). - [x] Add temporal type inference + parsing test coverage. - [x] Benchmarking + performance audit: this reader follows the same general concurrency + parallelism model of the streaming CSV reader, which performs relatively well for cloud reads, but there's bound to be a lot of low-hanging fruit around unnecessary copies. - [ ] (Follow-up?) Add thorough parsing and dtype inference unit tests on in-memory defined JSON strings. - [ ] (Follow-up) Support for decimal and (large) binary types. - [ ] (Follow-up) Add support for strict parsing, i.e. returning an error instead of falling back to a null value when parsing fails. - [ ] (Follow-up) Misc. bugs in Arrow2 that should be fixed and upstreamed. - [ ] (Follow-up) Deflate compression support. --- Cargo.lock | 88 +- Cargo.toml | 8 +- daft/daft.pyi | 69 + daft/execution/execution_step.py | 4 + daft/io/_json.py | 13 +- daft/logical/schema.py | 20 +- daft/table/micropartition.py | 24 + daft/table/schema_inference.py | 12 +- daft/table/table.py | 34 +- daft/table/table_io.py | 32 +- src/daft-compression/Cargo.toml | 9 + src/daft-compression/src/compression.rs | 66 + src/daft-compression/src/lib.rs | 4 + src/daft-csv/Cargo.toml | 1 + src/daft-csv/src/lib.rs | 1 - src/daft-csv/src/metadata.rs | 3 +- src/daft-csv/src/read.rs | 113 +- src/daft-decoding/Cargo.toml | 3 + src/daft-decoding/src/deserialize.rs | 34 +- src/daft-decoding/src/inference.rs | 49 +- src/daft-json/Cargo.toml | 48 + src/daft-json/src/decoding.rs | 537 ++++++++ src/daft-json/src/inference.rs | 233 ++++ src/daft-json/src/lib.rs | 73 + src/daft-json/src/options.rs | 212 +++ src/daft-json/src/python.rs | 70 + src/daft-json/src/read.rs | 1218 +++++++++++++++++ src/daft-json/src/schema.rs | 449 ++++++ src/daft-json/test/dtypes.jsonl | 4 + src/daft-json/test/iris_tiny.jsonl | 20 + src/daft-json/test/iris_tiny.jsonl.br | Bin 0 -> 176 bytes src/daft-json/test/iris_tiny.jsonl.bz2 | Bin 0 -> 228 bytes src/daft-json/test/iris_tiny.jsonl.gz | Bin 0 -> 258 bytes src/daft-json/test/iris_tiny.jsonl.lzma | Bin 0 -> 202 bytes src/daft-json/test/iris_tiny.jsonl.xz | Bin 0 -> 248 bytes src/daft-json/test/iris_tiny.jsonl.zl | Bin 0 -> 230 bytes src/daft-json/test/iris_tiny.jsonl.zst | Bin 0 -> 228 bytes .../test/iris_tiny_all_null_column.jsonl | 6 + .../test/iris_tiny_conflicting_dtypes.jsonl | 2 + src/daft-json/test/iris_tiny_nulls.jsonl | 6 + src/daft-micropartition/Cargo.toml | 1 + src/daft-micropartition/src/micropartition.rs | 78 +- src/daft-micropartition/src/python.rs | 28 + src/daft-plan/src/optimization/optimizer.rs | 8 +- .../optimization/rules/drop_repartition.rs | 2 +- .../optimization/rules/push_down_filter.rs | 30 +- .../src/optimization/rules/push_down_limit.rs | 23 +- .../rules/push_down_projection.rs | 14 +- src/daft-plan/src/test/mod.rs | 22 +- src/daft-scan/Cargo.toml | 1 + src/daft-scan/src/file_format.rs | 29 +- src/daft-scan/src/glob.rs | 32 +- src/daft-scan/src/python.rs | 12 - src/lib.rs | 1 + tests/dataframe/test_creation.py | 27 +- 
tests/table/table_io/test_json.py | 152 +- 56 files changed, 3657 insertions(+), 268 deletions(-) create mode 100644 src/daft-compression/Cargo.toml create mode 100644 src/daft-compression/src/compression.rs create mode 100644 src/daft-compression/src/lib.rs create mode 100644 src/daft-json/Cargo.toml create mode 100644 src/daft-json/src/decoding.rs create mode 100644 src/daft-json/src/inference.rs create mode 100644 src/daft-json/src/lib.rs create mode 100644 src/daft-json/src/options.rs create mode 100644 src/daft-json/src/python.rs create mode 100644 src/daft-json/src/read.rs create mode 100644 src/daft-json/src/schema.rs create mode 100644 src/daft-json/test/dtypes.jsonl create mode 100644 src/daft-json/test/iris_tiny.jsonl create mode 100644 src/daft-json/test/iris_tiny.jsonl.br create mode 100644 src/daft-json/test/iris_tiny.jsonl.bz2 create mode 100644 src/daft-json/test/iris_tiny.jsonl.gz create mode 100644 src/daft-json/test/iris_tiny.jsonl.lzma create mode 100644 src/daft-json/test/iris_tiny.jsonl.xz create mode 100644 src/daft-json/test/iris_tiny.jsonl.zl create mode 100644 src/daft-json/test/iris_tiny.jsonl.zst create mode 100644 src/daft-json/test/iris_tiny_all_null_column.jsonl create mode 100644 src/daft-json/test/iris_tiny_conflicting_dtypes.jsonl create mode 100644 src/daft-json/test/iris_tiny_nulls.jsonl diff --git a/Cargo.lock b/Cargo.lock index d1c64e69b2..cccda23a76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -104,8 +104,8 @@ dependencies = [ [[package]] name = "arrow2" -version = "0.17.1" -source = "git+https://github.com/Eventual-Inc/arrow2?rev=3a23c780d4d59ef0c9c8751675480e07f4e1c311#3a23c780d4d59ef0c9c8751675480e07f4e1c311" +version = "0.17.4" +source = "git+https://github.com/Eventual-Inc/arrow2?rev=d5685eebf1d65c3f3d854370ad39f93dcd91971a#d5685eebf1d65c3f3d854370ad39f93dcd91971a" dependencies = [ "ahash", "arrow-format", @@ -124,7 +124,9 @@ dependencies = [ "futures", "getrandom 0.2.10", "hash_hasher", - "hashbrown 0.13.2", + "hashbrown 0.14.1", + "indexmap 1.9.3", + "json-deserializer 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", "lexical-core", "multiversion", "num-traits", @@ -1099,10 +1101,12 @@ name = "daft" version = "0.1.10" dependencies = [ "common-daft-config", + "daft-compression", "daft-core", "daft-csv", "daft-dsl", "daft-io", + "daft-json", "daft-micropartition", "daft-parquet", "daft-plan", @@ -1115,6 +1119,15 @@ dependencies = [ "tikv-jemallocator", ] +[[package]] +name = "daft-compression" +version = "0.1.10" +dependencies = [ + "async-compression", + "tokio", + "url", +] + [[package]] name = "daft-core" version = "0.1.10" @@ -1159,6 +1172,7 @@ dependencies = [ "chrono-tz", "common-error", "csv-async", + "daft-compression", "daft-core", "daft-decoding", "daft-io", @@ -1184,11 +1198,14 @@ name = "daft-decoding" version = "0.1.10" dependencies = [ "arrow2", + "async-compression", "chrono", "chrono-tz", "csv-async", "lexical-core", "simdutf8", + "tokio", + "url", ] [[package]] @@ -1252,6 +1269,43 @@ dependencies = [ "url", ] +[[package]] +name = "daft-json" +version = "0.1.10" +dependencies = [ + "arrow2", + "async-compat", + "async-compression", + "async-stream", + "bincode", + "bytes", + "chrono", + "chrono-tz", + "common-error", + "daft-compression", + "daft-core", + "daft-decoding", + "daft-io", + "daft-table", + "futures", + "indexmap 2.1.0", + "json-deserializer 0.4.4 (git+https://github.com/Eventual-Inc/json-deserializer?rev=4be9205900888ff0767e0c37642884f2c912e22c)", + "lexical-core", + "log", + "num-traits", + "pyo3", + 
"pyo3-log", + "rayon", + "rstest", + "serde", + "simdutf8", + "snafu", + "tokio", + "tokio-stream", + "tokio-util", + "url", +] + [[package]] name = "daft-micropartition" version = "0.1.10" @@ -1263,6 +1317,7 @@ dependencies = [ "daft-csv", "daft-dsl", "daft-io", + "daft-json", "daft-parquet", "daft-scan", "daft-stats", @@ -1338,6 +1393,7 @@ dependencies = [ "daft-csv", "daft-dsl", "daft-io", + "daft-json", "daft-parquet", "daft-stats", "daft-table", @@ -1829,17 +1885,14 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" -[[package]] -name = "hashbrown" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" - [[package]] name = "hashbrown" version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dfda62a12f55daeae5015f81b0baea145391cb4520f86c248fc615d72640d12" +dependencies = [ + "ahash", +] [[package]] name = "heck" @@ -2120,6 +2173,23 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json-deserializer" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f63b421e16eb4100beb677af56f0b4f3a4f08bab74ef2af079ce5bb92c2683f" +dependencies = [ + "indexmap 1.9.3", +] + +[[package]] +name = "json-deserializer" +version = "0.4.4" +source = "git+https://github.com/Eventual-Inc/json-deserializer?rev=4be9205900888ff0767e0c37642884f2c912e22c#4be9205900888ff0767e0c37642884f2c912e22c" +dependencies = [ + "indexmap 2.1.0", +] + [[package]] name = "jsonwebtoken" version = "8.3.0" diff --git a/Cargo.toml b/Cargo.toml index ba30d0ca8a..4e8f5450f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,9 +1,11 @@ [dependencies] common-daft-config = {path = "src/common/daft-config", default-features = false} +daft-compression = {path = "src/daft-compression", default-features = false} daft-core = {path = "src/daft-core", default-features = false} daft-csv = {path = "src/daft-csv", default-features = false} daft-dsl = {path = "src/daft-dsl", default-features = false} daft-io = {path = "src/daft-io", default-features = false} +daft-json = {path = "src/daft-json", default-features = false} daft-micropartition = {path = "src/daft-micropartition", default-features = false} daft-parquet = {path = "src/daft-parquet", default-features = false} daft-plan = {path = "src/daft-plan", default-features = false} @@ -25,6 +27,7 @@ python = [ "daft-plan/python", "daft-parquet/python", "daft-csv/python", + "daft-json/python", "daft-micropartition/python", "daft-scan/python", "daft-stats/python", @@ -76,6 +79,7 @@ members = [ "src/daft-io", "src/daft-parquet", "src/daft-csv", + "src/daft-json", "src/daft-dsl", "src/daft-table", "src/daft-plan", @@ -108,10 +112,10 @@ tokio-util = "0.7.8" url = "2.4.0" [workspace.dependencies.arrow2] -# branch = "jay/fix-parquet-timezone-parsing" +# branch = "daft-fork" git = "https://github.com/Eventual-Inc/arrow2" package = "arrow2" -rev = "3a23c780d4d59ef0c9c8751675480e07f4e1c311" +rev = "d5685eebf1d65c3f3d854370ad39f93dcd91971a" [workspace.dependencies.bincode] version = "1.3.3" diff --git a/daft/daft.pyi b/daft/daft.pyi index bb037b955a..0128dedafe 100644 --- a/daft/daft.pyi +++ b/daft/daft.pyi @@ -211,6 +211,15 @@ class JsonSourceConfig: Configuration of a JSON data source. 
""" + buffer_size: int | None + chunk_size: int | None + + def __init__( + self, + buffer_size: int | None = None, + chunk_size: int | None = None, + ): ... + class FileFormatConfig: """ Configuration for parsing a particular file format (Parquet, CSV, JSON). @@ -298,6 +307,41 @@ class CsvReadOptions: chunk_size: int | None = None, ): ... +class JsonConvertOptions: + """ + Options for converting JSON data to Daft data. + """ + + limit: int | None + include_columns: list[str] | None + schema: PySchema | None + + def __init__( + self, + limit: int | None = None, + include_columns: list[str] | None = None, + schema: PySchema | None = None, + ): ... + +class JsonParseOptions: + """ + Options for parsing JSON files. + """ + +class JsonReadOptions: + """ + Options for reading JSON files. + """ + + buffer_size: int | None + chunk_size: int | None + + def __init__( + self, + buffer_size: int | None = None, + chunk_size: int | None = None, + ): ... + class FileInfo: """ Metadata for a single file. @@ -587,6 +631,21 @@ def read_csv_schema( io_config: IOConfig | None = None, multithreaded_io: bool | None = None, ): ... +def read_json( + uri: str, + convert_options: JsonConvertOptions | None = None, + parse_options: JsonParseOptions | None = None, + read_options: JsonReadOptions | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, + max_chunks_in_flight: int | None = None, +): ... +def read_json_schema( + uri: str, + parse_options: JsonParseOptions | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, +): ... class PyTimeUnit: @staticmethod @@ -931,6 +990,16 @@ class PyMicroPartition: io_config: IOConfig | None = None, multithreaded_io: bool | None = None, ): ... + @classmethod + def read_json_native( + cls, + uri: str, + convert_options: JsonConvertOptions | None = None, + parse_options: JsonParseOptions | None = None, + read_options: JsonReadOptions | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, + ): ... class PhysicalPlanScheduler: """ diff --git a/daft/execution/execution_step.py b/daft/execution/execution_step.py index 59c613f360..672aa0def7 100644 --- a/daft/execution/execution_step.py +++ b/daft/execution/execution_step.py @@ -17,6 +17,7 @@ FileFormatConfig, IOConfig, JoinType, + JsonReadOptions, JsonSourceConfig, ParquetSourceConfig, ResourceRequest, @@ -370,6 +371,9 @@ def _handle_tabular_files_scan( file=fp, schema=self.schema, storage_config=self.storage_config, + json_read_options=JsonReadOptions( + buffer_size=format_config.buffer_size, chunk_size=format_config.chunk_size + ), read_options=read_options, ) for fp in filepaths diff --git a/daft/io/_json.py b/daft/io/_json.py index 05d519cbe7..cb40bbea4c 100644 --- a/daft/io/_json.py +++ b/daft/io/_json.py @@ -7,6 +7,7 @@ FileFormatConfig, IOConfig, JsonSourceConfig, + NativeStorageConfig, PythonStorageConfig, StorageConfig, ) @@ -20,6 +21,9 @@ def read_json( path: Union[str, List[str]], schema_hints: Optional[Dict[str, DataType]] = None, io_config: Optional["IOConfig"] = None, + use_native_downloader: bool = True, + _buffer_size: Optional[int] = None, + _chunk_size: Optional[int] = None, ) -> DataFrame: """Creates a DataFrame from line-delimited JSON file(s) @@ -34,6 +38,8 @@ def read_json( schema_hints (dict[str, DataType]): A mapping between column names and datatypes - passing this option will disable all schema inference on data being read, and throw an error if data being read is incompatible. 
io_config (IOConfig): Config to be used with the native downloader + use_native_downloader: Whether to use the native downloader instead of PyArrow for reading JSON. This + is currently experimental. returns: DataFrame: parsed DataFrame """ if isinstance(path, list) and len(path) == 0: raise ValueError(f"Cannot read DataFrame from from empty list of JSON filepaths") - json_config = JsonSourceConfig() + json_config = JsonSourceConfig(_buffer_size, _chunk_size) file_format_config = FileFormatConfig.from_json_config(json_config) - storage_config = StorageConfig.python(PythonStorageConfig(io_config=io_config)) + if use_native_downloader: + storage_config = StorageConfig.native(NativeStorageConfig(True, io_config)) + else: + storage_config = StorageConfig.python(PythonStorageConfig(io_config=io_config)) builder = _get_tabular_files_scan(path, schema_hints, file_format_config, storage_config=storage_config) return DataFrame(builder) diff --git a/daft/logical/schema.py b/daft/logical/schema.py index 47c6d32569..c0a5455bd7 100644 --- a/daft/logical/schema.py +++ b/daft/logical/schema.py @@ -3,10 +3,11 @@ import sys from typing import TYPE_CHECKING, Iterator -from daft.daft import CsvParseOptions +from daft.daft import CsvParseOptions, JsonParseOptions from daft.daft import PyField as _PyField from daft.daft import PySchema as _PySchema from daft.daft import read_csv_schema as _read_csv_schema +from daft.daft import read_json_schema as _read_json_schema from daft.daft import read_parquet_schema as _read_parquet_schema from daft.datatype import DataType, TimeUnit @@ -183,3 +184,20 @@ def from_csv( multithreaded_io=multithreaded_io, ) ) + + @classmethod + def from_json( + cls, + path: str, + parse_options: JsonParseOptions | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, + ) -> Schema: + return Schema._from_pyschema( + _read_json_schema( + uri=path, + parse_options=parse_options, + io_config=io_config, + multithreaded_io=multithreaded_io, + ) + ) diff --git a/daft/table/micropartition.py b/daft/table/micropartition.py index f931fda2bd..1ebccb51d0 100644 --- a/daft/table/micropartition.py +++ b/daft/table/micropartition.py @@ -11,6 +11,9 @@ CsvReadOptions, IOConfig, JoinType, + JsonConvertOptions, + JsonParseOptions, + JsonReadOptions, ) from daft.daft import PyMicroPartition as _PyMicroPartition from daft.daft import PyTable as _PyTable @@ -374,3 +377,24 @@ def read_csv( multithreaded_io=multithreaded_io, ) ) + + @classmethod + def read_json( + cls, + path: str, + convert_options: JsonConvertOptions | None = None, + parse_options: JsonParseOptions | None = None, + read_options: JsonReadOptions | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, + ) -> MicroPartition: + return MicroPartition._from_pymicropartition( + _PyMicroPartition.read_json_native( + uri=path, + convert_options=convert_options, + parse_options=parse_options, + read_options=read_options, + io_config=io_config, + multithreaded_io=multithreaded_io, + ) + ) diff --git a/daft/table/schema_inference.py b/daft/table/schema_inference.py index dce4ca4f18..7be12c705c 100644 --- a/daft/table/schema_inference.py +++ b/daft/table/schema_inference.py @@ -8,6 +8,7 @@ from daft.daft import ( CsvParseOptions, + JsonParseOptions, NativeStorageConfig, PythonStorageConfig, StorageConfig, ) @@ -90,7 +91,16 @@ def from_json( io_config = None if storage_config is not None: config = storage_config.config - assert isinstance(config,
PythonStorageConfig), "JSON schema inference only supports PythonStorageConfig" + if isinstance(config, NativeStorageConfig): + assert isinstance(file, (str, pathlib.Path)), "Native downloader only works on string inputs to read_json" + io_config = config.io_config + return Schema.from_json( + str(file), + parse_options=JsonParseOptions(), + io_config=io_config, + ) + + assert isinstance(config, PythonStorageConfig) io_config = config.io_config with _open_stream(file, io_config) as f: diff --git a/daft/table/table.py b/daft/table/table.py index 1669170e23..6b87c2ee03 100644 --- a/daft/table/table.py +++ b/daft/table/table.py @@ -6,10 +6,19 @@ import pyarrow as pa from daft.arrow_utils import ensure_table -from daft.daft import CsvConvertOptions, CsvParseOptions, CsvReadOptions, JoinType +from daft.daft import ( + CsvConvertOptions, + CsvParseOptions, + CsvReadOptions, + JoinType, + JsonConvertOptions, + JsonParseOptions, + JsonReadOptions, +) from daft.daft import PyTable as _PyTable from daft.daft import ScanTask as _ScanTask from daft.daft import read_csv as _read_csv +from daft.daft import read_json as _read_json from daft.daft import read_parquet as _read_parquet from daft.daft import read_parquet_bulk as _read_parquet_bulk from daft.daft import read_parquet_into_pyarrow as _read_parquet_into_pyarrow @@ -465,6 +474,29 @@ def read_csv( ) ) + @classmethod + def read_json( + cls, + path: str, + convert_options: JsonConvertOptions | None = None, + parse_options: JsonParseOptions | None = None, + read_options: JsonReadOptions | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, + max_chunks_in_flight: int | None = None, + ) -> Table: + return Table._from_pytable( + _read_json( + uri=path, + convert_options=convert_options, + parse_options=parse_options, + read_options=read_options, + io_config=io_config, + multithreaded_io=multithreaded_io, + max_chunks_in_flight=max_chunks_in_flight, + ) + ) + def _trim_pyarrow_large_arrays(arr: pa.ChunkedArray) -> pa.ChunkedArray: if pa.types.is_large_binary(arr.type) or pa.types.is_large_string(arr.type): diff --git a/daft/table/table_io.py b/daft/table/table_io.py index 82f48b733c..6774ae5175 100644 --- a/daft/table/table_io.py +++ b/daft/table/table_io.py @@ -17,6 +17,9 @@ CsvParseOptions, CsvReadOptions, IOConfig, + JsonConvertOptions, + JsonParseOptions, + JsonReadOptions, NativeStorageConfig, PythonStorageConfig, StorageConfig, @@ -74,6 +77,7 @@ def read_json( file: FileInput, schema: Schema, storage_config: StorageConfig | None = None, + json_read_options: JsonReadOptions | None = None, read_options: TableReadOptions = TableReadOptions(), ) -> MicroPartition: """Reads a MicroPartition from a JSON file @@ -82,7 +86,8 @@ def read_json( file (str | IO): either a file-like object or a string file path (potentially prefixed with a protocol such as "s3://") fs (fsspec.AbstractFileSystem): fsspec FileSystem to use for reading data. By default, Daft will automatically construct a FileSystem instance internally. 
- read_options (TableReadOptions, optional): Options for reading the file + json_read_options (JsonReadOptions, optional): JSON-specific configs to apply when reading the file + read_options (TableReadOptions, optional): Non-format-specific options for reading the file Returns: MicroPartition: Parsed MicroPartition from JSON @@ -90,8 +95,25 @@ def read_json( io_config = None if storage_config is not None: config = storage_config.config - assert isinstance(config, PythonStorageConfig), "JSON reads only supports PyStorageConfig" - io_config = config.io_config + if isinstance(config, NativeStorageConfig): + assert isinstance(file, (str, pathlib.Path)), "Native downloader only works on string inputs to read_json" + json_convert_options = JsonConvertOptions( + limit=read_options.num_rows, + include_columns=read_options.column_names, + schema=schema._schema if schema is not None else None, + ) + json_parse_options = JsonParseOptions() + tbl = MicroPartition.read_json( + str(file), + convert_options=json_convert_options, + parse_options=json_parse_options, + read_options=json_read_options, + io_config=config.io_config, + ) + return _cast_table_to_schema(tbl, read_options=read_options, schema=schema) + else: + assert isinstance(config, PythonStorageConfig) + io_config = config.io_config with _open_stream(file, io_config) as f: table = pajson.read_json(f) @@ -130,7 +152,7 @@ def read_parquet( if isinstance(config, NativeStorageConfig): assert isinstance( file, (str, pathlib.Path) - ), "Native downloader only works on string inputs to read_parquet" + ), "Native downloader only works on string or Path inputs to read_parquet" tbl = MicroPartition.read_parquet( str(file), columns=read_options.column_names, @@ -225,7 +247,7 @@ def read_csv( if isinstance(config, NativeStorageConfig): assert isinstance( file, (str, pathlib.Path) - ), "Native downloader only works on string inputs to read_parquet" + ), "Native downloader only works on string or Path inputs to read_csv" has_header = csv_options.header_index is not None csv_convert_options = CsvConvertOptions( limit=read_options.num_rows, diff --git a/src/daft-compression/Cargo.toml b/src/daft-compression/Cargo.toml new file mode 100644 index 0000000000..6b695535e5 --- /dev/null +++ b/src/daft-compression/Cargo.toml @@ -0,0 +1,9 @@ +[dependencies] +async-compression = {workspace = true} +tokio = {workspace = true} +url = {workspace = true} + +[package] +edition = {workspace = true} +name = "daft-compression" +version = {workspace = true} diff --git a/src/daft-compression/src/compression.rs b/src/daft-compression/src/compression.rs new file mode 100644 index 0000000000..268b1566d9 --- /dev/null +++ b/src/daft-compression/src/compression.rs @@ -0,0 +1,66 @@ +use async_compression::tokio::bufread::{ + BrotliDecoder, BzDecoder, DeflateDecoder, GzipDecoder, LzmaDecoder, XzDecoder, ZlibDecoder, + ZstdDecoder, +}; +use std::{path::PathBuf, pin::Pin}; +use tokio::io::{AsyncBufRead, AsyncRead}; +use url::Url; + +#[derive(Debug)] +pub enum CompressionCodec { + Brotli, + Bz, + Deflate, + Gzip, + Lzma, + Xz, + Zlib, + Zstd, +} + +impl CompressionCodec { + pub fn from_uri(uri: &str) -> Option { + let url = Url::parse(uri); + let path = match &url { + Ok(url) => url.path(), + _ => uri, + }; + let extension = PathBuf::from(path) + .extension()? 
+ .to_string_lossy() + .to_string(); + Self::from_extension(extension.as_ref()) + } + pub fn from_extension(extension: &str) -> Option { + use CompressionCodec::*; + match extension { + "br" => Some(Brotli), + "bz2" => Some(Bz), + "deflate" => Some(Deflate), + "gz" => Some(Gzip), + "lzma" => Some(Lzma), + "xz" => Some(Xz), + "zl" => Some(Zlib), + "zstd" | "zst" => Some(Zstd), + "snappy" => todo!("Snappy compression support not yet implemented"), + _ => None, + } + } + + pub fn to_decoder( + &self, + reader: T, + ) -> Pin> { + use CompressionCodec::*; + match self { + Brotli => Box::pin(BrotliDecoder::new(reader)), + Bz => Box::pin(BzDecoder::new(reader)), + Deflate => Box::pin(DeflateDecoder::new(reader)), + Gzip => Box::pin(GzipDecoder::new(reader)), + Lzma => Box::pin(LzmaDecoder::new(reader)), + Xz => Box::pin(XzDecoder::new(reader)), + Zlib => Box::pin(ZlibDecoder::new(reader)), + Zstd => Box::pin(ZstdDecoder::new(reader)), + } + } +} diff --git a/src/daft-compression/src/lib.rs b/src/daft-compression/src/lib.rs new file mode 100644 index 0000000000..9b869b3345 --- /dev/null +++ b/src/daft-compression/src/lib.rs @@ -0,0 +1,4 @@ +//! Utilities for async decompression of data. +pub mod compression; + +pub use compression::CompressionCodec; diff --git a/src/daft-csv/Cargo.toml b/src/daft-csv/Cargo.toml index d0e164b7d9..620316e8d0 100644 --- a/src/daft-csv/Cargo.toml +++ b/src/daft-csv/Cargo.toml @@ -9,6 +9,7 @@ chrono = {workspace = true} chrono-tz = {workspace = true} common-error = {path = "../common/error", default-features = false} csv-async = "1.2.6" +daft-compression = {path = "../daft-compression", default-features = false} daft-core = {path = "../daft-core", default-features = false} daft-decoding = {path = "../daft-decoding"} daft-io = {path = "../daft-io", default-features = false} diff --git a/src/daft-csv/src/lib.rs b/src/daft-csv/src/lib.rs index 7541563578..10c3d024a6 100644 --- a/src/daft-csv/src/lib.rs +++ b/src/daft-csv/src/lib.rs @@ -5,7 +5,6 @@ use common_error::DaftError; use snafu::Snafu; -mod compression; pub mod metadata; pub mod options; #[cfg(feature = "python")] diff --git a/src/daft-csv/src/metadata.rs b/src/daft-csv/src/metadata.rs index 75ac0ad43a..f45f25946b 100644 --- a/src/daft-csv/src/metadata.rs +++ b/src/daft-csv/src/metadata.rs @@ -14,7 +14,8 @@ use tokio::{ }; use tokio_util::io::StreamReader; -use crate::{compression::CompressionCodec, schema::merge_schema, CsvParseOptions}; +use crate::{schema::merge_schema, CsvParseOptions}; +use daft_compression::CompressionCodec; use daft_decoding::inference::infer; const DEFAULT_COLUMN_PREFIX: &str = "column_"; diff --git a/src/daft-csv/src/read.rs b/src/daft-csv/src/read.rs index 460e01f377..772bbca00b 100644 --- a/src/daft-csv/src/read.rs +++ b/src/daft-csv/src/read.rs @@ -25,8 +25,9 @@ use tokio::{ }; use tokio_util::io::StreamReader; -use crate::{compression::CompressionCodec, ArrowSnafu}; +use crate::ArrowSnafu; use crate::{metadata::read_csv_schema_single, CsvConvertOptions, CsvParseOptions, CsvReadOptions}; +use daft_compression::CompressionCodec; use daft_decoding::deserialize::deserialize_column; trait ByteRecordChunkStream = Stream>>; @@ -81,64 +82,58 @@ pub fn read_csv_bulk( ) -> DaftResult> { let runtime_handle = get_runtime(multithreaded_io)?; let _rt_guard = runtime_handle.enter(); - let tables = runtime_handle - .block_on(async move { - // Launch a read task per URI, throttling the number of concurrent file reads to num_parallel tasks. 
- let task_stream = futures::stream::iter(uris.iter().enumerate().map(|(i, uri)| { - let (uri, convert_options, parse_options, read_options, io_client, io_stats) = ( - uri.to_string(), - convert_options.clone(), - parse_options.clone(), - read_options.clone(), - io_client.clone(), - io_stats.clone(), - ); - tokio::task::spawn(async move { - let table = read_csv_single_into_table( - uri.as_str(), - convert_options, - parse_options, - read_options, - io_client, - io_stats, - max_chunks_in_flight, - ) - .await?; - Ok((i, table)) - }) - })); - let mut remaining_rows = convert_options - .as_ref() - .and_then(|opts| opts.limit.map(|limit| limit as i64)); - task_stream - // Each task is annotated with its position in the output, so we can use unordered buffering to help mitigate stragglers - // and sort the task results at the end. - .buffer_unordered(num_parallel_tasks) - // Terminate the stream if we have already reached the row limit. With the upstream buffering, we will still read up to - // num_parallel_tasks redundant files. - .try_take_while(|result| { - match (result, remaining_rows) { - // Limit has been met, early-teriminate. - (_, Some(rows_left)) if rows_left <= 0 => futures::future::ready(Ok(false)), - // Limit has not yet been met, update remaining limit slack and continue. - (Ok((_, table)), Some(rows_left)) => { - remaining_rows = Some(rows_left - table.len() as i64); - futures::future::ready(Ok(true)) - } - // (1) No limit, never early-terminate. - // (2) Encountered error, propagate error to try_collect to allow it to short-circuit. - (_, None) | (Err(_), _) => futures::future::ready(Ok(true)), - } - }) - .try_collect::>() + let tables = runtime_handle.block_on(async move { + // Launch a read task per URI, throttling the number of concurrent file reads to num_parallel tasks. + let task_stream = futures::stream::iter(uris.iter().map(|uri| { + let (uri, convert_options, parse_options, read_options, io_client, io_stats) = ( + uri.to_string(), + convert_options.clone(), + parse_options.clone(), + read_options.clone(), + io_client.clone(), + io_stats.clone(), + ); + tokio::task::spawn(async move { + read_csv_single_into_table( + uri.as_str(), + convert_options, + parse_options, + read_options, + io_client, + io_stats, + max_chunks_in_flight, + ) .await - }) - .context(super::JoinSnafu {})?; + }) + .context(super::JoinSnafu {}) + })); + let mut remaining_rows = convert_options + .as_ref() + .and_then(|opts| opts.limit.map(|limit| limit as i64)); + task_stream + // Limit the number of file reads we have in flight at any given time. + .buffered(num_parallel_tasks) + // Terminate the stream if we have already reached the row limit. With the upstream buffering, we will still read up to + // num_parallel_tasks redundant files. + .try_take_while(|result| { + match (result, remaining_rows) { + // Limit has been met, early-teriminate. + (_, Some(rows_left)) if rows_left <= 0 => futures::future::ready(Ok(false)), + // Limit has not yet been met, update remaining limit slack and continue. + (Ok(table), Some(rows_left)) => { + remaining_rows = Some(rows_left - table.len() as i64); + futures::future::ready(Ok(true)) + } + // (1) No limit, never early-terminate. + // (2) Encountered error, propagate error to try_collect to allow it to short-circuit. + (_, None) | (Err(_), _) => futures::future::ready(Ok(true)), + } + }) + .try_collect::>() + .await + })?; - // Sort the task results by task index, yielding tables whose order matches the input URI order. 
- let mut collected = tables.into_iter().collect::>>()?; - collected.sort_by_key(|(idx, _)| *idx); - Ok(collected.into_iter().map(|(_, v)| v).collect()) + tables.into_iter().collect::>>() } async fn read_csv_single_into_table( @@ -1175,11 +1170,11 @@ mod tests { file.as_ref(), None, None, - None, + Some(CsvReadOptions::default().with_chunk_size(Some(5))), io_client, None, true, - Some(5), + Some(2), )?; assert_eq!(table.len(), 20); assert_eq!( diff --git a/src/daft-decoding/Cargo.toml b/src/daft-decoding/Cargo.toml index 4a054d0df9..e0eb888cd6 100644 --- a/src/daft-decoding/Cargo.toml +++ b/src/daft-decoding/Cargo.toml @@ -1,10 +1,13 @@ [dependencies] arrow2 = {workspace = true, features = ["io_csv", "io_csv_async"]} +async-compression = {workspace = true} chrono = {workspace = true} chrono-tz = {workspace = true} csv-async = "1.2.6" lexical-core = {version = "0.8"} simdutf8 = "0.1.3" +tokio = {workspace = true} +url = {workspace = true} [package] edition = {workspace = true} diff --git a/src/daft-decoding/src/deserialize.rs b/src/daft-decoding/src/deserialize.rs index e8e79156a6..a57a2c2ac2 100644 --- a/src/daft-decoding/src/deserialize.rs +++ b/src/daft-decoding/src/deserialize.rs @@ -12,6 +12,8 @@ use csv_async::ByteRecord; pub(crate) const ISO8601: &str = "%+"; pub(crate) const ISO8601_NO_TIME_ZONE: &str = "%Y-%m-%dT%H:%M:%S%.f"; pub(crate) const ISO8601_NO_TIME_ZONE_NO_FRACTIONAL: &str = "%Y-%m-%dT%H:%M:%S"; +pub(crate) const ISO8601_DATE: &str = "%Y-%m-%d"; +pub(crate) const ISO8601_DATE_SLASHES: &str = "%Y/%m/%d"; pub(crate) const RFC3339_WITH_SPACE: &str = "%Y-%m-%d %H:%M:%S%.f%:z"; pub(crate) const RFC3339_WITH_SPACE_NO_TIME_ZONE: &str = "%Y-%m-%d %H:%M:%S%.f"; pub(crate) const RFC3339_WITH_SPACE_NO_TIME_ZONE_NO_FRACTIONAL: &str = "%Y-%m-%d %H:%M:%S"; @@ -20,8 +22,11 @@ pub(crate) const ALL_NAIVE_TIMESTAMP_FMTS: &[&str] = &[ ISO8601_NO_TIME_ZONE_NO_FRACTIONAL, RFC3339_WITH_SPACE_NO_TIME_ZONE, RFC3339_WITH_SPACE_NO_TIME_ZONE_NO_FRACTIONAL, + ISO8601_DATE, + ISO8601_DATE_SLASHES, ]; pub(crate) const ALL_TIMESTAMP_FMTS: &[&str] = &[ISO8601, RFC3339_WITH_SPACE]; +pub(crate) const ALL_NAIVE_DATE_FMTS: &[&str] = &[ISO8601_DATE, ISO8601_DATE_SLASHES]; // Ideally this trait should not be needed and both `csv` and `csv_async` crates would share // the same `ByteRecord` struct. Unfortunately, they do not and thus we must use generics @@ -153,7 +158,24 @@ fn deserialize_null(rows: &[B], _: usize) -> Box Option { +pub fn deserialize_naive_date(string: &str, fmt_idx: &mut usize) -> Option { + // TODO(Clark): Parse as all candidate formats in a single pass. + for i in 0..ALL_NAIVE_DATE_FMTS.len() { + let idx = (i + *fmt_idx) % ALL_NAIVE_DATE_FMTS.len(); + let fmt = ALL_NAIVE_DATE_FMTS[idx]; + if let Ok(dt) = chrono::NaiveDate::parse_from_str(string, fmt) { + *fmt_idx = idx; + return Some(dt); + } + } + None +} + +#[inline] +pub fn deserialize_naive_datetime( + string: &str, + fmt_idx: &mut usize, +) -> Option { // TODO(Clark): Parse as all candidate formats in a single pass. 
for i in 0..ALL_NAIVE_TIMESTAMP_FMTS.len() { let idx = (i + *fmt_idx) % ALL_NAIVE_TIMESTAMP_FMTS.len(); @@ -167,7 +189,7 @@ fn deserialize_naive_datetime(string: &str, fmt_idx: &mut usize) -> Option( +pub fn deserialize_datetime( string: &str, tz: &T, fmt_idx: &mut usize, @@ -234,13 +256,15 @@ pub fn deserialize_column( lexical_core::parse::(bytes).ok() }), Date32 => deserialize_primitive(rows, column, datatype, |bytes| { + let mut last_fmt_idx = 0; to_utf8(bytes) - .and_then(|x| x.parse::().ok()) + .and_then(|x| deserialize_naive_date(x, &mut last_fmt_idx)) .map(|x| x.num_days_from_ce() - temporal_conversions::EPOCH_DAYS_FROM_CE) }), Date64 => deserialize_primitive(rows, column, datatype, |bytes| { + let mut last_fmt_idx = 0; to_utf8(bytes) - .and_then(|x| x.parse::().ok()) + .and_then(|x| deserialize_naive_datetime(x, &mut last_fmt_idx)) .map(|x| x.timestamp_millis()) }), Time32(time_unit) => deserialize_primitive(rows, column, datatype, |bytes| { @@ -310,7 +334,7 @@ pub fn deserialize_column( } // Return the factor by how small is a time unit compared to seconds -fn get_factor_from_timeunit(time_unit: TimeUnit) -> u32 { +pub fn get_factor_from_timeunit(time_unit: TimeUnit) -> u32 { match time_unit { TimeUnit::Second => 1, TimeUnit::Millisecond => 1_000, diff --git a/src/daft-decoding/src/inference.rs b/src/daft-decoding/src/inference.rs index e1f4fb259f..b91630c9a0 100644 --- a/src/daft-decoding/src/inference.rs +++ b/src/daft-decoding/src/inference.rs @@ -1,7 +1,7 @@ -use arrow2::datatypes::TimeUnit; +use arrow2::datatypes::{DataType, TimeUnit}; use chrono::Timelike; -use crate::deserialize::{ALL_NAIVE_TIMESTAMP_FMTS, ALL_TIMESTAMP_FMTS}; +use crate::deserialize::{ALL_NAIVE_DATE_FMTS, ALL_NAIVE_TIMESTAMP_FMTS, ALL_TIMESTAMP_FMTS}; /// Infers [`DataType`] from `bytes` /// # Implementation @@ -16,7 +16,6 @@ use crate::deserialize::{ALL_NAIVE_TIMESTAMP_FMTS, ALL_TIMESTAMP_FMTS}; /// * other utf8 is mapped to [`DataType::Utf8`] /// * invalid utf8 is mapped to [`DataType::Binary`] pub fn infer(bytes: &[u8]) -> arrow2::datatypes::DataType { - use arrow2::datatypes::DataType; if is_null(bytes) { DataType::Null } else if is_boolean(bytes) { @@ -26,23 +25,30 @@ pub fn infer(bytes: &[u8]) -> arrow2::datatypes::DataType { } else if is_float(bytes) { DataType::Float64 } else if let Ok(string) = simdutf8::basic::from_utf8(bytes) { - if is_date(string) { - DataType::Date32 - } else if is_time(string) { - DataType::Time32(TimeUnit::Millisecond) - } else if let Some(time_unit) = is_naive_datetime(string) { - DataType::Timestamp(time_unit, None) - } else if let Some((time_unit, offset)) = is_datetime(string) { - DataType::Timestamp(time_unit, Some(offset)) - } else { - DataType::Utf8 - } + infer_string(string) } else { // invalid utf8 DataType::Binary } } +pub fn infer_string(string: &str) -> DataType { + if is_date(string) { + DataType::Date32 + } else if let Some(time_unit) = is_time(string) { + DataType::Time32(time_unit) + } else if let Some((time_unit, offset)) = is_datetime(string) { + // NOTE: We try to parse as a non-naive datatime (with timezone information) first, + // since is_datetime() will return false if timezone information is not present in the string, + // while is_naive_datetime() will ignore timezone information in the string. 
+ DataType::Timestamp(time_unit, Some(offset)) + } else if let Some(time_unit) = is_naive_datetime(string) { + DataType::Timestamp(time_unit, None) + } else { + DataType::Utf8 + } +} + fn is_null(bytes: &[u8]) -> bool { bytes.is_empty() } @@ -60,11 +66,20 @@ fn is_integer(bytes: &[u8]) -> bool { } fn is_date(string: &str) -> bool { - string.parse::().is_ok() + for fmt in ALL_NAIVE_DATE_FMTS { + if chrono::NaiveDate::parse_from_str(string, fmt).is_ok() { + return true; + } + } + false } -fn is_time(string: &str) -> bool { - string.parse::().is_ok() +fn is_time(string: &str) -> Option { + if let Ok(t) = string.parse::() { + let time_unit = nanoseconds_to_time_unit(t.nanosecond()); + return Some(time_unit); + } + None } fn is_naive_datetime(string: &str) -> Option { diff --git a/src/daft-json/Cargo.toml b/src/daft-json/Cargo.toml new file mode 100644 index 0000000000..74060528e0 --- /dev/null +++ b/src/daft-json/Cargo.toml @@ -0,0 +1,48 @@ +[dependencies] +arrow2 = {workspace = true, features = ["io_json"]} +async-compat = {workspace = true} +async-compression = {workspace = true} +async-stream = {workspace = true} +bincode = {workspace = true} +bytes = {workspace = true} +chrono = {workspace = true} +chrono-tz = {workspace = true} +common-error = {path = "../common/error", default-features = false} +daft-compression = {path = "../daft-compression", default-features = false} +daft-core = {path = "../daft-core", default-features = false} +daft-decoding = {path = "../daft-decoding"} +daft-io = {path = "../daft-io", default-features = false} +daft-table = {path = "../daft-table", default-features = false} +futures = {workspace = true} +indexmap = {workspace = true} +lexical-core = {version = "0.8"} +log = {workspace = true} +num-traits = {workspace = true} +pyo3 = {workspace = true, optional = true} +pyo3-log = {workspace = true, optional = true} +rayon = {workspace = true} +serde = {workspace = true} +simdutf8 = "0.1.3" +snafu = {workspace = true} +tokio = {workspace = true} +tokio-stream = {workspace = true, features = ["io-util"]} +tokio-util = {workspace = true} +url = {workspace = true} + +[dependencies.json-deserializer] +features = ["preserve_order"] +git = "https://github.com/Eventual-Inc/json-deserializer" +package = "json-deserializer" +rev = "4be9205900888ff0767e0c37642884f2c912e22c" + +[dev-dependencies] +rstest = {workspace = true} + +[features] +default = ["python"] +python = ["dep:pyo3", "dep:pyo3-log", "common-error/python", "daft-core/python", "daft-io/python", "daft-table/python"] + +[package] +edition = {workspace = true} +name = "daft-json" +version = {workspace = true} diff --git a/src/daft-json/src/decoding.rs b/src/daft-json/src/decoding.rs new file mode 100644 index 0000000000..e8de99b8e3 --- /dev/null +++ b/src/daft-json/src/decoding.rs @@ -0,0 +1,537 @@ +use std::borrow::Borrow; + +use arrow2::array::{ + Array, MutableArray, MutableBooleanArray, MutableFixedSizeListArray, MutableListArray, + MutableNullArray, MutablePrimitiveArray, MutableStructArray, MutableUtf8Array, +}; +use arrow2::bitmap::MutableBitmap; +use arrow2::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}; +use arrow2::error::{Error, Result}; +use arrow2::offset::Offsets; +use arrow2::temporal_conversions; +use arrow2::types::{f16, NativeType, Offset}; +use chrono::{Datelike, Timelike}; +use daft_decoding::deserialize::{ + deserialize_datetime, deserialize_naive_date, deserialize_naive_datetime, + get_factor_from_timeunit, +}; +use indexmap::IndexMap; +use json_deserializer::{Number, Value}; + 
+/// Deserialize chunk of JSON records into a chunk of Arrow2 arrays. +pub(crate) fn deserialize_records( + records: Vec, + schema: &Schema, + schema_is_projection: bool, +) -> Result>> { + // Allocate mutable arrays. + let mut results = schema + .fields + .iter() + .map(|f| (&f.name, allocate_array(f, records.len()))) + .collect::>(); + + for record in records { + match record { + Value::Object(record) => { + for (key, value) in record.iter() { + let arr = results.get_mut(key); + if let Some(arr) = arr { + deserialize_into(arr, &[value]); + } else if !schema_is_projection { + // Provided schema is either the full schema or a projection. + // If this key isn't in the schema-derived array map AND there was no projection, + // we return an error. Otherwise, we drop this key-value pair. + return Err(Error::ExternalFormat(format!("unexpected key: '{key}'"))); + } + } + } + _ => { + return Err(Error::ExternalFormat(format!( + "Each line in a newline-delimited JSON file must be a JSON object, but got: {:?}", + record + ))) + } + } + } + + Ok(results.into_values().map(|mut ma| ma.as_box()).collect()) +} + +fn allocate_array(f: &Field, length: usize) -> Box { + match f.data_type() { + DataType::Null => Box::new(MutableNullArray::new(DataType::Null, 0)), + DataType::Int8 => Box::new(MutablePrimitiveArray::::with_capacity(length)), + DataType::Int16 => Box::new(MutablePrimitiveArray::::with_capacity(length)), + dt @ (DataType::Int32 + | DataType::Date32 + | DataType::Time32(_) + | DataType::Interval(IntervalUnit::YearMonth)) => { + Box::new(MutablePrimitiveArray::::with_capacity(length).to(dt.clone())) + } + dt @ (DataType::Int64 + | DataType::Date64 + | DataType::Time64(_) + | DataType::Duration(_) + | DataType::Timestamp(..)) => { + Box::new(MutablePrimitiveArray::::with_capacity(length).to(dt.clone())) + } + DataType::UInt8 => Box::new(MutablePrimitiveArray::::with_capacity(length)), + DataType::UInt16 => Box::new(MutablePrimitiveArray::::with_capacity(length)), + DataType::UInt32 => Box::new(MutablePrimitiveArray::::with_capacity(length)), + DataType::UInt64 => Box::new(MutablePrimitiveArray::::with_capacity(length)), + DataType::Float16 => Box::new(MutablePrimitiveArray::::with_capacity(length)), + DataType::Float32 => Box::new(MutablePrimitiveArray::::with_capacity(length)), + DataType::Float64 => Box::new(MutablePrimitiveArray::::with_capacity(length)), + DataType::Boolean => Box::new(MutableBooleanArray::with_capacity(length)), + DataType::Utf8 => Box::new(MutableUtf8Array::::with_capacity(length)), + DataType::LargeUtf8 => Box::new(MutableUtf8Array::::with_capacity(length)), + DataType::FixedSizeList(inner, size) => Box::new(MutableFixedSizeListArray::new_from( + allocate_array(inner, length), + f.data_type().clone(), + *size, + )), + // TODO(Clark): Ensure that these mutable list arrays work correctly and efficiently for arbitrarily nested arrays. + // TODO(Clark): We have to manually give a non-None bitmap due to a bug in try_extend_from_lengths for + // mutable list arrays, which will unintentionally drop the validity mask if the bitmap isn't already non-None. 
+ DataType::List(inner) => Box::new(MutableListArray::new_from_mutable( + allocate_array(inner, length), + Offsets::::with_capacity(length), + Some(MutableBitmap::with_capacity(length)), + )), + DataType::LargeList(inner) => Box::new(MutableListArray::new_from_mutable( + allocate_array(inner, length), + Offsets::::with_capacity(length), + Some(MutableBitmap::with_capacity(length)), + )), + // TODO(Clark): We have to manually give a non-None bitmap due to a bug in MutableStructArray::push(), which will + // unintentionally drop the first null added to the validity mask if a bitmap hasn't been initialized from the start. + dt @ DataType::Struct(inner) => Box::new( + MutableStructArray::try_new( + dt.clone(), + inner + .iter() + .map(|field| allocate_array(field, length)) + .collect::>(), + Some(MutableBitmap::with_capacity(length)), + ) + .unwrap(), + ), + dt => todo!("Dtype not supported: {:?}", dt), + } +} + +/// Deserialize `rows` by extending them into the given `target` +fn deserialize_into<'a, A: Borrow>>(target: &mut Box, rows: &[A]) { + match target.data_type() { + DataType::Null => { + // TODO(Clark): Return an error if any of rows are not Value::Null. + for _ in 0..rows.len() { + target.push_null() + } + } + DataType::Boolean => generic_deserialize_into(target, rows, deserialize_boolean_into), + DataType::Float32 => { + deserialize_primitive_into::<_, f32>(target, rows, deserialize_float_into) + } + DataType::Float64 => { + deserialize_primitive_into::<_, f64>(target, rows, deserialize_float_into) + } + DataType::Int8 => deserialize_primitive_into::<_, i8>(target, rows, deserialize_int_into), + DataType::Int16 => deserialize_primitive_into::<_, i16>(target, rows, deserialize_int_into), + DataType::Int32 | DataType::Interval(IntervalUnit::YearMonth) => { + deserialize_primitive_into::<_, i32>(target, rows, deserialize_int_into) + } + DataType::Date32 | DataType::Time32(_) => { + deserialize_primitive_into::<_, i32>(target, rows, deserialize_date_into) + } + DataType::Interval(IntervalUnit::DayTime) => { + unimplemented!("There is no natural representation of DayTime in JSON.") + } + DataType::Int64 | DataType::Duration(_) => { + deserialize_primitive_into::<_, i64>(target, rows, deserialize_int_into) + } + DataType::Timestamp(..) 
| DataType::Date64 | DataType::Time64(_) => { + deserialize_primitive_into::<_, i64>(target, rows, deserialize_datetime_into) + } + DataType::UInt8 => deserialize_primitive_into::<_, u8>(target, rows, deserialize_int_into), + DataType::UInt16 => { + deserialize_primitive_into::<_, u16>(target, rows, deserialize_int_into) + } + DataType::UInt32 => { + deserialize_primitive_into::<_, u32>(target, rows, deserialize_int_into) + } + DataType::UInt64 => { + deserialize_primitive_into::<_, u64>(target, rows, deserialize_int_into) + } + DataType::Utf8 => generic_deserialize_into::<_, MutableUtf8Array>( + target, + rows, + deserialize_utf8_into, + ), + DataType::LargeUtf8 => generic_deserialize_into::<_, MutableUtf8Array>( + target, + rows, + deserialize_utf8_into, + ), + DataType::FixedSizeList(_, _) => { + generic_deserialize_into(target, rows, deserialize_fixed_size_list_into) + } + DataType::List(_) => deserialize_list_into( + target + .as_mut_any() + .downcast_mut::>>() + .unwrap(), + rows, + ), + DataType::LargeList(_) => deserialize_list_into( + target + .as_mut_any() + .downcast_mut::>>() + .unwrap(), + rows, + ), + DataType::Struct(_) => { + generic_deserialize_into::<_, MutableStructArray>(target, rows, deserialize_struct_into) + } + // TODO(Clark): Add support for decimal type. + // TODO(Clark): Add support for binary and large binary types. + dt => { + todo!("Dtype not supported: {:?}", dt) + } + } +} + +fn deserialize_primitive_into<'a, A: Borrow>, T: NativeType>( + target: &mut Box, + rows: &[A], + deserialize_into: fn(&mut MutablePrimitiveArray, &[A]) -> (), +) { + generic_deserialize_into(target, rows, deserialize_into) +} + +fn generic_deserialize_into<'a, A: Borrow>, M: 'static>( + target: &mut Box, + rows: &[A], + deserialize_into: fn(&mut M, &[A]) -> (), +) { + deserialize_into(target.as_mut_any().downcast_mut::().unwrap(), rows); +} + +fn deserialize_utf8_into<'a, O: Offset, A: Borrow>>( + target: &mut MutableUtf8Array, + rows: &[A], +) { + let mut scratch = vec![]; + for row in rows { + match row.borrow() { + Value::String(v) => target.push(Some(v.as_ref())), + Value::Number(number) => match number { + Number::Integer(number, exponent) | Number::Float(number, exponent) => { + scratch.clear(); + scratch.extend_from_slice(number); + if !exponent.is_empty() { + scratch.push(b'e'); + scratch.extend_from_slice(exponent); + } + target.push(simdutf8::basic::from_utf8(scratch.as_slice()).ok()); + } + }, + Value::Bool(v) => target.push(Some(if *v { "true" } else { "false" })), + _ => target.push_null(), + } + } +} + +fn deserialize_int_into< + 'a, + T: NativeType + lexical_core::FromLexical + Pow10, + A: Borrow>, +>( + target: &mut MutablePrimitiveArray, + rows: &[A], +) { + let iter = rows.iter().map(|row| match row.borrow() { + Value::Number(number) => Some(deserialize_int_single(*number)), + Value::Bool(number) => Some(if *number { T::one() } else { T::default() }), + _ => None, + }); + target.extend_trusted_len(iter); +} + +fn deserialize_float_into< + 'a, + T: NativeType + lexical_core::FromLexical + Powi10, + A: Borrow>, +>( + target: &mut MutablePrimitiveArray, + rows: &[A], +) { + let iter = rows.iter().map(|row| match row.borrow() { + Value::Number(number) => Some(deserialize_float_single(number)), + Value::Bool(number) => Some(if *number { T::one() } else { T::default() }), + _ => None, + }); + target.extend_trusted_len(iter); +} + +fn deserialize_boolean_into<'a, A: Borrow>>( + target: &mut MutableBooleanArray, + rows: &[A], +) { + let iter = rows.iter().map(|row| match 
row.borrow() { + Value::Bool(v) => Some(v), + _ => None, + }); + target.extend_trusted_len(iter); +} + +fn deserialize_int_single(number: Number) -> T +where + T: NativeType + lexical_core::FromLexical + Pow10, +{ + match number { + Number::Float(fraction, exponent) => { + let integer = fraction.split(|x| *x == b'.').next().unwrap(); + let mut integer: T = lexical_core::parse(integer).unwrap(); + if !exponent.is_empty() { + let exponent: u32 = lexical_core::parse(exponent).unwrap(); + integer = integer.pow10(exponent); + } + integer + } + Number::Integer(integer, exponent) => { + let mut integer: T = lexical_core::parse(integer).unwrap(); + if !exponent.is_empty() { + let exponent: u32 = lexical_core::parse(exponent).unwrap(); + integer = integer.pow10(exponent); + } + integer + } + } +} + +trait Powi10: NativeType + num_traits::One + std::ops::Add { + fn powi10(self, exp: i32) -> Self; +} + +impl Powi10 for f32 { + #[inline] + fn powi10(self, exp: i32) -> Self { + self * 10.0f32.powi(exp) + } +} + +impl Powi10 for f64 { + #[inline] + fn powi10(self, exp: i32) -> Self { + self * 10.0f64.powi(exp) + } +} + +trait Pow10: NativeType + num_traits::One + std::ops::Add { + fn pow10(self, exp: u32) -> Self; +} + +macro_rules! impl_pow10 { + ($ty:ty) => { + impl Pow10 for $ty { + #[inline] + fn pow10(self, exp: u32) -> Self { + self * (10 as $ty).pow(exp) + } + } + }; +} +impl_pow10!(u8); +impl_pow10!(u16); +impl_pow10!(u32); +impl_pow10!(u64); +impl_pow10!(i8); +impl_pow10!(i16); +impl_pow10!(i32); +impl_pow10!(i64); + +fn deserialize_float_single(number: &Number) -> T +where + T: NativeType + lexical_core::FromLexical + Powi10, +{ + match number { + Number::Float(float, exponent) => { + let mut float: T = lexical_core::parse(float).unwrap(); + if !exponent.is_empty() { + let exponent: i32 = lexical_core::parse(exponent).unwrap(); + float = float.powi10(exponent); + } + float + } + Number::Integer(integer, exponent) => { + let mut float: T = lexical_core::parse(integer).unwrap(); + if !exponent.is_empty() { + let exponent: i32 = lexical_core::parse(exponent).unwrap(); + float = float.powi10(exponent); + } + float + } + } +} + +fn deserialize_date_into<'a, A: Borrow>>( + target: &mut MutablePrimitiveArray, + rows: &[A], +) { + let dtype = target.data_type().clone(); + let mut last_fmt_idx = 0; + let iter = rows.iter().map(|row| match row.borrow() { + Value::Number(v) => Some(deserialize_int_single(*v)), + Value::String(v) => match dtype { + DataType::Time32(tu) => { + let factor = get_factor_from_timeunit(tu); + v.parse::().ok().map(|x| { + (x.hour() * 3_600 * factor + + x.minute() * 60 * factor + + x.second() * factor + + x.nanosecond() / (1_000_000_000 / factor)) as i32 + }) + } + DataType::Date32 => deserialize_naive_date(v, &mut last_fmt_idx) + .map(|x| x.num_days_from_ce() - temporal_conversions::EPOCH_DAYS_FROM_CE), + _ => unreachable!(), + }, + _ => None, + }); + target.extend_trusted_len(iter); +} + +fn deserialize_datetime_into<'a, A: Borrow>>( + target: &mut MutablePrimitiveArray, + rows: &[A], +) { + let dtype = target.data_type().clone(); + let mut last_fmt_idx = 0; + let iter = rows.iter().map(|row| match row.borrow() { + Value::Number(v) => Some(deserialize_int_single(*v)), + Value::String(v) => match dtype { + DataType::Time64(tu) => { + let factor = get_factor_from_timeunit(tu) as u64; + v.parse::().ok().map(|x| { + (x.hour() as u64 * 3_600 * factor + + x.minute() as u64 * 60 * factor + + x.second() as u64 * factor + + x.nanosecond() as u64 / (1_000_000_000 / factor)) + as i64 + }) 
+ } + DataType::Date64 => { + deserialize_naive_datetime(v, &mut last_fmt_idx).map(|x| x.timestamp_millis()) + } + DataType::Timestamp(tu, None) => deserialize_naive_datetime(v, &mut last_fmt_idx) + .and_then(|dt| match tu { + TimeUnit::Second => Some(dt.timestamp()), + TimeUnit::Millisecond => Some(dt.timestamp_millis()), + TimeUnit::Microsecond => Some(dt.timestamp_micros()), + TimeUnit::Nanosecond => dt.timestamp_nanos_opt(), + }), + DataType::Timestamp(tu, Some(ref tz)) => { + let tz = if tz == "Z" { "UTC" } else { tz }; + let tz = temporal_conversions::parse_offset(tz).unwrap(); + deserialize_datetime(v, &tz, &mut last_fmt_idx).and_then(|dt| match tu { + TimeUnit::Second => Some(dt.timestamp()), + TimeUnit::Millisecond => Some(dt.timestamp_millis()), + TimeUnit::Microsecond => Some(dt.timestamp_micros()), + TimeUnit::Nanosecond => dt.timestamp_nanos_opt(), + }) + } + _ => unreachable!(), + }, + _ => None, + }); + target.extend_trusted_len(iter); +} + +fn deserialize_list_into<'a, O: Offset, A: Borrow>>( + target: &mut MutableListArray>, + rows: &[A], +) { + let empty = vec![]; + let inner: Vec<_> = rows + .iter() + .flat_map(|row| match row.borrow() { + Value::Array(value) => value.iter(), + _ => empty.iter(), + }) + .collect(); + + deserialize_into(target.mut_values(), &inner); + + let lengths = rows.iter().map(|row| match row.borrow() { + Value::Array(value) => Some(value.len()), + _ => None, + }); + + // NOTE(Clark): A bug in Arrow2 will cause the validity mask to be dropped if it's currently None in target, + // which will be the case unless we explicitly initialize the mutable array with a bitmap. + target + .try_extend_from_lengths(lengths) + .expect("Offsets overflow"); +} + +fn deserialize_fixed_size_list_into<'a, A: Borrow>>( + target: &mut MutableFixedSizeListArray>, + rows: &[A], +) { + for row in rows { + match row.borrow() { + Value::Array(value) => { + if value.len() == target.size() { + deserialize_into(target.mut_values(), value); + // Unless alignment is already off, the if above should + // prevent this from ever happening. + target.try_push_valid().expect("unaligned backing array"); + } else { + // TODO(Clark): Return an error instead of dropping incorrectly sized lists. + target.push_null(); + } + } + _ => target.push_null(), + } + } +} + +fn deserialize_struct_into<'a, A: Borrow>>(target: &mut MutableStructArray, rows: &[A]) { + let dtype = target.data_type().clone(); + // Build a map from struct field -> JSON values. + let mut values = match dtype { + DataType::Struct(fields) => fields + .into_iter() + .map(|field| (field.name, vec![])) + .collect::>(), + _ => unreachable!(), + }; + rows.iter().for_each(|row| { + match row.borrow() { + Value::Object(value) => { + values.iter_mut().for_each(|(s, inner)| { + inner.push(value.get(s).unwrap_or(&Value::Null)); + }); + target.push(true); + } + _ => { + values + .iter_mut() + .for_each(|(_, inner)| inner.push(&Value::Null)); + target.push(false); + } + }; + }); + // Then deserialize each field's JSON values buffer to the appropriate Arrow2 array. + // + // Column ordering invariant - this assumes that values and target.mut_values() have aligned columns; + // we can assume this because: + // - target.mut_values() is guaranteed to have the same column ordering as target.data_type().fields, + // - values is an ordered map, whose ordering is tied to target.data_type().fields. 
+ values + .into_values() + .zip(target.mut_values()) + .for_each(|(col_values, col_mut_arr)| deserialize_into(col_mut_arr, col_values.as_slice())); +} diff --git a/src/daft-json/src/inference.rs b/src/daft-json/src/inference.rs new file mode 100644 index 0000000000..4afac92836 --- /dev/null +++ b/src/daft-json/src/inference.rs @@ -0,0 +1,233 @@ +use std::{borrow::Borrow, collections::HashSet}; + +use arrow2::datatypes::{DataType, Field, Metadata, Schema, TimeUnit}; +use arrow2::error::{Error, Result}; +use indexmap::IndexMap; +use json_deserializer::{Number, Value}; + +const ITEM_NAME: &str = "item"; + +/// Infer Arrow2 schema from JSON Value record. +pub(crate) fn infer_records_schema(record: &Value) -> Result { + let fields = match record { + Value::Object(record) => record + .iter() + .map(|(name, value)| { + let data_type = infer(value)?; + + Ok(Field { + name: name.clone(), + data_type, + is_nullable: true, + metadata: Metadata::default(), + }) + }) + .collect::>>(), + _ => Err(Error::ExternalFormat( + "Deserialized JSON value is not an Object record".to_string(), + )), + }?; + + Ok(Schema { + fields, + metadata: Metadata::default(), + }) +} + +/// Infers [`DataType`] from [`Value`]. +fn infer(json: &Value) -> Result { + Ok(match json { + Value::Bool(_) => DataType::Boolean, + Value::Array(array) => infer_array(array)?, + Value::Null => DataType::Null, + Value::Number(number) => infer_number(number), + Value::String(string) => infer_string(string), + Value::Object(inner) => infer_object(inner)?, + }) +} + +fn infer_string(string: &str) -> DataType { + daft_decoding::inference::infer_string(string) +} + +fn infer_object(inner: &IndexMap) -> Result { + let fields = inner + .iter() + .map(|(key, value)| infer(value).map(|dt| (key, dt))) + .map(|maybe_dt| { + let (key, dt) = maybe_dt?; + Ok(Field::new(key, dt, true)) + }) + .collect::>>()?; + Ok(DataType::Struct(fields)) +} + +fn infer_array(values: &[Value]) -> Result { + let types = values + .iter() + .map(infer) + // Deduplicate dtypes. + .collect::>>()?; + + let dt = if !types.is_empty() { + coerce_data_type(types) + } else { + DataType::Null + }; + + // Don't add a record that contains only nulls. + Ok(if dt == DataType::Null { + dt + } else { + DataType::List(Box::new(Field::new(ITEM_NAME, dt, true))) + }) +} + +fn infer_number(n: &Number) -> DataType { + match n { + Number::Float(..) => DataType::Float64, + Number::Integer(..) => DataType::Int64, + } +} + +/// Convert each column's set of infered dtypes to a field with a consolidated dtype, following the coercion rules +/// defined in coerce_data_type. +pub(crate) fn column_types_map_to_fields( + column_types: IndexMap>, +) -> Vec { + column_types + .into_iter() + .map(|(name, dtype_set)| { + // Get consolidated dtype for column. + let dtype = coerce_data_type(dtype_set); + arrow2::datatypes::Field::new(name, dtype, true) + }) + .collect::>() +} + +/// Coerce an heterogeneous set of [`DataType`] into a single one. Rules: +/// * The empty set is coerced to `Null` +/// * `Int64` and `Float64` are `Float64` +/// * Lists and scalars are coerced to a list of a compatible scalar +/// * Structs contain the union of all fields +/// * All other types are coerced to `Utf8` +pub(crate) fn coerce_data_type(mut datatypes: HashSet) -> DataType { + // Drop null dtype from the dtype set. + datatypes.remove(&DataType::Null); + + if datatypes.is_empty() { + return DataType::Null; + } + + // All equal. 
+ if datatypes.len() == 1 { + return datatypes.into_iter().next().unwrap(); + } + + let are_all_structs = datatypes + .iter() + .all(|x| matches!((*x).borrow(), DataType::Struct(_))); + + if are_all_structs { + // All structs => union of all field dtypes (these may have equal names). + let fields = datatypes.into_iter().fold(vec![], |mut acc, dt| { + if let DataType::Struct(new_fields) = dt { + acc.extend(new_fields); + }; + acc + }); + // Group fields by unique names. + let fields = fields.into_iter().fold( + IndexMap::>::new(), + |mut acc, field| { + match acc.entry(field.name) { + indexmap::map::Entry::Occupied(mut v) => { + v.get_mut().insert(field.data_type); + } + indexmap::map::Entry::Vacant(v) => { + let mut a = HashSet::new(); + a.insert(field.data_type); + v.insert(a); + } + } + acc + }, + ); + // Coerce dtype set for each field. + let fields = fields + .into_iter() + .map(|(name, dts)| Field::new(name, coerce_data_type(dts), true)) + .collect(); + return DataType::Struct(fields); + } + datatypes + .into_iter() + .reduce(|lhs, rhs| { + match (lhs, rhs) { + (lhs, rhs) if lhs == rhs => lhs, + (DataType::Utf8, _) | (_, DataType::Utf8) => DataType::Utf8, + (DataType::List(lhs), DataType::List(rhs)) => { + let inner = + coerce_data_type([lhs.data_type().clone(), rhs.data_type().clone()].into()); + DataType::List(Box::new(Field::new(ITEM_NAME, inner, true))) + } + (scalar, DataType::List(list)) | (DataType::List(list), scalar) => { + let inner = coerce_data_type([scalar, list.data_type().clone()].into()); + DataType::List(Box::new(Field::new(ITEM_NAME, inner, true))) + } + (DataType::Float64, DataType::Int64) | (DataType::Int64, DataType::Float64) => { + DataType::Float64 + } + (DataType::Int64, DataType::Boolean) | (DataType::Boolean, DataType::Int64) => { + DataType::Int64 + } + (DataType::Time32(left_tu), DataType::Time32(right_tu)) => { + // Set unified time unit to the lowest granularity time unit. + let unified_tu = if left_tu == right_tu + || time_unit_to_ordinal(&left_tu) < time_unit_to_ordinal(&right_tu) + { + left_tu + } else { + right_tu + }; + DataType::Time32(unified_tu) + } + ( + DataType::Timestamp(left_tu, left_tz), + DataType::Timestamp(right_tu, right_tz), + ) => { + // Set unified time unit to the lowest granularity time unit. + let unified_tu = if left_tu == right_tu + || time_unit_to_ordinal(&left_tu) < time_unit_to_ordinal(&right_tu) + { + left_tu + } else { + right_tu + }; + // Set unified time zone to UTC. 
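+                    // More precisely: keep the zone when both sides agree, fall back to Utf8 when
+                    // mixing naive and zone-aware timestamps, and coerce differing zones to "Z" (UTC).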
+ let unified_tz = match (&left_tz, &right_tz) { + (None, None) => None, + (None, _) | (_, None) => return DataType::Utf8, + (Some(l), Some(r)) if l == r => left_tz, + (Some(_), Some(_)) => Some("Z".to_string()), + }; + DataType::Timestamp(unified_tu, unified_tz) + } + (DataType::Timestamp(_, None), DataType::Date32) + | (DataType::Date32, DataType::Timestamp(_, None)) => { + DataType::Timestamp(TimeUnit::Second, None) + } + (_, _) => DataType::Utf8, + } + }) + .unwrap() +} + +fn time_unit_to_ordinal(tu: &TimeUnit) -> usize { + match tu { + TimeUnit::Second => 0, + TimeUnit::Millisecond => 1, + TimeUnit::Microsecond => 2, + TimeUnit::Nanosecond => 3, + } +} diff --git a/src/daft-json/src/lib.rs b/src/daft-json/src/lib.rs new file mode 100644 index 0000000000..667a81469c --- /dev/null +++ b/src/daft-json/src/lib.rs @@ -0,0 +1,73 @@ +#![feature(async_closure)] +#![feature(let_chains)] +#![feature(trait_alias)] +#![feature(trait_upcasting)] +use common_error::DaftError; +use futures::stream::TryChunksError; +use snafu::Snafu; + +mod decoding; +mod inference; +pub mod options; +#[cfg(feature = "python")] +pub mod python; +pub mod read; +pub mod schema; + +// pub use metadata::read_json_schema_bulk; +pub use options::{JsonConvertOptions, JsonParseOptions, JsonReadOptions}; +#[cfg(feature = "python")] +use pyo3::prelude::*; +pub use read::{read_json, read_json_bulk}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("{source}"))] + IOError { source: daft_io::Error }, + #[snafu(display("{source}"))] + StdIOError { source: std::io::Error }, + #[snafu(display("{source}"))] + ArrowError { source: arrow2::error::Error }, + #[snafu(display("JSON deserialization error: {}", string))] + JsonDeserializationError { string: String }, + #[snafu(display("Error chunking: {}", source))] + ChunkError { + source: TryChunksError, + }, + #[snafu(display("Error joining spawned task: {}", source))] + JoinError { source: tokio::task::JoinError }, + #[snafu(display( + "Sender of OneShot Channel Dropped before sending data over: {}", + source + ))] + OneShotRecvError { + source: tokio::sync::oneshot::error::RecvError, + }, +} + +impl From for DaftError { + fn from(err: Error) -> DaftError { + match err { + Error::IOError { source } => source.into(), + _ => DaftError::External(err.into()), + } + } +} + +impl From for Error { + fn from(err: daft_io::Error) -> Self { + Error::IOError { source: err } + } +} + +type Result = std::result::Result; + +#[cfg(feature = "python")] +pub fn register_modules(_py: Python, parent: &PyModule) -> PyResult<()> { + parent.add_class::()?; + parent.add_class::()?; + parent.add_class::()?; + parent.add_wrapped(wrap_pyfunction!(python::pylib::read_json))?; + parent.add_wrapped(wrap_pyfunction!(python::pylib::read_json_schema))?; + Ok(()) +} diff --git a/src/daft-json/src/options.rs b/src/daft-json/src/options.rs new file mode 100644 index 0000000000..d897c25db2 --- /dev/null +++ b/src/daft-json/src/options.rs @@ -0,0 +1,212 @@ +use daft_core::{impl_bincode_py_state_serialization, schema::SchemaRef}; +use serde::{Deserialize, Serialize}; +#[cfg(feature = "python")] +use { + daft_core::python::schema::PySchema, + pyo3::{ + pyclass, pyclass::CompareOp, pymethods, types::PyBytes, PyObject, PyResult, PyTypeInfo, + Python, ToPyObject, + }, +}; + +/// Options for converting JSON data to Daft data. 
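+///
+/// A minimal, illustrative sketch of the builder-style usage; the limit and column names below
+/// are arbitrary placeholders:
+///
+/// ```ignore
+/// use daft_json::JsonConvertOptions;
+///
+/// // Keep only two columns and stop after 100 rows; the schema is left as None and will be
+/// // inferred from the file.
+/// let opts = JsonConvertOptions::default()
+///     .with_limit(Some(100))
+///     .with_include_columns(Some(vec!["petalWidth".to_string(), "petalLength".to_string()]));
+/// assert_eq!(opts.limit, Some(100));
+/// ```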
+#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "python", pyclass(module = "daft.daft"))] +pub struct JsonConvertOptions { + pub limit: Option, + pub include_columns: Option>, + pub schema: Option, +} + +impl JsonConvertOptions { + pub fn new_internal( + limit: Option, + include_columns: Option>, + schema: Option, + ) -> Self { + Self { + limit, + include_columns, + schema, + } + } + + pub fn with_limit(self, limit: Option) -> Self { + Self { limit, ..self } + } + + pub fn with_include_columns(self, include_columns: Option>) -> Self { + Self { + include_columns, + ..self + } + } + + pub fn with_schema(self, schema: Option) -> Self { + Self { schema, ..self } + } +} + +impl Default for JsonConvertOptions { + fn default() -> Self { + Self::new_internal(None, None, None) + } +} + +#[cfg(feature = "python")] +#[pymethods] +impl JsonConvertOptions { + /// Create conversion options for the JSON reader. + /// + /// # Arguments: + /// + /// * `limit` - Only read this many rows. + /// * `include_columns` - The names of the columns that should be kept, e.g. via a projection. + /// * `schema` - The names and dtypes for the JSON columns. + #[new] + #[pyo3(signature = (limit=None, include_columns=None, schema=None))] + pub fn new( + limit: Option, + include_columns: Option>, + schema: Option, + ) -> Self { + Self::new_internal(limit, include_columns, schema.map(|s| s.into())) + } + + #[getter] + pub fn get_limit(&self) -> PyResult> { + Ok(self.limit) + } + + #[getter] + pub fn get_include_columns(&self) -> PyResult>> { + Ok(self.include_columns.clone()) + } + + #[getter] + pub fn get_schema(&self) -> PyResult> { + Ok(self.schema.as_ref().map(|s| s.clone().into())) + } + + fn __richcmp__(&self, other: &Self, op: CompareOp) -> bool { + match op { + CompareOp::Eq => self == other, + CompareOp::Ne => !self.__richcmp__(other, CompareOp::Eq), + _ => unimplemented!("not implemented"), + } + } + + pub fn __str__(&self) -> PyResult { + Ok(format!("{:?}", self)) + } +} + +impl_bincode_py_state_serialization!(JsonConvertOptions); + +/// Options for parsing JSON files. +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "python", pyclass(module = "daft.daft", get_all))] +pub struct JsonParseOptions {} + +impl JsonParseOptions { + pub fn new_internal() -> Self { + Self {} + } +} + +impl Default for JsonParseOptions { + fn default() -> Self { + Self::new_internal() + } +} + +#[cfg(feature = "python")] +#[pymethods] +impl JsonParseOptions { + /// Create parsing options for the JSON reader. + #[new] + pub fn new() -> PyResult { + Ok(Self::new_internal()) + } + + fn __richcmp__(&self, other: &Self, op: CompareOp) -> bool { + match op { + CompareOp::Eq => self == other, + CompareOp::Ne => !self.__richcmp__(other, CompareOp::Eq), + _ => unimplemented!("not implemented"), + } + } + + pub fn __str__(&self) -> PyResult { + Ok(format!("{:?}", self)) + } +} + +impl_bincode_py_state_serialization!(JsonParseOptions); + +/// Options for reading JSON files. 
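+///
+/// An illustrative sketch; the sizes below are arbitrary placeholders, not recommended defaults:
+///
+/// ```ignore
+/// use daft_json::JsonReadOptions;
+///
+/// // Use a 1 MiB read buffer and a custom chunk size for parallel deserialization.
+/// let opts = JsonReadOptions::default()
+///     .with_buffer_size(Some(1024 * 1024))
+///     .with_chunk_size(Some(128));
+/// assert_eq!(opts.chunk_size, Some(128));
+/// ```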
+#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "python", pyclass(module = "daft.daft", get_all))] +pub struct JsonReadOptions { + pub buffer_size: Option, + pub chunk_size: Option, +} + +impl JsonReadOptions { + pub fn new_internal(buffer_size: Option, chunk_size: Option) -> Self { + Self { + buffer_size, + chunk_size, + } + } + + pub fn with_buffer_size(self, buffer_size: Option) -> Self { + Self { + buffer_size, + chunk_size: self.chunk_size, + } + } + + pub fn with_chunk_size(self, chunk_size: Option) -> Self { + Self { + buffer_size: self.buffer_size, + chunk_size, + } + } +} + +impl Default for JsonReadOptions { + fn default() -> Self { + Self::new_internal(None, None) + } +} + +#[cfg(feature = "python")] +#[pymethods] +impl JsonReadOptions { + /// Create reading options for the JSON reader. + /// + /// # Arguments: + /// + /// * `buffer_size` - Size of the buffer (in bytes) used by the streaming reader. + /// * `chunk_size` - Size of the chunks (in bytes) deserialized in parallel by the streaming reader. + #[new] + #[pyo3(signature = (buffer_size=None, chunk_size=None))] + pub fn new(buffer_size: Option, chunk_size: Option) -> Self { + Self::new_internal(buffer_size, chunk_size) + } + + fn __richcmp__(&self, other: &Self, op: CompareOp) -> bool { + match op { + CompareOp::Eq => self == other, + CompareOp::Ne => !self.__richcmp__(other, CompareOp::Eq), + _ => unimplemented!("not implemented"), + } + } + + pub fn __str__(&self) -> PyResult { + Ok(format!("{:?}", self)) + } +} + +impl_bincode_py_state_serialization!(JsonReadOptions); diff --git a/src/daft-json/src/python.rs b/src/daft-json/src/python.rs new file mode 100644 index 0000000000..139ba4a531 --- /dev/null +++ b/src/daft-json/src/python.rs @@ -0,0 +1,70 @@ +pub mod pylib { + use std::sync::Arc; + + use daft_core::python::schema::PySchema; + use daft_io::{get_io_client, python::IOConfig, IOStatsContext}; + use daft_table::python::PyTable; + use pyo3::{pyfunction, PyResult, Python}; + + use crate::{JsonConvertOptions, JsonParseOptions, JsonReadOptions}; + + #[allow(clippy::too_many_arguments)] + #[pyfunction] + pub fn read_json( + py: Python, + uri: &str, + convert_options: Option, + parse_options: Option, + read_options: Option, + io_config: Option, + multithreaded_io: Option, + max_chunks_in_flight: Option, + ) -> PyResult { + py.allow_threads(|| { + let io_stats = IOStatsContext::new(format!("read_json: for uri {uri}")); + + let io_client = get_io_client( + multithreaded_io.unwrap_or(true), + io_config.unwrap_or_default().config.into(), + )?; + Ok(crate::read::read_json( + uri, + convert_options, + parse_options, + read_options, + io_client, + Some(io_stats), + multithreaded_io.unwrap_or(true), + max_chunks_in_flight, + )? 
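+            // Wrap the resulting Table in its PyTable wrapper for Python.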
+ .into()) + }) + } + + #[pyfunction] + pub fn read_json_schema( + py: Python, + uri: &str, + parse_options: Option, + max_bytes: Option, + io_config: Option, + multithreaded_io: Option, + ) -> PyResult { + py.allow_threads(|| { + let io_stats = IOStatsContext::new(format!("read_json_schema: for uri {uri}")); + + let io_client = get_io_client( + multithreaded_io.unwrap_or(true), + io_config.unwrap_or_default().config.into(), + )?; + let schema = crate::schema::read_json_schema( + uri, + parse_options, + max_bytes, + io_client, + Some(io_stats), + )?; + Ok(Arc::new(schema).into()) + }) + } +} diff --git a/src/daft-json/src/read.rs b/src/daft-json/src/read.rs new file mode 100644 index 0000000000..afbdefa13b --- /dev/null +++ b/src/daft-json/src/read.rs @@ -0,0 +1,1218 @@ +use std::{collections::HashMap, num::NonZeroUsize, sync::Arc}; + +use common_error::{DaftError, DaftResult}; +use daft_core::{schema::Schema, utils::arrow::cast_array_for_daft_if_needed, Series}; +use daft_io::{get_runtime, GetResult, IOClient, IOStatsRef}; +use daft_table::Table; +use futures::{Stream, StreamExt, TryStreamExt}; +use rayon::prelude::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; +use snafu::{ + futures::{try_future::Context, TryFutureExt, TryStreamExt as _}, + ResultExt, +}; +use tokio::{ + fs::File, + io::{AsyncBufRead, AsyncBufReadExt, BufReader}, + task::JoinHandle, +}; +use tokio_util::io::StreamReader; + +use crate::{decoding::deserialize_records, ArrowSnafu, ChunkSnafu}; +use crate::{ + schema::read_json_schema_single, JsonConvertOptions, JsonParseOptions, JsonReadOptions, +}; +use daft_compression::CompressionCodec; + +trait LineChunkStream = Stream>>; +trait ColumnArrayChunkStream = Stream< + Item = super::Result< + Context< + JoinHandle>>>, + super::JoinSnafu, + super::Error, + >, + >, +>; + +#[allow(clippy::too_many_arguments)] +pub fn read_json( + uri: &str, + convert_options: Option, + parse_options: Option, + read_options: Option, + io_client: Arc, + io_stats: Option, + multithreaded_io: bool, + max_chunks_in_flight: Option, +) -> DaftResult { + let runtime_handle = get_runtime(multithreaded_io)?; + let _rt_guard = runtime_handle.enter(); + runtime_handle.block_on(async { + read_json_single_into_table( + uri, + convert_options, + parse_options, + read_options, + io_client, + io_stats, + max_chunks_in_flight, + ) + .await + }) +} + +#[allow(clippy::too_many_arguments)] +pub fn read_json_bulk( + uris: &[&str], + convert_options: Option, + parse_options: Option, + read_options: Option, + io_client: Arc, + io_stats: Option, + multithreaded_io: bool, + max_chunks_in_flight: Option, + num_parallel_tasks: usize, +) -> DaftResult> { + let runtime_handle = get_runtime(multithreaded_io)?; + let _rt_guard = runtime_handle.enter(); + let tables = runtime_handle.block_on(async move { + // Launch a read task per URI, throttling the number of concurrent file reads to num_parallel tasks. 
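+        // Each spawned task reads one file into a Table; `buffered` below preserves input order,
+        // so the collected tables line up with the order of `uris`.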
+ let task_stream = futures::stream::iter(uris.iter().map(|uri| { + let (uri, convert_options, parse_options, read_options, io_client, io_stats) = ( + uri.to_string(), + convert_options.clone(), + parse_options.clone(), + read_options.clone(), + io_client.clone(), + io_stats.clone(), + ); + tokio::task::spawn(async move { + let table = read_json_single_into_table( + uri.as_str(), + convert_options, + parse_options, + read_options, + io_client, + io_stats, + max_chunks_in_flight, + ) + .await?; + DaftResult::Ok(table) + }) + .context(crate::JoinSnafu) + })); + let mut remaining_rows = convert_options + .as_ref() + .and_then(|opts| opts.limit.map(|limit| limit as i64)); + task_stream + // Limit the number of file reads we have in flight at any given time. + .buffered(num_parallel_tasks) + // Terminate the stream if we have already reached the row limit. With the upstream buffering, we will still read up to + // num_parallel_tasks redundant files. + .try_take_while(|result| { + match (result, remaining_rows) { + // Limit has been met, early-teriminate. + (_, Some(rows_left)) if rows_left <= 0 => futures::future::ready(Ok(false)), + // Limit has not yet been met, update remaining limit slack and continue. + (Ok(table), Some(rows_left)) => { + remaining_rows = Some(rows_left - table.len() as i64); + futures::future::ready(Ok(true)) + } + // (1) No limit, never early-terminate. + // (2) Encountered error, propagate error to try_collect to allow it to short-circuit. + (_, None) | (Err(_), _) => futures::future::ready(Ok(true)), + } + }) + .try_collect::>() + .await + })?; + tables.into_iter().collect::>>() +} + +async fn read_json_single_into_table( + uri: &str, + convert_options: Option, + parse_options: Option, + read_options: Option, + io_client: Arc, + io_stats: Option, + max_chunks_in_flight: Option, +) -> DaftResult
{ + let (chunk_stream, schema) = read_json_single_into_stream( + uri, + convert_options.unwrap_or_default(), + parse_options.unwrap_or_default(), + read_options, + io_client, + io_stats, + ) + .await?; + // Default max chunks in flight is set to 2x the number of cores, which should ensure pipelining of reading chunks + // with the parsing of chunks on the rayon threadpool. + let max_chunks_in_flight = max_chunks_in_flight.unwrap_or_else(|| { + std::thread::available_parallelism() + .unwrap_or(NonZeroUsize::new(2).unwrap()) + .checked_mul(2.try_into().unwrap()) + .unwrap() + .try_into() + .unwrap() + }); + // Collect all chunks in chunk x column form. + let chunks = chunk_stream + // Limit the number of chunks we have in flight at any given time. + .try_buffered(max_chunks_in_flight) + .try_collect::>() + .await? + .into_iter() + .collect::>>()?; + // Handle empty table case. + if chunks.is_empty() { + let daft_schema = Arc::new(Schema::try_from(&schema)?); + return Table::empty(Some(daft_schema)); + } + // Transpose chunk x column into column x chunk. + let mut column_arrays = vec![Vec::with_capacity(chunks.len()); chunks[0].len()]; + for chunk in chunks.into_iter() { + for (idx, col) in chunk.into_iter().enumerate() { + column_arrays[idx].push(col); + } + } + // Build table from chunks. + // TODO(Clark): Don't concatenate all chunks from a file into a single table, since MicroPartition is natively chunked. + chunks_to_table(column_arrays, schema) +} + +async fn read_json_single_into_stream( + uri: &str, + convert_options: JsonConvertOptions, + parse_options: JsonParseOptions, + read_options: Option, + io_client: Arc, + io_stats: Option, +) -> DaftResult<( + impl ColumnArrayChunkStream + Send, + arrow2::datatypes::Schema, +)> { + let schema = match convert_options.schema { + Some(schema) => schema.to_arrow()?, + None => read_json_schema_single( + uri, + parse_options.clone(), + // Read at most 1 MiB when doing schema inference. + Some(1024 * 1024), + io_client.clone(), + io_stats.clone(), + ) + .await? + .to_arrow()?, + }; + let (reader, buffer_size, chunk_size): (Box, usize, usize) = + match io_client + .single_url_get(uri.to_string(), None, io_stats) + .await? + { + GetResult::File(file) => { + // Use user-provided buffer size, falling back to 8 * the user-provided chunk size if that exists, otherwise falling back to 512 KiB as the default. + let buffer_size = read_options + .as_ref() + .and_then(|opt| { + opt.buffer_size + .or_else(|| opt.chunk_size.map(|cs| (64 * cs).min(256 * 1024 * 1024))) + }) + .unwrap_or(256 * 1024); + ( + Box::new(BufReader::with_capacity( + buffer_size, + File::open(file.path).await?, + )), + buffer_size, + read_options + .as_ref() + .and_then(|opt| { + opt.chunk_size + .or_else(|| opt.buffer_size.map(|bs| (bs / 64).max(16))) + }) + .unwrap_or(64), + ) + } + GetResult::Stream(stream, _, _) => ( + Box::new(StreamReader::new(stream)), + // Use user-provided buffer size, falling back to 8 * the user-provided chunk size if that exists, otherwise falling back to 512 KiB as the default. + read_options + .as_ref() + .and_then(|opt| { + opt.buffer_size + .or_else(|| opt.chunk_size.map(|cs| (256 * cs).min(256 * 1024 * 1024))) + }) + .unwrap_or(8 * 1024 * 1024), + read_options + .as_ref() + .and_then(|opt| { + opt.chunk_size + .or_else(|| opt.buffer_size.map(|bs| (bs / 256).max(16))) + }) + .unwrap_or(64), + ), + }; + // If file is compressed, wrap stream in decoding stream. 
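+    // The codec is inferred from the URI's file extension (e.g. `.gz`, `.zst`, `.br`);
+    // unrecognized or missing extensions leave the reader uncompressed.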
+ let reader: Box = match CompressionCodec::from_uri(uri) { + Some(compression) => Box::new(tokio::io::BufReader::with_capacity( + buffer_size, + compression.to_decoder(reader), + )), + None => reader, + }; + let read_stream = read_into_line_chunk_stream(reader, convert_options.limit, chunk_size); + let (projected_schema, schema_is_projection) = match convert_options.include_columns { + Some(projection) => { + let mut field_map = schema + .fields + .into_iter() + .map(|f| (f.name.clone(), f)) + .collect::>(); + let projected_fields = projection.into_iter().map(|col| field_map.remove(col.as_str()).ok_or(DaftError::ValueError(format!("Column {} in the projection doesn't exist in the JSON file; existing columns = {:?}", col, field_map.keys())))).collect::>>()?; + ( + arrow2::datatypes::Schema::from(projected_fields).with_metadata(schema.metadata), + true, + ) + } + None => (schema, false), + }; + Ok(( + parse_into_column_array_chunk_stream( + read_stream, + projected_schema.clone().into(), + schema_is_projection, + ), + projected_schema, + )) +} + +fn read_into_line_chunk_stream( + reader: R, + num_rows: Option, + chunk_size: usize, +) -> impl LineChunkStream + Send +where + R: AsyncBufRead + Unpin + Send + 'static, +{ + let num_rows = num_rows.unwrap_or(usize::MAX); + // Stream of unparsed json string record chunks. + let line_stream = tokio_stream::wrappers::LinesStream::new(reader.lines()); + line_stream + .take(num_rows) + .try_chunks(chunk_size) + .context(ChunkSnafu) +} + +fn parse_into_column_array_chunk_stream( + stream: impl LineChunkStream + Send, + schema: Arc, + schema_is_projection: bool, +) -> impl ColumnArrayChunkStream + Send { + // Parsing stream: we spawn background tokio + rayon tasks so we can pipeline chunk parsing with chunk reading, and + // we further parse each chunk column in parallel on the rayon threadpool. + stream.map_ok(move |records| { + let schema = schema.clone(); + tokio::spawn(async move { + let (send, recv) = tokio::sync::oneshot::channel(); + rayon::spawn(move || { + let result = (move || { + // TODO(Clark): Switch to streaming parse + array construction? + let parsed = records + .iter() + .map(|unparsed_record| { + json_deserializer::parse(unparsed_record.as_bytes()).map_err(|e| { + super::Error::JsonDeserializationError { + string: e.to_string(), + } + }) + }) + .collect::>>()?; + let chunk = deserialize_records(parsed, schema.as_ref(), schema_is_projection) + .context(ArrowSnafu)?; + Ok(chunk) + })(); + let _ = send.send(result); + }); + recv.await.context(super::OneShotRecvSnafu {})? + }) + .context(super::JoinSnafu {}) + }) +} + +fn chunks_to_table( + chunks: Vec>>, + schema: arrow2::datatypes::Schema, +) -> DaftResult
{ + // Concatenate column chunks and convert into Daft Series. + // Note that this concatenation is done in parallel on the rayon threadpool. + let columns_series = chunks + .into_par_iter() + .zip(&schema.fields) + .map(|(mut arrays, field)| { + let array = if arrays.len() > 1 { + // Concatenate all array chunks. + let unboxed_arrays = arrays.iter().map(Box::as_ref).collect::>(); + arrow2::compute::concatenate::concatenate(unboxed_arrays.as_slice())? + } else { + // Return single array chunk directly. + arrays.pop().unwrap() + }; + Series::try_from((field.name.as_ref(), cast_array_for_daft_if_needed(array))) + }) + .collect::>>()?; + // Build Daft Table. + let daft_schema = Schema::try_from(&schema)?; + Table::new(daft_schema, columns_series) +} + +#[cfg(test)] +mod tests { + use std::{collections::HashSet, io::BufRead, sync::Arc}; + + use common_error::DaftResult; + + use daft_core::{ + datatypes::{Field, TimeUnit}, + schema::Schema, + utils::arrow::{cast_array_for_daft_if_needed, cast_array_from_daft_if_needed}, + DataType, + }; + use daft_io::{IOClient, IOConfig}; + use daft_table::Table; + use indexmap::IndexMap; + use rstest::rstest; + + use crate::{ + decoding::deserialize_records, + inference::{column_types_map_to_fields, infer_records_schema}, + }; + use crate::{JsonConvertOptions, JsonReadOptions}; + + use super::read_json; + + fn check_equal_local_arrow2( + path: &str, + out: &Table, + limit: Option, + projection: Option>, + ) { + let reader = std::io::BufReader::new(std::fs::File::open(path).unwrap()); + let lines = reader.lines().collect::>(); + let parsed = lines + .iter() + .take(limit.unwrap_or(usize::MAX)) + .map(|record| json_deserializer::parse(record.as_ref().unwrap().as_bytes()).unwrap()) + .collect::>(); + // Get consolidated schema from parsed JSON. + let mut column_types: IndexMap> = + IndexMap::new(); + parsed.iter().for_each(|record| { + let schema = infer_records_schema(record).unwrap(); + for field in schema.fields { + match column_types.entry(field.name) { + indexmap::map::Entry::Occupied(mut v) => { + v.get_mut().insert(field.data_type); + } + indexmap::map::Entry::Vacant(v) => { + let mut a = HashSet::new(); + a.insert(field.data_type); + v.insert(a); + } + } + } + }); + let fields = column_types_map_to_fields(column_types); + let schema: arrow2::datatypes::Schema = fields.into(); + // Apply projection to schema. + let mut field_map = schema + .fields + .iter() + .map(|f| (f.name.clone(), f.clone())) + .collect::>(); + let (schema, is_projection) = match &projection { + Some(projection) => ( + projection + .iter() + .map(|c| field_map.remove(c.as_str()).unwrap()) + .collect::>() + .into(), + true, + ), + None => ( + field_map + .into_values() + .into_iter() + .collect::>() + .into(), + false, + ), + }; + // Deserialize JSON records into Arrow2 column arrays. + let columns = deserialize_records(parsed, &schema, is_projection).unwrap(); + // Roundtrip columns with Daft for casting. + let columns = columns + .into_iter() + .map(|c| cast_array_from_daft_if_needed(cast_array_for_daft_if_needed(c))) + .collect::>(); + // Roundtrip schema with Daft for casting. 
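+        // (This mirrors the casts read_json applies, so the expected schema and arrays match
+        // what the reader actually produces.)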
+ let schema = Schema::try_from(&schema).unwrap().to_arrow().unwrap(); + assert_eq!(out.schema.to_arrow().unwrap(), schema); + let out_columns = (0..out.num_columns()) + .map(|i| out.get_column_by_index(i).unwrap().to_arrow()) + .collect::>(); + assert_eq!(out_columns, columns); + } + + #[rstest] + fn test_json_read_local( + #[values( + // Uncompressed + None, + // brotli + Some("br"), + // bzip2 + Some("bz2"), + // TODO(Clark): Add deflate compressed JSON file to test data fixtures. + // // deflate + // Some("deflate"), + // gzip + Some("gz"), + // lzma + Some("lzma"), + // xz + Some("xz"), + // zlib + Some("zl"), + // zstd + Some("zst"), + )] + compression: Option<&str>, + ) -> DaftResult<()> { + let file = format!( + "{}/test/iris_tiny.jsonl{}", + env!("CARGO_MANIFEST_DIR"), + compression.map_or("".to_string(), |ext| format!(".{}", ext)) + ); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json(file.as_ref(), None, None, None, io_client, None, true, None)?; + assert_eq!(table.len(), 20); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? + .into(), + ); + if compression.is_none() { + check_equal_local_arrow2(file.as_ref(), &table, None, None); + } + + Ok(()) + } + + #[rstest] + fn test_json_read_local_dtypes() -> DaftResult<()> { + let file = format!("{}/test/dtypes.jsonl", env!("CARGO_MANIFEST_DIR"),); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json(file.as_ref(), None, None, None, io_client, None, true, None)?; + assert_eq!(table.len(), 4); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("int", DataType::Int64), + Field::new("float", DataType::Float64), + Field::new("bool", DataType::Boolean), + Field::new("str", DataType::Utf8), + Field::new("null", DataType::Null), + Field::new("date", DataType::Date), + // TODO(Clark): Add coverage for time parsing once we add support for representing time series in Daft. + // // Time unit should be coarest granularity found in file, i.e. seconds. + // Field::new("time", DataType::Time(TimeUnit::Nanoseconds)), + // Time unit should be coarsest granularity found in file, i.e. seconds due to naive date inclusion. + Field::new( + "naive_timestamp", + DataType::Timestamp(TimeUnit::Seconds, None) + ), + // Timezone should be UTC due to field having multiple different timezones across records. + // Time unit should be coarsest granularity found in file, i.e. milliseconds. + Field::new( + "timestamp", + DataType::Timestamp(TimeUnit::Milliseconds, Some("Z".to_string())) + ), + Field::new("list", DataType::List(Box::new(DataType::Int64))), + Field::new( + "obj", + DataType::Struct(vec![ + Field::new("a", DataType::Int64), + Field::new("b", DataType::Boolean) + ]) + ), + Field::new( + "nested_list", + DataType::List(Box::new(DataType::List(Box::new(DataType::Struct(vec![ + Field::new("a", DataType::Utf8), + ]))))) + ), + Field::new( + "nested_obj", + DataType::Struct(vec![ + Field::new( + "obj", + DataType::Struct(vec![Field::new("a", DataType::Int64)]) + ), + Field::new("list", DataType::List(Box::new(DataType::Int64))), + ]) + ), + ])? 
+ .into(), + ); + check_equal_local_arrow2(file.as_ref(), &table, None, None); + + Ok(()) + } + + #[test] + fn test_json_read_local_limit() -> DaftResult<()> { + let file = format!("{}/test/iris_tiny.jsonl", env!("CARGO_MANIFEST_DIR"),); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json( + file.as_ref(), + Some(JsonConvertOptions::default().with_limit(Some(5))), + None, + None, + io_client, + None, + true, + None, + )?; + assert_eq!(table.len(), 5); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? + .into(), + ); + check_equal_local_arrow2(file.as_ref(), &table, Some(5), None); + + Ok(()) + } + + #[test] + fn test_json_read_local_projection() -> DaftResult<()> { + let file = format!("{}/test/iris_tiny.jsonl", env!("CARGO_MANIFEST_DIR"),); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json( + file.as_ref(), + Some( + JsonConvertOptions::default().with_include_columns(Some(vec![ + "petalWidth".to_string(), + "petalLength".to_string(), + ])), + ), + None, + None, + io_client, + None, + true, + None, + )?; + assert_eq!(table.len(), 20); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("petalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + ])? + .into(), + ); + check_equal_local_arrow2( + file.as_ref(), + &table, + None, + Some(vec!["petalWidth".to_string(), "petalLength".to_string()]), + ); + + Ok(()) + } + + #[test] + fn test_json_read_local_larger_than_buffer_size() -> DaftResult<()> { + let file = format!("{}/test/iris_tiny.jsonl", env!("CARGO_MANIFEST_DIR"),); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json( + file.as_ref(), + None, + None, + Some(JsonReadOptions::default().with_buffer_size(Some(128))), + io_client, + None, + true, + None, + )?; + assert_eq!(table.len(), 20); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? + .into(), + ); + check_equal_local_arrow2(file.as_ref(), &table, None, None); + + Ok(()) + } + + #[test] + fn test_json_read_local_larger_than_chunk_size() -> DaftResult<()> { + let file = format!("{}/test/iris_tiny.jsonl", env!("CARGO_MANIFEST_DIR"),); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json( + file.as_ref(), + None, + None, + Some(JsonReadOptions::default().with_chunk_size(Some(5))), + io_client, + None, + true, + None, + )?; + assert_eq!(table.len(), 20); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? 
+ .into(), + ); + check_equal_local_arrow2(file.as_ref(), &table, None, None); + + Ok(()) + } + + #[test] + fn test_json_read_local_throttled_streaming() -> DaftResult<()> { + let file = format!("{}/test/iris_tiny.jsonl", env!("CARGO_MANIFEST_DIR"),); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json( + file.as_ref(), + None, + None, + Some(JsonReadOptions::default().with_chunk_size(Some(5))), + io_client, + None, + true, + Some(2), + )?; + assert_eq!(table.len(), 20); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? + .into(), + ); + check_equal_local_arrow2(file.as_ref(), &table, None, None); + + Ok(()) + } + + #[test] + fn test_json_read_local_nulls() -> DaftResult<()> { + let file = format!("{}/test/iris_tiny_nulls.jsonl", env!("CARGO_MANIFEST_DIR"),); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json(file.as_ref(), None, None, None, io_client, None, true, None)?; + assert_eq!(table.len(), 6); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? + .into(), + ); + check_equal_local_arrow2(file.as_ref(), &table, None, None); + + Ok(()) + } + + #[test] + fn test_json_read_local_all_null_column() -> DaftResult<()> { + let file = format!( + "{}/test/iris_tiny_all_null_column.jsonl", + env!("CARGO_MANIFEST_DIR"), + ); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json(file.as_ref(), None, None, None, io_client, None, true, None)?; + assert_eq!(table.len(), 6); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + // All null column parsed as null dtype. + Field::new("petalLength", DataType::Null), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? + .into(), + ); + let null_column = table.get_column("petalLength")?; + assert_eq!(null_column.data_type(), &DataType::Null); + assert_eq!(null_column.len(), 6); + assert_eq!( + null_column.to_arrow(), + Box::new(arrow2::array::NullArray::new( + arrow2::datatypes::DataType::Null, + 6 + )) as Box + ); + + Ok(()) + } + + #[test] + fn test_json_read_local_all_null_column_with_schema() -> DaftResult<()> { + let file = format!( + "{}/test/iris_tiny_all_null_column.jsonl", + env!("CARGO_MANIFEST_DIR"), + ); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let schema = Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + // Manually specify this column as all-null. 
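+            // (A user-provided schema skips inference entirely, so no rows are scanned to
+            // discover this dtype.)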
+ Field::new("petalLength", DataType::Null), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])?; + let table = read_json( + file.as_ref(), + Some(JsonConvertOptions::default().with_schema(Some(schema.into()))), + None, + None, + io_client, + None, + true, + None, + )?; + assert_eq!(table.len(), 6); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + // All null column parsed as null dtype. + Field::new("petalLength", DataType::Null), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? + .into(), + ); + let null_column = table.get_column("petalLength")?; + assert_eq!(null_column.data_type(), &DataType::Null); + assert_eq!(null_column.len(), 6); + assert_eq!( + null_column.to_arrow(), + Box::new(arrow2::array::NullArray::new( + arrow2::datatypes::DataType::Null, + 6 + )) as Box + ); + + Ok(()) + } + + #[test] + fn test_json_read_local_all_null_column_with_schema_well_typed() -> DaftResult<()> { + let file = format!( + "{}/test/iris_tiny_all_null_column.jsonl", + env!("CARGO_MANIFEST_DIR"), + ); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + let schema = Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + // Provide a manual type for the all-null column. + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])?; + + let table = read_json( + file.as_ref(), + Some(JsonConvertOptions::default().with_schema(Some(schema.into()))), + None, + None, + io_client, + None, + true, + None, + )?; + assert_eq!(table.len(), 6); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + // All null column should have hte provided dtype. + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? + .into(), + ); + let null_column = table.get_column("petalLength")?; + assert_eq!(null_column.data_type(), &DataType::Float64); + assert_eq!(null_column.len(), 6); + assert_eq!(null_column.to_arrow().null_count(), 6); + + Ok(()) + } + + #[test] + fn test_json_read_local_wrong_type_yields_nulls() -> DaftResult<()> { + let file = format!("{}/test/iris_tiny.jsonl", env!("CARGO_MANIFEST_DIR"),); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let schema = Schema::new(vec![ + // Conversion to all of these types should fail, resulting in nulls. + Field::new("sepalLength", DataType::Boolean), + Field::new("sepalWidth", DataType::Boolean), + Field::new("petalLength", DataType::Boolean), + Field::new("petalWidth", DataType::Boolean), + Field::new("species", DataType::Int64), + ])?; + let table = read_json( + file.as_ref(), + Some(JsonConvertOptions::default().with_schema(Some(schema.into()))), + None, + None, + io_client, + None, + true, + None, + )?; + let num_rows = table.len(); + assert_eq!(num_rows, 20); + // Check that all columns are all null. 
+ for idx in 0..table.num_columns() { + let column = table.get_column_by_index(idx)?; + assert_eq!(column.to_arrow().null_count(), num_rows); + } + + Ok(()) + } + + #[rstest] + fn test_json_read_s3( + #[values( + // Uncompressed + None, + // brotli + Some("br"), + // bzip2 + Some("bz2"), + // TODO(Clark): Add deflate compressed JSON file to test data fixtures. + // // deflate + // Some("deflate"), + // gzip + Some("gz"), + // lzma + Some("lzma"), + // xz + Some("xz"), + // zlib + Some("zl"), + // zstd + Some("zst"), + )] + compression: Option<&str>, + ) -> DaftResult<()> { + let file = format!( + "s3://daft-public-data/test_fixtures/json-dev/iris_tiny.jsonl{}", + compression.map_or("".to_string(), |ext| format!(".{}", ext)) + ); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json(file.as_ref(), None, None, None, io_client, None, true, None)?; + assert_eq!(table.len(), 20); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? + .into(), + ); + + Ok(()) + } + + #[test] + fn test_json_read_s3_limit() -> DaftResult<()> { + let file = "s3://daft-public-data/test_fixtures/json-dev/iris_tiny.jsonl"; + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json( + file.as_ref(), + Some(JsonConvertOptions::default().with_limit(Some(5))), + None, + None, + io_client, + None, + true, + None, + )?; + assert_eq!(table.len(), 5); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? + .into(), + ); + + Ok(()) + } + + #[test] + fn test_json_read_s3_projection() -> DaftResult<()> { + let file = "s3://daft-public-data/test_fixtures/json-dev/iris_tiny.jsonl"; + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json( + file.as_ref(), + Some( + JsonConvertOptions::default().with_include_columns(Some(vec![ + "petalWidth".to_string(), + "petalLength".to_string(), + ])), + ), + None, + None, + io_client, + None, + true, + None, + )?; + assert_eq!(table.len(), 20); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("petalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + ])? 
+ .into(), + ); + + Ok(()) + } + + #[test] + fn test_json_read_s3_larger_than_buffer_size() -> DaftResult<()> { + let file = "s3://daft-public-data/test_fixtures/json-dev/iris_tiny.jsonl"; + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json( + file.as_ref(), + None, + None, + Some(JsonReadOptions::default().with_buffer_size(Some(128))), + io_client, + None, + true, + None, + )?; + assert_eq!(table.len(), 20); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? + .into(), + ); + + Ok(()) + } + + #[test] + fn test_json_read_s3_larger_than_chunk_size() -> DaftResult<()> { + let file = "s3://daft-public-data/test_fixtures/json-dev/iris_tiny.jsonl"; + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json( + file.as_ref(), + None, + None, + Some(JsonReadOptions::default().with_chunk_size(Some(5))), + io_client, + None, + true, + None, + )?; + assert_eq!(table.len(), 20); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? + .into(), + ); + + Ok(()) + } + + #[test] + fn test_json_read_s3_throttled_streaming() -> DaftResult<()> { + let file = "s3://daft-public-data/test_fixtures/json-dev/iris_tiny.jsonl"; + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let table = read_json( + file.as_ref(), + None, + None, + Some(JsonReadOptions::default().with_chunk_size(Some(5))), + io_client, + None, + true, + Some(2), + )?; + assert_eq!(table.len(), 20); + assert_eq!( + table.schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? 
+ .into(), + ); + + Ok(()) + } +} diff --git a/src/daft-json/src/schema.rs b/src/daft-json/src/schema.rs new file mode 100644 index 0000000000..4b0c1c4c75 --- /dev/null +++ b/src/daft-json/src/schema.rs @@ -0,0 +1,449 @@ +use std::{collections::HashSet, sync::Arc}; + +use common_error::DaftResult; +use daft_core::schema::Schema; +use daft_io::{get_runtime, GetResult, IOClient, IOStatsRef}; +use futures::{StreamExt, TryStreamExt}; +use indexmap::IndexMap; +use json_deserializer::parse; +use snafu::ResultExt; +use tokio::{ + fs::File, + io::{AsyncBufRead, AsyncBufReadExt, BufReader}, +}; +use tokio_util::io::StreamReader; + +use crate::{ + inference::{column_types_map_to_fields, infer_records_schema}, + ArrowSnafu, JsonParseOptions, StdIOSnafu, +}; +use daft_compression::CompressionCodec; + +#[derive(Debug, Clone)] +pub struct JsonReadStats { + pub total_bytes_read: usize, + pub total_records_read: usize, + pub mean_record_size_bytes: f64, + pub stddev_record_size_bytes: f64, +} + +impl JsonReadStats { + pub fn new( + total_bytes_read: usize, + total_records_read: usize, + mean_record_size_bytes: f64, + stddev_record_size_bytes: f64, + ) -> Self { + Self { + total_bytes_read, + total_records_read, + mean_record_size_bytes, + stddev_record_size_bytes, + } + } +} + +impl Default for JsonReadStats { + fn default() -> Self { + Self::new(0, 0, 0f64, 0f64) + } +} + +pub fn read_json_schema( + uri: &str, + parse_options: Option, + max_bytes: Option, + io_client: Arc, + io_stats: Option, +) -> DaftResult { + let runtime_handle = get_runtime(true)?; + let _rt_guard = runtime_handle.enter(); + runtime_handle.block_on(async { + read_json_schema_single( + uri, + parse_options.unwrap_or_default(), + // Default to 1 MiB. + max_bytes.or(Some(1024 * 1024)), + io_client, + io_stats, + ) + .await + }) +} + +pub async fn read_json_schema_bulk( + uris: &[&str], + parse_options: Option, + max_bytes: Option, + io_client: Arc, + io_stats: Option, + num_parallel_tasks: usize, +) -> DaftResult> { + let runtime_handle = get_runtime(true)?; + let _rt_guard = runtime_handle.enter(); + let result = runtime_handle + .block_on(async { + let task_stream = futures::stream::iter(uris.iter().map(|uri| { + let owned_string = uri.to_string(); + let owned_client = io_client.clone(); + let owned_io_stats = io_stats.clone(); + let owned_parse_options = parse_options.clone(); + tokio::spawn(async move { + read_json_schema_single( + &owned_string, + owned_parse_options.unwrap_or_default(), + max_bytes, + owned_client, + owned_io_stats, + ) + .await + }) + })); + task_stream + .buffered(num_parallel_tasks) + .try_collect::>() + .await + }) + .context(super::JoinSnafu {})?; + result.into_iter().collect::>>() +} + +pub(crate) async fn read_json_schema_single( + uri: &str, + _: JsonParseOptions, + max_bytes: Option, + io_client: Arc, + io_stats: Option, +) -> DaftResult { + let (reader, max_bytes): (Box, Option) = match io_client + .single_url_get(uri.to_string(), None, io_stats) + .await? + { + GetResult::File(file) => ( + Box::new(BufReader::new(File::open(file.path).await?)), + max_bytes, + ), + GetResult::Stream(stream, size, _) => ( + Box::new(StreamReader::new(stream)), + // Truncate max_bytes to size if both are set. + max_bytes.map(|m| size.map(|s| m.min(s)).unwrap_or(m)), + ), + }; + // If file is compressed, wrap stream in decoding stream. 
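+    // As in the full reader, the codec is inferred from the URI's file extension; plain files
+    // pass through unwrapped.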
+ let reader: Box = match CompressionCodec::from_uri(uri) { + Some(compression) => Box::new(tokio::io::BufReader::new(compression.to_decoder(reader))), + None => reader, + }; + let arrow_schema = infer_schema(reader, None, max_bytes).await?; + let schema = Schema::try_from(&arrow_schema)?; + Ok(schema) +} + +async fn infer_schema( + reader: R, + max_rows: Option, + max_bytes: Option, +) -> DaftResult +where + R: tokio::io::AsyncBufRead + Unpin + Send, +{ + let max_records = max_rows.unwrap_or(usize::MAX); + let max_bytes = max_bytes.unwrap_or(usize::MAX); + let mut total_bytes = 0; + // Stream of unparsed JSON string records. + let line_stream = tokio_stream::wrappers::LinesStream::new(reader.lines()); + let mut schema_stream = line_stream + .try_take_while(|record| { + // Terminate scan if we've exceeded our max_bytes threshold with the last-read line. + if total_bytes >= max_bytes { + futures::future::ready(Ok(false)) + } else { + total_bytes += record.len(); + futures::future::ready(Ok(true)) + } + }) + .take(max_records) + .map(|record| { + let record = record.context(StdIOSnafu)?; + + // Parse record into a JSON Value, then infer the schema. + let parsed_record = + parse(record.as_bytes()).map_err(|e| super::Error::JsonDeserializationError { + string: e.to_string(), + })?; + infer_records_schema(&parsed_record).context(ArrowSnafu) + }); + // Collect all infered dtypes for each column. + let mut column_types: IndexMap> = IndexMap::new(); + while let Some(schema) = schema_stream.next().await.transpose()? { + for field in schema.fields { + // Get-and-mutate-or-insert. + match column_types.entry(field.name) { + indexmap::map::Entry::Occupied(mut v) => { + v.get_mut().insert(field.data_type); + } + indexmap::map::Entry::Vacant(v) => { + let mut a = HashSet::new(); + a.insert(field.data_type); + v.insert(a); + } + } + } + } + // Convert column types map to dtype-consolidated column fields. + let fields = column_types_map_to_fields(column_types); + Ok(fields.into()) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use common_error::DaftResult; + use daft_core::{ + datatypes::{Field, TimeUnit}, + schema::Schema, + DataType, + }; + use daft_io::{IOClient, IOConfig}; + use rstest::rstest; + + use super::read_json_schema; + + #[rstest] + fn test_json_schema_local( + #[values( + // Uncompressed + None, + // brotli + Some("br"), + // bzip2 + Some("bz2"), + // TODO(Clark): Add deflate compressed JSON file to test data fixtures. 
+ // // deflate + // Some("deflate"), + // gzip + Some("gz"), + // lzma + Some("lzma"), + // xz + Some("xz"), + // zlib + Some("zl"), + // zstd + Some("zst"), + )] + compression: Option<&str>, + ) -> DaftResult<()> { + let file = format!( + "{}/test/iris_tiny.jsonl{}", + env!("CARGO_MANIFEST_DIR"), + compression.map_or("".to_string(), |ext| format!(".{}", ext)) + ); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let schema = read_json_schema(file.as_ref(), None, None, io_client.clone(), None)?; + assert_eq!( + schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])?, + ); + + Ok(()) + } + + #[rstest] + fn test_json_schema_local_dtypes() -> DaftResult<()> { + let file = format!("{}/test/dtypes.jsonl", env!("CARGO_MANIFEST_DIR"),); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let schema = read_json_schema(file.as_ref(), None, None, io_client, None)?; + assert_eq!( + schema, + Schema::new(vec![ + Field::new("int", DataType::Int64), + Field::new("float", DataType::Float64), + Field::new("bool", DataType::Boolean), + Field::new("str", DataType::Utf8), + Field::new("null", DataType::Null), + Field::new("date", DataType::Date), + // TODO(Clark): Add coverage for time parsing once we add support for representing time series in Daft. + // // Time unit should be coarest granularity found in file, i.e. seconds. + // Field::new("time", DataType::Time(TimeUnit::Nanoseconds)), + // Time unit should be coarsest granularity found in file, i.e. seconds due to naive date inclusion. + Field::new( + "naive_timestamp", + DataType::Timestamp(TimeUnit::Seconds, None) + ), + // Timezone should be UTC due to field having multiple different timezones across records. + // Time unit should be coarsest granularity found in file, i.e. milliseconds. + Field::new( + "timestamp", + DataType::Timestamp(TimeUnit::Milliseconds, Some("Z".to_string())) + ), + Field::new("list", DataType::List(Box::new(DataType::Int64))), + Field::new( + "obj", + DataType::Struct(vec![ + Field::new("a", DataType::Int64), + Field::new("b", DataType::Boolean) + ]) + ), + Field::new( + "nested_list", + DataType::List(Box::new(DataType::List(Box::new(DataType::Struct(vec![ + Field::new("a", DataType::Utf8), + ]))))) + ), + Field::new( + "nested_obj", + DataType::Struct(vec![ + Field::new( + "obj", + DataType::Struct(vec![Field::new("a", DataType::Int64)]) + ), + Field::new("list", DataType::List(Box::new(DataType::Int64))), + ]) + ), + ])? 
+ .into(), + ); + + Ok(()) + } + + #[test] + fn test_json_schema_local_nulls() -> DaftResult<()> { + let file = format!("{}/test/iris_tiny_nulls.jsonl", env!("CARGO_MANIFEST_DIR"),); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let schema = read_json_schema(file.as_ref(), None, None, io_client.clone(), None)?; + assert_eq!( + schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])?, + ); + + Ok(()) + } + + #[test] + fn test_json_schema_local_conflicting_types_utf8_fallback() -> DaftResult<()> { + let file = format!( + "{}/test/iris_tiny_conflicting_dtypes.jsonl", + env!("CARGO_MANIFEST_DIR"), + ); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let schema = read_json_schema(file.as_ref(), None, None, io_client.clone(), None)?; + assert_eq!( + schema, + Schema::new(vec![ + // All conflicting dtypes fall back to UTF8. + Field::new("sepalLength", DataType::Utf8), + Field::new("sepalWidth", DataType::Utf8), + Field::new("petalLength", DataType::Utf8), + // Float + Int => Float, non-conflicting + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])?, + ); + + Ok(()) + } + + #[test] + fn test_json_schema_local_max_bytes() -> DaftResult<()> { + let file = format!("{}/test/iris_tiny.jsonl", env!("CARGO_MANIFEST_DIR"),); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let schema = read_json_schema(file.as_ref(), None, Some(100), io_client.clone(), None)?; + assert_eq!( + schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])?, + ); + + Ok(()) + } + + #[rstest] + fn test_json_schema_s3( + #[values( + // Uncompressed + None, + // brotli + Some("br"), + // bzip2 + Some("bz2"), + // TODO(Clark): Add deflate compressed JSON file to test data fixtures. + // deflate + // Some("deflate"), + // gzip + Some("gz"), + // lzma + Some("lzma"), + // xz + Some("xz"), + // zlib + Some("zl"), + // zstd + Some("zst"), + )] + compression: Option<&str>, + ) -> DaftResult<()> { + let file = format!( + "s3://daft-public-data/test_fixtures/json-dev/iris_tiny.jsonl{}", + compression.map_or("".to_string(), |ext| format!(".{}", ext)) + ); + + let mut io_config = IOConfig::default(); + io_config.s3.anonymous = true; + let io_client = Arc::new(IOClient::new(io_config.into())?); + + let schema = read_json_schema(file.as_ref(), None, None, io_client.clone(), None)?; + assert_eq!( + schema, + Schema::new(vec![ + Field::new("sepalLength", DataType::Float64), + Field::new("sepalWidth", DataType::Float64), + Field::new("petalLength", DataType::Float64), + Field::new("petalWidth", DataType::Float64), + Field::new("species", DataType::Utf8), + ])? 
+ ); + + Ok(()) + } +} diff --git a/src/daft-json/test/dtypes.jsonl b/src/daft-json/test/dtypes.jsonl new file mode 100644 index 0000000000..3986d97356 --- /dev/null +++ b/src/daft-json/test/dtypes.jsonl @@ -0,0 +1,4 @@ +{"int": 1, "float": 2.3, "bool": false, "str": "foo", "null": null, "date": "2023-11-29", "naive_timestamp": "2023-11-29T06:31:52.342", "timestamp": "2023-11-29T06:31:52.342567Z", "list": [1, 2, 3], "obj": {"a": 1, "b": false}, "nested_list": [[{"a": "foo"}]], "nested_obj": {"obj": {"a": 4}, "list": [1, 2]}} +{"int": 2, "float": 3.3, "bool": true, "str": "bar", "null": null, "date": "2023/11/28", "naive_timestamp": "2023-11-29", "timestamp": "2023-11-29T06:31:52.342+07:00", "list": [4, 5], "obj": {"a": 2, "b": false}, "nested_list": [[{"a": "bar"}]], "nested_obj": {"obj": {"a": 6}, "list": [3, 4]}} +{"int": null, "float": null, "bool": null, "str": null, "null": null, "date": null, "naive_timestamp": null, "timestamp": null, "list": null, "obj": null, "nested_list": null, "nested_obj": null} +{"int": 3, "float": 4.3, "bool": false, "str": "baz", "null": null, "date": "2023-11-27", "naive_timestamp": "2023-11-29 06:31:52.342567", "timestamp": "2023-11-29 06:31:52.342-07:00", "list": [6, 7, null, 9], "obj": {"a": null, "b": false}, "nested_list": [[{"a": null}]], "nested_obj": {"obj": {"a": null}, "list": [5, null]}} diff --git a/src/daft-json/test/iris_tiny.jsonl b/src/daft-json/test/iris_tiny.jsonl new file mode 100644 index 0000000000..99926cc27d --- /dev/null +++ b/src/daft-json/test/iris_tiny.jsonl @@ -0,0 +1,20 @@ +{"sepalLength": 5.1, "sepalWidth": 3.5, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 4.9, "sepalWidth": 3.0, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 4.7, "sepalWidth": 3.2, "petalLength": 1.3, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 4.6, "sepalWidth": 3.1, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 5.0, "sepalWidth": 3.6, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 5.4, "sepalWidth": 3.9, "petalLength": 1.7, "petalWidth": 0.4, "species": "setosa"} +{"sepalLength": 4.6, "sepalWidth": 3.4, "petalLength": 1.4, "petalWidth": 0.3, "species": "setosa"} +{"sepalLength": 5.0, "sepalWidth": 3.4, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 4.4, "sepalWidth": 2.9, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 4.9, "sepalWidth": 3.1, "petalLength": 1.5, "petalWidth": 0.1, "species": "setosa"} +{"sepalLength": 5.4, "sepalWidth": 3.7, "petalLength": 1.5, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 4.8, "sepalWidth": 3.4, "petalLength": 1.6, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 4.8, "sepalWidth": 3.0, "petalLength": 1.4, "petalWidth": 0.1, "species": "setosa"} +{"sepalLength": 4.3, "sepalWidth": 3.0, "petalLength": 1.1, "petalWidth": 0.1, "species": "setosa"} +{"sepalLength": 5.8, "sepalWidth": 4.0, "petalLength": 1.2, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 5.7, "sepalWidth": 4.4, "petalLength": 1.5, "petalWidth": 0.4, "species": "setosa"} +{"sepalLength": 5.4, "sepalWidth": 3.9, "petalLength": 1.3, "petalWidth": 0.4, "species": "setosa"} +{"sepalLength": 5.1, "sepalWidth": 3.5, "petalLength": 1.4, "petalWidth": 0.3, "species": "setosa"} +{"sepalLength": 5.7, "sepalWidth": 3.8, "petalLength": 1.7, "petalWidth": 0.3, "species": "setosa"} +{"sepalLength": 5.1, "sepalWidth": 3.8, "petalLength": 1.5, "petalWidth": 
0.3, "species": "setosa"} diff --git a/src/daft-json/test/iris_tiny.jsonl.br b/src/daft-json/test/iris_tiny.jsonl.br new file mode 100644 index 0000000000000000000000000000000000000000..9bf69cb456f7a4633608a4a164966c9d8eb3f86b GIT binary patch literal 176 zcmV;h08jt1a6SNMRbU${G(IWb5();=Gyl^PL}3ryaH&5e(IMV#Y<+j{w}nm+BxvdM zg-ew2kbe{YX8GIh=HIwP+7kL=ZJh?jF@*ccXoKjr? literal 0 HcmV?d00001 diff --git a/src/daft-json/test/iris_tiny.jsonl.bz2 b/src/daft-json/test/iris_tiny.jsonl.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..1fa7f678d7cf43fe915d9857b12cd09918682b60 GIT binary patch literal 228 zcmV35k;dFaQN9NFV@c28ZgJqj^_}FqjkcpCLGjq7YSA=uz@*)tlcA6MN$Ig4pu4 ztj;kkEfje;^w7^@#o&3onQmaUMv1uOQIQxC9cNN9gj)zTDxKBUmQq$WJ0QmLTb!3^ zGBB7~m5{_`Fl8{Dh`EIFCW)e9Fr3NyCdB#^fSZZ9O^N&`5S+yPC%8_bH$ z+`BFsv>;k&cg4FKQwow9aY(YvkmND{XMUD)vSWWg+Q)Uim4`wLxi8k*X}I`OEwn3* zcK+HX1#Go#TP4)z=>o!r`8GC!xjeC?Ki0iwfN6o$bJNicLgLwjKY zhRLgy_J;#TM=%=9fE2BO*GHYA6~##K1F%+#)GGIUFyEC-@w~~cRm=0kWM(k4gsT;- zFYrG@ruc#S0Ft5NwbD2ffjb}EIcH}Hk-L0G{2#^765{`8d6~#(5PeV#!jB(}H=Oj3 Iw9f|s06-FX=>Px# literal 0 HcmV?d00001 diff --git a/src/daft-json/test/iris_tiny.jsonl.lzma b/src/daft-json/test/iris_tiny.jsonl.lzma new file mode 100644 index 0000000000000000000000000000000000000000..33b43e5c79aa79fd6835d3cfe3c68c1505b2c696 GIT binary patch literal 202 zcmV;*05$(z004jh|NsC0|NsC006mC`W|Tp$SS!6#zlxScM|mfVu8}K-a{kly5!5O+ z)PWC@8z=cM+;Pl`2AX?l!oV~Ui|;dZJ+}z&${;$h55lu2%reAAi?nzj2CtsyMZ*HT z>XEY=`8JBsSqCvoV!tO&f|ADCYg;k?M=n*L1`?BGkivrt?2Tk- E%AG}GH2?qr literal 0 HcmV?d00001 diff --git a/src/daft-json/test/iris_tiny.jsonl.xz b/src/daft-json/test/iris_tiny.jsonl.xz new file mode 100644 index 0000000000000000000000000000000000000000..43a5951106efdaad27ac8dd4b41ddb0a05915f72 GIT binary patch literal 248 zcmVw3Zie{8Su2?I*Q@@Iq zMMrrjjINO@hI0PX_7T)7Hq?O+k{c)aFWhm=iUyi{Xu`lW5sU9LbUn8S@5&%Lun)qs zC(JU$MvJs~9|o_U=S9N;yy}s&8u>Pg&{+pDOJct#PJ)ug+G|@e{zooVp9T_>WRSv% zurcolPtkX5l(1PhMJBtCf*gDVL#^oYzh4razJouF*UR=BsfrzqFFakDoISUDASwXG yT%v}et#_$JWZl0s)&KwthAr*QooY1z0n-7`4*&pxD(z3P#Ao{g000001X)__9c*3z literal 0 HcmV?d00001 diff --git a/src/daft-json/test/iris_tiny.jsonl.zl b/src/daft-json/test/iris_tiny.jsonl.zl new file mode 100644 index 0000000000000000000000000000000000000000..0c4c91f92dc434aa1d7aef8ae5e9a604d44f3692 GIT binary patch literal 230 zcmV$+`BFsv>;k&cg4FKQwow9aY(YvkmND{ zXMUD)vSWWg+Q)Uim4`wLxi8k*X}I`OEwn3*cK+HX1#Go#TP4)z=>o!r`8GC!xj zeC?Ki0iwfN6o$bJNicLgLwjKYhRLgy_J;#TM=%=9fE2BO*GHYA6~##K z1F%+#)GGIUFyEC-@w~~cRm=0kWM(k4gsT;-FYrG@ruc#S0Ft5NwbD2ffjb}EIcH}H gk-L0G{2#^765{`8d6~#(5PeV#!jB(}HzcKC^1GCAUH||9 literal 0 HcmV?d00001 diff --git a/src/daft-json/test/iris_tiny.jsonl.zst b/src/daft-json/test/iris_tiny.jsonl.zst new file mode 100644 index 0000000000000000000000000000000000000000..2ecfeb5fde2b0754aaf697b9f7833eb1816d4fc4 GIT binary patch literal 228 zcmVpmdEh!hi45^!W`vp_&qXSq{pn z4N%c7k|GmcE^!f30iVqa5#+zLkVWuS>Dtr4roAoH8uff1$ZT+YapO4T8AOiXDW37( e3MBs70t#v1J7Yn{{WvKL)&5VP?^gkNaC_vWfNtOb literal 0 HcmV?d00001 diff --git a/src/daft-json/test/iris_tiny_all_null_column.jsonl b/src/daft-json/test/iris_tiny_all_null_column.jsonl new file mode 100644 index 0000000000..3e73b3ef6a --- /dev/null +++ b/src/daft-json/test/iris_tiny_all_null_column.jsonl @@ -0,0 +1,6 @@ +{"sepalLength": 5.1, "sepalWidth": 3.5, "petalLength": null, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 4.9, "sepalWidth": 3.0, "petalLength": null, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 4.7, "sepalWidth": 3.2, "petalLength": null, "petalWidth": 
0.2, "species": "setosa"} +{"sepalLength": 4.6, "sepalWidth": 3.1, "petalLength": null, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 5.0, "sepalWidth": 3.6, "petalLength": null, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 5.4, "sepalWidth": 3.9, "petalLength": null, "petalWidth": 0.4, "species": "setosa"} diff --git a/src/daft-json/test/iris_tiny_conflicting_dtypes.jsonl b/src/daft-json/test/iris_tiny_conflicting_dtypes.jsonl new file mode 100644 index 0000000000..351361be62 --- /dev/null +++ b/src/daft-json/test/iris_tiny_conflicting_dtypes.jsonl @@ -0,0 +1,2 @@ +{"sepalLength": 5.1, "sepalWidth": false, "petalLength": 3, "petalWidth": 3, "species": "setosa"} +{"sepalLength": "foo", "sepalWidth": 3.0, "petalLength": "bar", "petalWidth": 0.2, "species": false} diff --git a/src/daft-json/test/iris_tiny_nulls.jsonl b/src/daft-json/test/iris_tiny_nulls.jsonl new file mode 100644 index 0000000000..71dd3eb231 --- /dev/null +++ b/src/daft-json/test/iris_tiny_nulls.jsonl @@ -0,0 +1,6 @@ +{"sepalLength": null, "sepalWidth": 3.5, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 4.9, "sepalWidth": null, "petalLength": 1.4, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 4.7, "sepalWidth": 3.2, "petalLength": null, "petalWidth": 0.2, "species": "setosa"} +{"sepalLength": 4.6, "sepalWidth": 3.1, "petalLength": 1.5, "petalWidth": null, "species": "setosa"} +{"sepalLength": 5.0, "sepalWidth": 3.6, "petalLength": 1.4, "petalWidth": 0.2, "species": null} +{"sepalLength": null, "sepalWidth": null, "petalLength": null, "petalWidth": null, "species": null} diff --git a/src/daft-micropartition/Cargo.toml b/src/daft-micropartition/Cargo.toml index f74c46a53c..5c7e026dc2 100644 --- a/src/daft-micropartition/Cargo.toml +++ b/src/daft-micropartition/Cargo.toml @@ -6,6 +6,7 @@ daft-core = {path = "../daft-core", default-features = false} daft-csv = {path = "../daft-csv", default-features = false} daft-dsl = {path = "../daft-dsl", default-features = false} daft-io = {path = "../daft-io", default-features = false} +daft-json = {path = "../daft-json", default-features = false} daft-parquet = {path = "../daft-parquet", default-features = false} daft-scan = {path = "../daft-scan", default-features = false} daft-stats = {path = "../daft-stats", default-features = false} diff --git a/src/daft-micropartition/src/micropartition.rs b/src/daft-micropartition/src/micropartition.rs index b272023d46..c662bcd6f7 100644 --- a/src/daft-micropartition/src/micropartition.rs +++ b/src/daft-micropartition/src/micropartition.rs @@ -8,6 +8,7 @@ use common_error::DaftResult; use daft_core::schema::{Schema, SchemaRef}; use daft_csv::{CsvConvertOptions, CsvParseOptions, CsvReadOptions}; +use daft_json::{JsonConvertOptions, JsonParseOptions, JsonReadOptions}; use daft_parquet::read::{ read_parquet_bulk, read_parquet_metadata_bulk, ParquetSchemaInferenceOptions, }; @@ -131,7 +132,7 @@ fn materialize_scan_task( // **************** // Native CSV Reads // **************** - FileFormatConfig::Csv(cfg @ CsvSourceConfig { .. 
}) => { + FileFormatConfig::Csv(cfg) => { let col_names = if !cfg.has_headers { Some( cast_to_schema @@ -182,8 +183,30 @@ fn materialize_scan_task( // **************** // Native JSON Reads // **************** - FileFormatConfig::Json(_) => { - todo!("TODO: Implement MicroPartition native reads for JSON."); + FileFormatConfig::Json(cfg) => { + let convert_options = JsonConvertOptions::new_internal( + scan_task.pushdowns.limit, + column_names + .as_ref() + .map(|cols| cols.iter().map(|col| col.to_string()).collect()), + None, + ); + let parse_options = JsonParseOptions::new_internal(); + let read_options = + JsonReadOptions::new_internal(cfg.buffer_size, cfg.chunk_size); + let uris = urls.collect::<Vec<_>>(); + daft_json::read_json_bulk( + uris.as_slice(), + Some(convert_options), + Some(parse_options), + Some(read_options), + io_client, + io_stats, + native_storage_config.multithreaded_io, + None, + 8, + ) + .context(DaftCoreComputeSnafu)? + } } } @@ -628,6 +651,55 @@ pub(crate) fn read_csv_into_micropartition( } } +pub(crate) fn read_json_into_micropartition( + uris: &[&str], + convert_options: Option<JsonConvertOptions>, + parse_options: Option<JsonParseOptions>, + read_options: Option<JsonReadOptions>, + io_config: Arc<IOConfig>, + multithreaded_io: bool, + io_stats: Option<IOStatsRef>, +) -> DaftResult<MicroPartition> { + let io_client = daft_io::get_io_client(multithreaded_io, io_config.clone())?; + + match uris { + [] => Ok(MicroPartition::empty(None)), + uris => { + // Perform a bulk read of URIs, materializing a table per URI. + let tables = daft_json::read_json_bulk( + uris, + convert_options, + parse_options, + read_options, + io_client, + io_stats, + multithreaded_io, + None, + 8, + ) + .context(DaftCoreComputeSnafu)?; + + // Union all schemas and cast all tables to the same schema + let unioned_schema = tables + .iter() + .map(|tbl| tbl.schema.clone()) + .try_reduce(|s1, s2| s1.union(s2.as_ref()).map(Arc::new))?
+ .unwrap(); + let tables = tables + .into_iter() + .map(|tbl| tbl.cast_to_schema(&unioned_schema)) + .collect::<DaftResult<Vec<_>>>()?; + + // Construct MicroPartition from tables and unioned schema + Ok(MicroPartition::new_loaded( + unioned_schema.clone(), + Arc::new(tables), + None, + )) + } + } +} + #[allow(clippy::too_many_arguments)] pub(crate) fn read_parquet_into_micropartition( uris: &[&str], diff --git a/src/daft-micropartition/src/python.rs b/src/daft-micropartition/src/python.rs index f2a6796ffc..7e398107c2 100644 --- a/src/daft-micropartition/src/python.rs +++ b/src/daft-micropartition/src/python.rs @@ -12,6 +12,7 @@ use daft_core::{ use daft_csv::{CsvConvertOptions, CsvParseOptions, CsvReadOptions}; use daft_dsl::python::PyExpr; use daft_io::{python::IOConfig, IOStatsContext}; +use daft_json::{JsonConvertOptions, JsonParseOptions, JsonReadOptions}; use daft_parquet::read::ParquetSchemaInferenceOptions; use daft_scan::{python::pylib::PyScanTask, storage_config::PyStorageConfig, ScanTask}; use daft_stats::TableStatistics; @@ -370,6 +371,33 @@ impl PyMicroPartition { Ok(mp.into()) } + #[staticmethod] + pub fn read_json_native( + py: Python, + uri: &str, + convert_options: Option<JsonConvertOptions>, + parse_options: Option<JsonParseOptions>, + read_options: Option<JsonReadOptions>, + io_config: Option<IOConfig>, + multithreaded_io: Option<bool>, + ) -> PyResult<Self> { + let mp = py.allow_threads(|| { + let io_stats = IOStatsContext::new(format!("read_json: for uri {uri}")); + let io_config = io_config.unwrap_or_default().config.into(); + + crate::micropartition::read_json_into_micropartition( + [uri].as_ref(), + convert_options, + parse_options, + read_options, + io_config, + multithreaded_io.unwrap_or(true), + Some(io_stats), + ) + })?; + Ok(mp.into()) + } + #[staticmethod] pub fn read_csv( py: Python, diff --git a/src/daft-plan/src/optimization/optimizer.rs b/src/daft-plan/src/optimization/optimizer.rs index d020c0eb99..72670e77b1 100644 --- a/src/daft-plan/src/optimization/optimizer.rs +++ b/src/daft-plan/src/optimization/optimizer.rs @@ -515,14 +515,16 @@ mod tests { ], OptimizerConfig::new(20), ); + let fields = vec![Field::new("a", DataType::Int64)]; let proj_exprs = vec![ col("a") + lit(1), (col("a") + lit(2)).alias("b"), (col("a") + lit(3)).alias("c"), ]; - let plan = dummy_scan_node(vec![Field::new("a", DataType::Int64)]) + let filter_predicate = col("a").lt(&lit(2)); + let plan = dummy_scan_node(fields.clone()) .project(proj_exprs, Default::default())? - .filter(col("a").lt(&lit(2)))? + .filter(filter_predicate)?
.build(); let mut pass_count = 0; let mut did_transform = false; @@ -536,7 +538,7 @@ mod tests { let expected = "\ Filter: [[[col(a) < lit(2)] | lit(false)] | lit(false)] & lit(true)\ \n Project: col(a) + lit(3) AS c, col(a) + lit(1), col(a) + lit(2) AS b\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64)"; assert_eq!(opt_plan.repr_indent(), expected); Ok(()) } diff --git a/src/daft-plan/src/optimization/rules/drop_repartition.rs b/src/daft-plan/src/optimization/rules/drop_repartition.rs index 02db0ffb1f..0593151c07 100644 --- a/src/daft-plan/src/optimization/rules/drop_repartition.rs +++ b/src/daft-plan/src/optimization/rules/drop_repartition.rs @@ -98,7 +98,7 @@ mod tests { .build(); let expected = "\ Repartition: Scheme = Hash, Number of partitions = 5, Partition by = col(a)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } diff --git a/src/daft-plan/src/optimization/rules/push_down_filter.rs b/src/daft-plan/src/optimization/rules/push_down_filter.rs index 0bd738e574..dbe7c72102 100644 --- a/src/daft-plan/src/optimization/rules/push_down_filter.rs +++ b/src/daft-plan/src/optimization/rules/push_down_filter.rs @@ -261,7 +261,7 @@ mod tests { .build(); let expected = "\ Filter: [col(b) == lit(\"foo\")] & [col(a) < lit(2)]\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -279,7 +279,7 @@ mod tests { let expected = "\ Project: col(a)\ \n Filter: col(a) < lit(2)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -297,7 +297,7 @@ mod 
tests { let expected = "\ Project: col(a), col(b)\ \n Filter: [col(a) < lit(2)] & [col(b) == lit(\"foo\")]\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -317,7 +317,7 @@ mod tests { let expected = "\ Filter: col(a) < lit(2)\ \n Project: col(a) + lit(1)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -338,7 +338,7 @@ mod tests { let expected = "\ Project: col(a) + lit(1)\ \n Filter: [col(a) + lit(1)] < lit(2)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -356,7 +356,7 @@ mod tests { let expected = "\ Sort: Sort by = (col(a), descending)\ \n Filter: col(a) < lit(2)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; // TODO(Clark): For tests in which we only care about reordering of operators, maybe switch to a form that leverages the single-node display? 
// let expected = format!("{sort}\n {filter}\n {source}"); assert_optimized_plan_eq(plan, expected)?; @@ -376,7 +376,7 @@ mod tests { let expected = "\ Repartition: Scheme = Hash, Number of partitions = 1, Partition by = col(a)\ \n Filter: col(a) < lit(2)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -395,9 +395,9 @@ mod tests { let expected = "\ Concat\ \n Filter: col(a) < lit(2)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)\ + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)\ \n Filter: col(a) < lit(2)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -423,8 +423,8 @@ mod tests { let expected = "\ Join: Type = Inner, On = col(b), Output schema = a (Int64), b (Utf8), c (Float64)\ \n Filter: col(a) < lit(2)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)\ - \n Source: Json, File paths = [/foo], File schema = b (Utf8), c (Float64), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = b (Utf8), c (Float64)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)\ + \n Source: Json, File paths = [/foo], File schema = b (Utf8), c (Float64), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = b (Utf8), c (Float64)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -449,9 +449,9 @@ mod tests { .build(); let expected = "\ Join: Type = Inner, On = col(b), Output schema = a (Int64), b (Utf8), c (Float64)\ 
- \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)\ + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)\ \n Filter: col(c) < lit(2.0)\ - \n Source: Json, File paths = [/foo], File schema = b (Utf8), c (Float64), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = b (Utf8), c (Float64)"; + \n Source: Json, File paths = [/foo], File schema = b (Utf8), c (Float64), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = b (Utf8), c (Float64)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -475,9 +475,9 @@ mod tests { let expected = "\ Join: Type = Inner, On = col(b), Output schema = a (Int64), b (Int64), c (Float64)\ \n Filter: col(b) < lit(2)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), c (Float64), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Int64), c (Float64)\ + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), c (Float64), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Int64), c (Float64)\ \n Filter: col(b) < lit(2)\ - \n Source: Json, File paths = [/foo], File schema = b (Int64), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = b (Int64)"; + \n Source: Json, File paths = [/foo], File schema = b (Int64), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = b (Int64)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } diff --git a/src/daft-plan/src/optimization/rules/push_down_limit.rs b/src/daft-plan/src/optimization/rules/push_down_limit.rs index 457dcbe830..a9706db4f1 100644 --- a/src/daft-plan/src/optimization/rules/push_down_limit.rs +++ b/src/daft-plan/src/optimization/rules/push_down_limit.rs @@ -87,6 +87,7 @@ mod tests { use common_error::DaftResult; use daft_core::{datatypes::Field, schema::Schema, DataType}; use daft_dsl::col; + use daft_scan::Pushdowns; use std::sync::Arc; #[cfg(feature = "python")] @@ -98,7 +99,7 @@ mod tests { rules::PushDownLimit, Optimizer, }, - test::{dummy_scan_node, dummy_scan_node_with_limit}, + test::{dummy_scan_node, dummy_scan_node_with_pushdowns}, LogicalPlan, PartitionScheme, }; @@ -139,7 +140,7 @@ mod tests { .build(); let expected = "\ Limit: 5\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Limit pushdown = 5, Output schema = 
a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Limit pushdown = 5, Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -149,18 +150,18 @@ mod tests { /// Limit-Source[existing_limit] -> Source[existing_limit] #[test] fn limit_does_not_push_into_external_source_if_smaller_limit() -> DaftResult<()> { - let plan = dummy_scan_node_with_limit( + let plan = dummy_scan_node_with_pushdowns( vec![ Field::new("a", DataType::Int64), Field::new("b", DataType::Utf8), ], - Some(3), + Pushdowns::default().with_limit(Some(3)), ) .limit(5, false)? .build(); let expected = "\ Limit: 5\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Limit pushdown = 3, Output schema = a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Limit pushdown = 3, Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -170,18 +171,18 @@ mod tests { /// Limit-Source[existing_limit] -> Source[new_limit] #[test] fn limit_does_push_into_external_source_if_larger_limit() -> DaftResult<()> { - let plan = dummy_scan_node_with_limit( + let plan = dummy_scan_node_with_pushdowns( vec![ Field::new("a", DataType::Int64), Field::new("b", DataType::Utf8), ], - Some(10), + Pushdowns::default().with_limit(Some(10)), ) .limit(5, false)? .build(); let expected = "\ Limit: 5\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Limit pushdown = 5, Output schema = a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Limit pushdown = 5, Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -199,7 +200,7 @@ mod tests { .build(); let expected = "\ Limit: 5\ - \n . Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; + \n . 
Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -219,7 +220,7 @@ mod tests { let expected = "\ Repartition: Scheme = Hash, Number of partitions = 1, Partition by = col(a)\ \n Limit: 5\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Limit pushdown = 5, Output schema = a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Limit pushdown = 5, Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } @@ -239,7 +240,7 @@ mod tests { let expected = "\ Project: col(a)\ \n Limit: 5\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Limit pushdown = 5, Output schema = a (Int64), b (Utf8)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Utf8), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Limit pushdown = 5, Output schema = a (Int64), b (Utf8)"; assert_optimized_plan_eq(plan, expected)?; Ok(()) } diff --git a/src/daft-plan/src/optimization/rules/push_down_projection.rs b/src/daft-plan/src/optimization/rules/push_down_projection.rs index edd04dab16..b8af4b559f 100644 --- a/src/daft-plan/src/optimization/rules/push_down_projection.rs +++ b/src/daft-plan/src/optimization/rules/push_down_projection.rs @@ -573,7 +573,7 @@ mod tests { let expected = "\ Project: [col(a) + lit(1)] + lit(3), col(b) + lit(2), col(a) + lit(4)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Int64)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Int64)"; assert_optimized_plan_eq(unoptimized, expected)?; Ok(()) } @@ -589,7 +589,7 @@ mod tests { .build(); let expected = "\ - Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Int64)"; + Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Int64)"; assert_optimized_plan_eq(unoptimized, expected)?; Ok(()) @@ -606,7 +606,7 @@ mod tests 
{ let expected = "\ Project: col(b), col(a)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Int64)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Int64)"; assert_optimized_plan_eq(unoptimized, expected)?; Ok(()) @@ -624,7 +624,7 @@ mod tests { let expected = "\ Project: col(b) + lit(3)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Projection pushdown = [b], Output schema = b (Int64)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Projection pushdown = [b], Output schema = b (Int64)"; assert_optimized_plan_eq(unoptimized, expected)?; Ok(()) @@ -650,7 +650,7 @@ mod tests { let expected = "\ Project: col(a), col(b), col(b) AS c\ \n Project: col(b) + lit(3), col(a)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Int64)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Output schema = a (Int64), b (Int64)"; assert_optimized_plan_eq(unoptimized, expected)?; Ok(()) @@ -671,7 +671,7 @@ mod tests { let expected = "\ Project: col(a)\ \n Aggregation: mean(col(a)), Group by = col(c), Output schema = c (Int64), a (Float64)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), c (Int64), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Projection pushdown = [a, c], Output schema = a (Int64), c (Int64)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Int64), c (Int64), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Projection pushdown = [a, c], Output schema = a (Int64), c (Int64)"; assert_optimized_plan_eq(unoptimized, expected)?; Ok(()) @@ -692,7 +692,7 @@ mod tests { let expected = "\ Project: col(a)\ \n Filter: col(b)\ - \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Boolean), c (Int64), Format-specific config = Json(JsonSourceConfig), Storage config = Native(NativeStorageConfig { io_config: None, multithreaded_io: true }), Projection pushdown = [a, b], Output schema = a (Int64), b (Boolean)"; + \n Source: Json, File paths = [/foo], File schema = a (Int64), b (Boolean), c (Int64), Format-specific config = Json(JsonSourceConfig { buffer_size: None, chunk_size: None }), Storage config = Native(NativeStorageConfig { 
io_config: None, multithreaded_io: true }), Projection pushdown = [a, b], Output schema = a (Int64), b (Boolean)"; assert_optimized_plan_eq(unoptimized, expected)?; Ok(()) diff --git a/src/daft-plan/src/test/mod.rs b/src/daft-plan/src/test/mod.rs index 11b69182df..1039f84ea1 100644 --- a/src/daft-plan/src/test/mod.rs +++ b/src/daft-plan/src/test/mod.rs @@ -3,31 +3,25 @@ use std::sync::Arc; use daft_core::{datatypes::Field, schema::Schema}; use daft_scan::{file_format::FileFormatConfig, storage_config::StorageConfig, Pushdowns}; -use crate::{ - builder::LogicalPlanBuilder, source_info::FileInfos, JsonSourceConfig, NativeStorageConfig, -}; +use crate::{builder::LogicalPlanBuilder, source_info::FileInfos, NativeStorageConfig}; /// Create a dummy scan node containing the provided fields in its schema. pub fn dummy_scan_node(fields: Vec<Field>) -> LogicalPlanBuilder { - let schema = Arc::new(Schema::new(fields).unwrap()); - LogicalPlanBuilder::table_scan( - FileInfos::new_internal(vec!["/foo".to_string()], vec![None], vec![None]), - schema, - FileFormatConfig::Json(JsonSourceConfig {}).into(), - StorageConfig::Native(NativeStorageConfig::new_internal(true, None).into()).into(), - ) - .unwrap() + dummy_scan_node_with_pushdowns(fields, Default::default()) } /// Create a dummy scan node containing the provided fields in its schema and the provided limit. -pub fn dummy_scan_node_with_limit(fields: Vec<Field>, limit: Option<usize>) -> LogicalPlanBuilder { +pub fn dummy_scan_node_with_pushdowns( + fields: Vec<Field>, + pushdowns: Pushdowns, +) -> LogicalPlanBuilder { let schema = Arc::new(Schema::new(fields).unwrap()); LogicalPlanBuilder::table_scan_with_pushdowns( FileInfos::new_internal(vec!["/foo".to_string()], vec![None], vec![None]), schema, - FileFormatConfig::Json(JsonSourceConfig {}).into(), + FileFormatConfig::Json(Default::default()).into(), StorageConfig::Native(NativeStorageConfig::new_internal(true, None).into()).into(), - Pushdowns::new(None, None, limit), + pushdowns, ) .unwrap() } diff --git a/src/daft-scan/Cargo.toml b/src/daft-scan/Cargo.toml index 91d6b8b6a4..c46dc61d9f 100644 --- a/src/daft-scan/Cargo.toml +++ b/src/daft-scan/Cargo.toml @@ -6,6 +6,7 @@ daft-core = {path = "../daft-core", default-features = false} daft-csv = {path = "../daft-csv", default-features = false} daft-dsl = {path = "../daft-dsl", default-features = false} daft-io = {path = "../daft-io", default-features = false} +daft-json = {path = "../daft-json", default-features = false} daft-parquet = {path = "../daft-parquet", default-features = false} daft-stats = {path = "../daft-stats", default-features = false} daft-table = {path = "../daft-table", default-features = false} diff --git a/src/daft-scan/src/file_format.rs b/src/daft-scan/src/file_format.rs index 5a733bbf7f..c8e63e047a 100644 --- a/src/daft-scan/src/file_format.rs +++ b/src/daft-scan/src/file_format.rs @@ -157,15 +157,38 @@ impl_bincode_py_state_serialization!(CsvSourceConfig); /// Configuration for a JSON data source.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Hash)] #[cfg_attr(feature = "python", pyclass(module = "daft.daft", get_all))] -pub struct JsonSourceConfig {} +pub struct JsonSourceConfig { + pub buffer_size: Option<usize>, + pub chunk_size: Option<usize>, +} + +impl JsonSourceConfig { + pub fn new_internal(buffer_size: Option<usize>, chunk_size: Option<usize>) -> Self { + Self { + buffer_size, + chunk_size, + } + } +} + +impl Default for JsonSourceConfig { + fn default() -> Self { + Self::new_internal(None, None) + } +} #[cfg(feature = "python")] #[pymethods] impl JsonSourceConfig { /// Create a config for a JSON data source. + /// + /// # Arguments + /// + /// * `buffer_size` - Size of the buffer (in bytes) used by the streaming reader. + /// * `chunk_size` - Size of the chunks (in bytes) deserialized in parallel by the streaming reader. #[new] - fn new() -> Self { - Self {} + fn new(buffer_size: Option<usize>, chunk_size: Option<usize>) -> Self { + Self::new_internal(buffer_size, chunk_size) } } diff --git a/src/daft-scan/src/glob.rs b/src/daft-scan/src/glob.rs index 29e9224917..0175ef003c 100644 --- a/src/daft-scan/src/glob.rs +++ b/src/daft-scan/src/glob.rs @@ -8,12 +8,10 @@ use daft_io::{ }; use daft_parquet::read::ParquetSchemaInferenceOptions; use futures::{stream::BoxStream, StreamExt}; -use snafu::{ResultExt, Snafu}; -#[cfg(feature = "python")] -use {crate::PyIOSnafu, daft_core::schema::Schema, pyo3::Python}; +use snafu::Snafu; use crate::{ - file_format::{CsvSourceConfig, FileFormatConfig, JsonSourceConfig, ParquetSourceConfig}, + file_format::{CsvSourceConfig, FileFormatConfig, ParquetSourceConfig}, storage_config::StorageConfig, DataFileSource, PartitionField, Pushdowns, ScanOperator, ScanTask, ScanTaskRef, }; @@ -192,25 +190,13 @@ impl GlobScanOperator { )?; schema } - FileFormatConfig::Json(JsonSourceConfig {}) => { - // NOTE: Native JSON reads not yet implemented, so we have to delegate to Python here or implement - // a daft_json crate that gives us native JSON schema inference - match storage_config.as_ref() { - StorageConfig::Native(_) => { - todo!("Implement native JSON schema inference in a daft_json crate.") - } - #[cfg(feature = "python")] - StorageConfig::Python(_) => Python::with_gil(|py| { - crate::python::pylib::read_json_schema( - py, - first_filepath.as_str(), - storage_config.clone().into(), - ) - .and_then(|s| Ok(Schema::new(s.schema.fields.values().cloned().collect())?)) - .context(PyIOSnafu) - })?, - } - } + FileFormatConfig::Json(_) => daft_json::schema::read_json_schema( + first_filepath.as_str(), + None, + None, + io_client, + Some(io_stats), + )?, }; let schema = match schema_hint { diff --git a/src/daft-scan/src/python.rs b/src/daft-scan/src/python.rs index 05eb883a1f..a9b945f332 100644 --- a/src/daft-scan/src/python.rs +++ b/src/daft-scan/src/python.rs @@ -315,18 +315,6 @@ partitioning_keys:\n", impl_bincode_py_state_serialization!(PyScanTask); - pub(crate) fn read_json_schema( - py: Python, - uri: &str, - storage_config: PyStorageConfig, - ) -> PyResult<PySchema> { - py.import(pyo3::intern!(py, "daft.table.schema_inference"))? - .getattr(pyo3::intern!(py, "from_json"))? - .call1((uri, storage_config))? - .getattr(pyo3::intern!(py, "_schema"))?
- .extract() - } - #[pyclass(module = "daft.daft", name = "PartitionField", frozen)] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PyPartitionField(Arc<PartitionField>); diff --git a/src/lib.rs b/src/lib.rs index fce2431e7c..74aaec2393 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -56,6 +56,7 @@ pub mod pylib { daft_io::register_modules(_py, m)?; daft_parquet::register_modules(_py, m)?; daft_csv::register_modules(_py, m)?; + daft_json::register_modules(_py, m)?; daft_plan::register_modules(_py, m)?; daft_micropartition::register_modules(_py, m)?; daft_scan::register_modules(_py, m)?; diff --git a/tests/dataframe/test_creation.py b/tests/dataframe/test_creation.py index 14add620e4..a3542c9bdd 100644 --- a/tests/dataframe/test_creation.py +++ b/tests/dataframe/test_creation.py @@ -610,7 +610,8 @@ def test_create_dataframe_csv_schema_hints_ignore_random_hint( ### -def test_create_dataframe_json(valid_data: list[dict[str, float]]) -> None: +@pytest.mark.parametrize("use_native_downloader", [True, False]) +def test_create_dataframe_json(valid_data: list[dict[str, float]], use_native_downloader) -> None: with create_temp_filename() as fname: with open(fname, "w") as f: for data in valid_data: @@ -618,7 +619,7 @@ def test_create_dataframe_json(valid_data: list[dict[str, float]]) -> None: f.write("\n") f.flush() - df = daft.read_json(fname) + df = daft.read_json(fname, use_native_downloader=use_native_downloader) assert df.column_names == COL_NAMES pd_df = df.to_pandas() @@ -626,7 +627,8 @@ def test_create_dataframe_json(valid_data: list[dict[str, float]]) -> None: assert len(pd_df) == len(valid_data) -def test_create_dataframe_multiple_jsons(valid_data: list[dict[str, float]]) -> None: +@pytest.mark.parametrize("use_native_downloader", [True, False]) +def test_create_dataframe_multiple_jsons(valid_data: list[dict[str, float]], use_native_downloader) -> None: with create_temp_filename() as f1name, create_temp_filename() as f2name: with open(f1name, "w") as f1, open(f2name, "w") as f2: for f in (f1, f2): @@ -635,7 +637,7 @@ def test_create_dataframe_multiple_jsons(valid_data: list[dict[str, float]]) -> f.write("\n") f.flush() - df = daft.read_json([f1name, f2name]) + df = daft.read_json([f1name, f2name], use_native_downloader=use_native_downloader) assert df.column_names == COL_NAMES pd_df = df.to_pandas() @@ -643,7 +645,8 @@ def test_create_dataframe_multiple_jsons(valid_data: list[dict[str, float]]) -> assert len(pd_df) == (len(valid_data) * 2) -def test_create_dataframe_json_column_projection(valid_data: list[dict[str, float]]) -> None: +@pytest.mark.parametrize("use_native_downloader", [True, False]) +def test_create_dataframe_json_column_projection(valid_data: list[dict[str, float]], use_native_downloader) -> None: with create_temp_filename() as fname: with open(fname, "w") as f: for data in valid_data: @@ -653,7 +656,7 @@ def test_create_dataframe_json_column_projection(valid_data: list[dict[str, floa col_subset = COL_NAMES[:3] - df = daft.read_json(fname) + df = daft.read_json(fname, use_native_downloader=use_native_downloader) df = df.select(*col_subset) assert df.column_names == col_subset @@ -662,14 +665,21 @@ def test_create_dataframe_json_column_projection(valid_data: list[dict[str, floa assert len(pd_df) == len(valid_data) +# TODO(Clark): Debug why this segfaults for the native downloader and is slow for the Python downloader.
+# @pytest.mark.parametrize("use_native_downloader", [True, False]) +@pytest.mark.skip def test_create_dataframe_json_https() -> None: - df = daft.read_json("https://github.com/Eventual-Inc/mnist-json/raw/master/mnist_handwritten_test.json.gz") + df = daft.read_json( + "https://github.com/Eventual-Inc/mnist-json/raw/master/mnist_handwritten_test.json.gz", + # use_native_downloader=use_native_downloader, + ) df.collect() assert set(df.column_names) == {"label", "image"} assert len(df) == 10000 -def test_create_dataframe_json_specify_schema(valid_data: list[dict[str, float]]) -> None: +@pytest.mark.parametrize("use_native_downloader", [True, False]) +def test_create_dataframe_json_specify_schema(valid_data: list[dict[str, float]], use_native_downloader) -> None: with create_temp_filename() as fname: with open(fname, "w") as f: for data in valid_data: @@ -686,6 +696,7 @@ def test_create_dataframe_json_specify_schema(valid_data: list[dict[str, float]] "petal_width": DataType.float32(), "variety": DataType.string(), }, + use_native_downloader=use_native_downloader, ) assert df.column_names == COL_NAMES diff --git a/tests/table/table_io/test_json.py b/tests/table/table_io/test_json.py index 74989eae1f..b6bdf88608 100644 --- a/tests/table/table_io/test_json.py +++ b/tests/table/table_io/test_json.py @@ -1,20 +1,31 @@ from __future__ import annotations -import io +import contextlib import json +import os import pathlib +import tempfile from typing import Any import pytest import daft +from daft.daft import NativeStorageConfig, PythonStorageConfig, StorageConfig from daft.datatype import DataType from daft.logical.schema import Schema from daft.runners.partitioning import TableReadOptions from daft.table import MicroPartition, schema_inference, table_io -def test_read_input(tmpdir): +def storage_config_from_use_native_downloader(use_native_downloader: bool) -> StorageConfig: + if use_native_downloader: + return StorageConfig.native(NativeStorageConfig(True, None)) + else: + return StorageConfig.python(PythonStorageConfig(None)) + + +@pytest.mark.parametrize("use_native_downloader", [True, False]) +def test_read_input(tmpdir, use_native_downloader): tmpdir = pathlib.Path(tmpdir) data = {"foo": [1, 2, 3]} with open(tmpdir / "file.json", "w") as f: @@ -23,27 +34,32 @@ def test_read_input(tmpdir): f.write("\n") schema = Schema._from_field_name_and_types([("foo", DataType.int64())]) + storage_config = storage_config_from_use_native_downloader(use_native_downloader) # Test pathlib, str and IO - assert table_io.read_json(tmpdir / "file.json", schema=schema).to_pydict() == data - assert table_io.read_json(str(tmpdir / "file.json"), schema=schema).to_pydict() == data + assert table_io.read_json(tmpdir / "file.json", schema=schema, storage_config=storage_config).to_pydict() == data + assert ( + table_io.read_json(str(tmpdir / "file.json"), schema=schema, storage_config=storage_config).to_pydict() == data + ) with open(tmpdir / "file.json", "rb") as f: - assert table_io.read_json(f, schema=schema).to_pydict() == data + if use_native_downloader: + f = tmpdir / "file.json" + assert table_io.read_json(f, schema=schema, storage_config=storage_config).to_pydict() == data +@contextlib.contextmanager def _json_write_helper(data: dict[str, list[Any]]): - first_key = list(data.keys())[0] - data_len = len(data[first_key]) - for k in data: - assert len(data[k]) == data_len - - f = io.StringIO() - for i in range(data_len): - json.dump({k: data[k][i] for k in data}, f) - f.write("\n") - f.seek(0) - return 
io.BytesIO(f.getvalue().encode("utf-8")) + with tempfile.TemporaryDirectory() as directory_name: + first_key = list(data.keys())[0] + data_len = len(data[first_key]) + for k in data: + assert len(data[k]) == data_len + file = os.path.join(directory_name, "tempfile") + with open(file, "w", newline="") as f: + for i in range(data_len): + f.write(json.dumps({k: data[k][i] for k in data}) + "\n") + yield file @pytest.mark.parametrize( @@ -62,16 +78,17 @@ def _json_write_helper(data: dict[str, list[Any]]): ([1, None, 2], DataType.list(DataType.int64())), ], ) -def test_json_infer_schema(data, expected_dtype): - f = _json_write_helper( +@pytest.mark.parametrize("use_native_downloader", [True, False]) +def test_json_infer_schema(data, expected_dtype, use_native_downloader): + with _json_write_helper( { "id": [1, 2, 3], "data": [data, data, None], } - ) - - schema = schema_inference.from_json(f) - assert schema == Schema._from_field_name_and_types([("id", DataType.int64()), ("data", expected_dtype)]) + ) as f: + storage_config = storage_config_from_use_native_downloader(use_native_downloader) + schema = schema_inference.from_json(f, storage_config=storage_config) + assert schema == Schema._from_field_name_and_types([("id", DataType.int64()), ("data", expected_dtype)]) @pytest.mark.parametrize( @@ -85,57 +102,64 @@ def test_json_infer_schema(data, expected_dtype): ({"foo": 1}, daft.Series.from_pylist([{"foo": 1}, {"foo": 1}, None])), ], ) -def test_json_read_data(data, expected_data_series): - f = _json_write_helper( +@pytest.mark.parametrize("use_native_downloader", [True, False]) +def test_json_read_data(data, expected_data_series, use_native_downloader): + with _json_write_helper( { "id": [1, 2, 3], "data": [data, data, None], } - ) - - schema = Schema._from_field_name_and_types([("id", DataType.int64()), ("data", expected_data_series.datatype())]) - expected = MicroPartition.from_pydict( - { - "id": [1, 2, 3], - "data": expected_data_series, - } - ) - table = table_io.read_json(f, schema) - assert table.to_arrow() == expected.to_arrow(), f"Expected:\n{expected}\n\nReceived:\n{table}" - - -def test_json_read_data_limit_rows(): - f = _json_write_helper( + ) as f: + schema = Schema._from_field_name_and_types( + [("id", DataType.int64()), ("data", expected_data_series.datatype())] + ) + expected = MicroPartition.from_pydict( + { + "id": [1, 2, 3], + "data": expected_data_series, + } + ) + storage_config = storage_config_from_use_native_downloader(use_native_downloader) + table = table_io.read_json(f, schema, storage_config=storage_config) + assert table.to_arrow() == expected.to_arrow(), f"Expected:\n{expected}\n\nReceived:\n{table}" + + +@pytest.mark.parametrize("use_native_downloader", [True, False]) +def test_json_read_data_limit_rows(use_native_downloader): + with _json_write_helper( { "id": [1, 2, 3], "data": [1, 2, None], } - ) - - schema = Schema._from_field_name_and_types([("id", DataType.int64()), ("data", DataType.int64())]) - expected = MicroPartition.from_pydict( - { - "id": [1, 2], - "data": [1, 2], - } - ) - table = table_io.read_json(f, schema, read_options=TableReadOptions(num_rows=2)) - assert table.to_arrow() == expected.to_arrow(), f"Expected:\n{expected}\n\nReceived:\n{table}" - - -def test_json_read_data_select_columns(): - f = _json_write_helper( + ) as f: + schema = Schema._from_field_name_and_types([("id", DataType.int64()), ("data", DataType.int64())]) + expected = MicroPartition.from_pydict( + { + "id": [1, 2], + "data": [1, 2], + } + ) + storage_config = 
storage_config_from_use_native_downloader(use_native_downloader) + table = table_io.read_json(f, schema, read_options=TableReadOptions(num_rows=2), storage_config=storage_config) + assert table.to_arrow() == expected.to_arrow(), f"Expected:\n{expected}\n\nReceived:\n{table}" + + +@pytest.mark.parametrize("use_native_downloader", [True, False]) +def test_json_read_data_select_columns(use_native_downloader): + with _json_write_helper( { "id": [1, 2, 3], "data": [1, 2, None], } - ) - - schema = Schema._from_field_name_and_types([("id", DataType.int64()), ("data", DataType.int64())]) - expected = MicroPartition.from_pydict( - { - "data": [1, 2, None], - } - ) - table = table_io.read_json(f, schema, read_options=TableReadOptions(column_names=["data"])) - assert table.to_arrow() == expected.to_arrow(), f"Expected:\n{expected}\n\nReceived:\n{table}" + ) as f: + schema = Schema._from_field_name_and_types([("id", DataType.int64()), ("data", DataType.int64())]) + expected = MicroPartition.from_pydict( + { + "data": [1, 2, None], + } + ) + storage_config = storage_config_from_use_native_downloader(use_native_downloader) + table = table_io.read_json( + f, schema, read_options=TableReadOptions(column_names=["data"]), storage_config=storage_config + ) + assert table.to_arrow() == expected.to_arrow(), f"Expected:\n{expected}\n\nReceived:\n{table}"
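
For context, the parametrized tests above exercise the new reader end to end through the public daft.read_json API, toggling between the native and Python read paths. Below is a minimal sketch of that flow under stated assumptions: the file path and the column names ("a", "b") are illustrative placeholders, not part of this patch; daft.read_json and its use_native_downloader flag are taken from the tests in this diff.

    import json
    import daft

    # Write a small newline-delimited JSON file (placeholder path and data).
    path = "/tmp/example.jsonl"
    with open(path, "w") as f:
        for row in [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]:
            f.write(json.dumps(row) + "\n")

    # use_native_downloader=True routes the read through the native (Rust)
    # JSON reader added by this patch; False falls back to the existing
    # Python path, mirroring the [True, False] parametrization above.
    df = daft.read_json(path, use_native_downloader=True)
    assert df.column_names == ["a", "b"]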