Eventual-Inc · samster25 · Jan 5, 2024 · Jan 5, 2024 · Jan 5, 2024
diff --git a/src/daft-io/src/lib.rs b/src/daft-io/src/lib.rs
@@ -276,7 +276,7 @@ pub fn parse_url(input: &str) -> Result<(SourceType, Cow<'_, str>)> {
     match scheme.as_ref() {
         "file" => Ok((SourceType::File, fixed_input)),
         "http" | "https" => Ok((SourceType::Http, fixed_input)),
-        "s3" => Ok((SourceType::S3, fixed_input)),
+        "s3" | "s3a" => Ok((SourceType::S3, fixed_input)),
         "az" | "abfs" => Ok((SourceType::AzureBlob, fixed_input)),
         "gcs" | "gs" => Ok((SourceType::GCS, fixed_input)),
         #[cfg(target_env = "msvc")]

diff --git a/tests/integration/io/conftest.py b/tests/integration/io/conftest.py
@@ -227,4 +227,6 @@ def minio_image_data_fixture(minio_io_config, image_data_folder) -> YieldFixture
 @pytest.fixture(scope="session")
 def small_images_s3_paths() -> list[str]:
     """Paths to small *.jpg files in a public S3 bucket"""
-    return [f"s3://daft-public-data/test_fixtures/small_images/rickroll{i}.jpg" for i in range(6)]
+    return [f"s3://daft-public-data/test_fixtures/small_images/rickroll{i}.jpg" for i in range(6)] + [
+        f"s3a://daft-public-data/test_fixtures/small_images/rickroll{i}.jpg" for i in range(6)
+    ]
diff --git a/tests/integration/io/parquet/test_reads_public_data.py b/tests/integration/io/parquet/test_reads_public_data.py
@@ -166,6 +166,10 @@ def get_filesystem_from_path(path: str, **kwargs) -> fsspec.AbstractFileSystem:
         "parquet-benchmarking/mvp",
         "s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet",
     ),
+    (
+        "parquet-benchmarking/s3a-mvp",
+        "s3a://daft-public-data/test_fixtures/parquet-dev/mvp.parquet",
+    ),
     (
         "azure/mvp",
         "az://public-anonymous/mvp.parquet",
@@ -198,7 +202,7 @@ def parquet_file(request) -> tuple[str, str]:
 
 def read_parquet_with_pyarrow(path) -> pa.Table:
     kwargs = {}
-    if get_protocol_from_path(path) == "s3":
+    if get_protocol_from_path(path) == "s3" or get_protocol_from_path(path) == "s3a":
         kwargs["anon"] = True
     if get_protocol_from_path(path) == "az":
         kwargs["account_name"] = "dafttestdata"

diff --git a/tests/integration/io/test_list_files_s3_minio.py b/tests/integration/io/test_list_files_s3_minio.py
@@ -37,6 +37,8 @@ def s3fs_recursive_list(fs, path) -> list:
     [
         # Exact filepath:
         (f"s3://bucket/a.match", [{"type": "File", "path": "s3://bucket/a.match", "size": 0}]),
+        # Exact filepath but with s3a:
+        (f"s3a://bucket/a.match", [{"type": "File", "path": "s3a://bucket/a.match", "size": 0}]),
         ###
         # `**`: recursive wildcard
         ###
@@ -55,6 +57,21 @@ def s3fs_recursive_list(fs, path) -> list:
                 {"type": "File", "path": "s3://bucket/nested2/c.match", "size": 0},
             ],
         ),
+        # All files with s3a and **
+        (
+            f"s3a://bucket/**",
+            [
+                {"type": "File", "path": "s3a://bucket/a.match", "size": 0},
+                {"type": "File", "path": "s3a://bucket/b.nomatch", "size": 0},
+                {"type": "File", "path": "s3a://bucket/c.match", "size": 0},
+                {"type": "File", "path": "s3a://bucket/nested1/a.match", "size": 0},
+                {"type": "File", "path": "s3a://bucket/nested1/b.nomatch", "size": 0},
+                {"type": "File", "path": "s3a://bucket/nested1/c.match", "size": 0},
+                {"type": "File", "path": "s3a://bucket/nested2/a.match", "size": 0},
+                {"type": "File", "path": "s3a://bucket/nested2/b.nomatch", "size": 0},
+                {"type": "File", "path": "s3a://bucket/nested2/c.match", "size": 0},
+            ],
+        ),
         # Exact filepath after **
         (
             f"s3://bucket/**/a.match",

diff --git a/tests/integration/io/test_url_download_private_aws_s3.py b/tests/integration/io/test_url_download_private_aws_s3.py
@@ -30,7 +30,7 @@ def test_url_download_aws_s3_public_bucket_with_creds(small_images_s3_paths, io_
     df = df.with_column("data", df["urls"].url.download(use_native_downloader=True, io_config=io_config))
 
     data = df.to_pydict()
-    assert len(data["data"]) == 6
+    assert len(data["data"]) == 12
     for img_bytes in data["data"]:
         assert img_bytes is not None
 

diff --git a/tests/integration/io/test_url_download_public_aws_s3.py b/tests/integration/io/test_url_download_public_aws_s3.py
@@ -14,7 +14,7 @@ def test_url_download_aws_s3_public_bucket_custom_s3fs(small_images_s3_paths):
     )
 
     data = df.to_pydict()
-    assert len(data["data"]) == 6
+    assert len(data["data"]) == 12
     for img_bytes in data["data"]:
         assert img_bytes is not None
 
@@ -28,7 +28,7 @@ def test_url_download_aws_s3_public_bucket_custom_s3fs_wrong_region(small_images
     )
 
     data = df.to_pydict()
-    assert len(data["data"]) == 6
+    assert len(data["data"]) == 12
     for img_bytes in data["data"]:
         assert img_bytes is not None
 
@@ -40,7 +40,7 @@ def test_url_download_aws_s3_public_bucket_native_downloader(aws_public_s3_confi
     df = df.with_column("data", df["urls"].url.download(io_config=aws_public_s3_config, use_native_downloader=True))
 
     data = df.to_pydict()
-    assert len(data["data"]) == 6
+    assert len(data["data"]) == 12
     for img_bytes in data["data"]:
         assert img_bytes is not None
 
@@ -54,15 +54,15 @@ def test_url_download_aws_s3_public_bucket_native_downloader_io_thread_change(
     df = df.with_column("data", df["urls"].url.download(io_config=aws_public_s3_config, use_native_downloader=True))
 
     data = df.to_pydict()
-    assert len(data["data"]) == 6
+    assert len(data["data"]) == 12
     for img_bytes in data["data"]:
         assert img_bytes is not None
     daft.io.set_io_pool_num_threads(2)
     df = daft.from_pydict(data)
     df = df.with_column("data", df["urls"].url.download(io_config=aws_public_s3_config, use_native_downloader=True))
 
     data = df.to_pydict()
-    assert len(data["data"]) == 6
+    assert len(data["data"]) == 12
     for img_bytes in data["data"]:
         assert img_bytes is not None