Skip to content

Commit

Permalink
Add proper error handling for response
Browse files Browse the repository at this point in the history
  • Loading branch information
Jay Chia committed Sep 22, 2023
1 parent 2164549 commit 60eaaf3
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 50 deletions.
5 changes: 4 additions & 1 deletion src/daft-io/src/http.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,10 @@ impl ObjectSource for HttpSource {
let response = request
.send()
.await
.context(UnableToConnectSnafu::<String> { path: path.into() })?;
.context(UnableToConnectSnafu::<String> { path: path.into() })?
.error_for_status()
.with_context(|_| UnableToOpenFileSnafu { path })?;

match response.headers().get("content-type") {
// If the content-type is text/html, we treat the data on this path as a traversable "directory"
Some(header_value) if header_value.to_str().map_or(false, |v| v == "text/html") => {
Expand Down
88 changes: 39 additions & 49 deletions tests/integration/io/test_list_files_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,6 @@
def compare_http_result(daft_ls_result: list, fsspec_result: list):
daft_files = [(f["path"], f["type"].lower()) for f in daft_ls_result]
httpfs_files = [(f["name"], f["type"]) for f in fsspec_result]

# Perform necessary post-processing of fsspec results to match expected behavior from Daft:

# # NOTE: gcsfs sometimes does not return the trailing / for directories, so we have to ensure it
# gcsfs_files = [
# (f"{path.rstrip('/')}/", type_) if type_ == "directory" else (path, type_) for path, type_ in gcsfs_files
# ]

# # NOTE: gcsfs will sometimes return 0-sized marker files for manually-created folders, which we ignore here
# # Be careful here because this will end up pruning any truly size-0 files that are actually files and not folders!
# size_0_files = {f"gs://{f['name']}" for f in fsspec_result if f["size"] == 0 and f["type"] == "file"}
# gcsfs_files = [(path, type_) for path, type_ in gcsfs_files if path not in size_0_files]

assert len(daft_files) == len(httpfs_files)
assert sorted(daft_files) == sorted(httpfs_files)

Expand Down Expand Up @@ -63,39 +50,42 @@ def test_http_flat_directory_listing(path, nginx_http_url):
compare_http_result(daft_ls_result, fsspec_result)


# @pytest.mark.integration()
# def test_gs_single_file_listing():
# path = f"gs://{BUCKET}/test_ls/file.txt"
# fs = gcsfs.GCSFileSystem()
# daft_ls_result = io_list(path)
# fsspec_result = fs.ls(path, detail=True)
# compare_http_result(daft_ls_result, fsspec_result)


# @pytest.mark.integration()
# def test_gs_notfound():
# path = f"gs://{BUCKET}/test_ls/MISSING"
# fs = gcsfs.GCSFileSystem()
# with pytest.raises(FileNotFoundError):
# fs.ls(path, detail=True)

# # NOTE: Google Cloud does not return a 404 to indicate anything missing, but just returns empty results
# # Thus Daft is unable to differentiate between "missing" folders and "empty" folders
# daft_ls_result = io_list(path)
# assert daft_ls_result == []


# @pytest.mark.integration()
# @pytest.mark.parametrize(
# "path",
# [
# f"",
# f"/",
# ],
# )
# def test_http_flat_directory_listing_recursive(path, nginx_http_url):
# http_path = f"{nginx_http_url}/{path}"
# fs = HTTPFileSystem()
# fsspec_result = list(fs.glob(http_path.rstrip("/") + "/**", detail=True).values())
# # daft_ls_result = io_list(http_path, recursive=True)
# # compare_http_result(daft_ls_result, fsspec_result)
@pytest.mark.integration()
def test_gs_single_file_listing(nginx_http_url):
path = f"{nginx_http_url}/test_ls/file.txt"
daft_ls_result = io_list(path)

# NOTE: FSSpec will return size 0 list for this case, but we want to return 1 element to be
# consistent with behavior of our other file listing utilities
# fs = HTTPFileSystem()
# fsspec_result = fs.ls(path, detail=True)

assert len(daft_ls_result) == 1
assert daft_ls_result[0] == {"path": path, "size": 24, "type": "File"}


@pytest.mark.integration()
def test_http_notfound(nginx_http_url):
path = f"{nginx_http_url}/test_ls/MISSING"
fs = HTTPFileSystem()
with pytest.raises(FileNotFoundError, match=path):
fs.ls(path, detail=True)

with pytest.raises(FileNotFoundError, match=path):
io_list(path)


@pytest.mark.integration()
@pytest.mark.parametrize(
"path",
[
f"",
f"/",
],
)
def test_http_flat_directory_listing_recursive(path, nginx_http_url):
http_path = f"{nginx_http_url}/{path}"
fs = HTTPFileSystem()
fsspec_result = list(fs.glob(http_path.rstrip("/") + "/**", detail=True).values())
daft_ls_result = io_list(http_path, recursive=True)
compare_http_result(daft_ls_result, fsspec_result)

0 comments on commit 60eaaf3

Please sign in to comment.