From 60eaaf35d585dd27dfa26f606d907b790b17663b Mon Sep 17 00:00:00 2001 From: Jay Chia Date: Fri, 22 Sep 2023 11:43:22 -0700 Subject: [PATCH] Add proper error handling for response --- src/daft-io/src/http.rs | 5 +- tests/integration/io/test_list_files_http.py | 88 +++++++++----------- 2 files changed, 43 insertions(+), 50 deletions(-) diff --git a/src/daft-io/src/http.rs b/src/daft-io/src/http.rs index 8761245ba0..b1c59e6621 100644 --- a/src/daft-io/src/http.rs +++ b/src/daft-io/src/http.rs @@ -225,7 +225,10 @@ impl ObjectSource for HttpSource { let response = request .send() .await - .context(UnableToConnectSnafu:: { path: path.into() })?; + .context(UnableToConnectSnafu:: { path: path.into() })? + .error_for_status() + .with_context(|_| UnableToOpenFileSnafu { path })?; + match response.headers().get("content-type") { // If the content-type is text/html, we treat the data on this path as a traversable "directory" Some(header_value) if header_value.to_str().map_or(false, |v| v == "text/html") => { diff --git a/tests/integration/io/test_list_files_http.py b/tests/integration/io/test_list_files_http.py index 9f31b9c93d..6243ae6385 100644 --- a/tests/integration/io/test_list_files_http.py +++ b/tests/integration/io/test_list_files_http.py @@ -12,19 +12,6 @@ def compare_http_result(daft_ls_result: list, fsspec_result: list): daft_files = [(f["path"], f["type"].lower()) for f in daft_ls_result] httpfs_files = [(f["name"], f["type"]) for f in fsspec_result] - - # Perform necessary post-processing of fsspec results to match expected behavior from Daft: - - # # NOTE: gcsfs sometimes does not return the trailing / for directories, so we have to ensure it - # gcsfs_files = [ - # (f"{path.rstrip('/')}/", type_) if type_ == "directory" else (path, type_) for path, type_ in gcsfs_files - # ] - - # # NOTE: gcsfs will sometimes return 0-sized marker files for manually-created folders, which we ignore here - # # Be careful here because this will end up pruning any truly size-0 files that are actually files and not folders! - # size_0_files = {f"gs://{f['name']}" for f in fsspec_result if f["size"] == 0 and f["type"] == "file"} - # gcsfs_files = [(path, type_) for path, type_ in gcsfs_files if path not in size_0_files] - assert len(daft_files) == len(httpfs_files) assert sorted(daft_files) == sorted(httpfs_files) @@ -63,39 +50,42 @@ def test_http_flat_directory_listing(path, nginx_http_url): compare_http_result(daft_ls_result, fsspec_result) -# @pytest.mark.integration() -# def test_gs_single_file_listing(): -# path = f"gs://{BUCKET}/test_ls/file.txt" -# fs = gcsfs.GCSFileSystem() -# daft_ls_result = io_list(path) -# fsspec_result = fs.ls(path, detail=True) -# compare_http_result(daft_ls_result, fsspec_result) - - -# @pytest.mark.integration() -# def test_gs_notfound(): -# path = f"gs://{BUCKET}/test_ls/MISSING" -# fs = gcsfs.GCSFileSystem() -# with pytest.raises(FileNotFoundError): -# fs.ls(path, detail=True) - -# # NOTE: Google Cloud does not return a 404 to indicate anything missing, but just returns empty results -# # Thus Daft is unable to differentiate between "missing" folders and "empty" folders -# daft_ls_result = io_list(path) -# assert daft_ls_result == [] - - -# @pytest.mark.integration() -# @pytest.mark.parametrize( -# "path", -# [ -# f"", -# f"/", -# ], -# ) -# def test_http_flat_directory_listing_recursive(path, nginx_http_url): -# http_path = f"{nginx_http_url}/{path}" -# fs = HTTPFileSystem() -# fsspec_result = list(fs.glob(http_path.rstrip("/") + "/**", detail=True).values()) -# # daft_ls_result = io_list(http_path, recursive=True) -# # compare_http_result(daft_ls_result, fsspec_result) +@pytest.mark.integration() +def test_gs_single_file_listing(nginx_http_url): + path = f"{nginx_http_url}/test_ls/file.txt" + daft_ls_result = io_list(path) + + # NOTE: FSSpec will return size 0 list for this case, but we want to return 1 element to be + # consistent with behavior of our other file listing utilities + # fs = HTTPFileSystem() + # fsspec_result = fs.ls(path, detail=True) + + assert len(daft_ls_result) == 1 + assert daft_ls_result[0] == {"path": path, "size": 24, "type": "File"} + + +@pytest.mark.integration() +def test_http_notfound(nginx_http_url): + path = f"{nginx_http_url}/test_ls/MISSING" + fs = HTTPFileSystem() + with pytest.raises(FileNotFoundError, match=path): + fs.ls(path, detail=True) + + with pytest.raises(FileNotFoundError, match=path): + io_list(path) + + +@pytest.mark.integration() +@pytest.mark.parametrize( + "path", + [ + f"", + f"/", + ], +) +def test_http_flat_directory_listing_recursive(path, nginx_http_url): + http_path = f"{nginx_http_url}/{path}" + fs = HTTPFileSystem() + fsspec_result = list(fs.glob(http_path.rstrip("/") + "/**", detail=True).values()) + daft_ls_result = io_list(http_path, recursive=True) + compare_http_result(daft_ls_result, fsspec_result)