Skip to content

Commit

Permalink
Add tests for absolute and absolute-base URL cases
Browse files Browse the repository at this point in the history
  • Loading branch information
Jay Chia committed Sep 26, 2023
1 parent 966cbee commit c459e94
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 10 deletions.
18 changes: 13 additions & 5 deletions src/daft-io/src/http.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,12 @@ enum Error {
/// This function will look for `<a href=***>` tags and return all the links that it finds as
/// absolute URLs
fn _get_file_metadata_from_html(path: &str, text: &str) -> super::Result<Vec<FileMetadata>> {
let path = format!("{}/", path.trim_end_matches('/')); // Ensure suffix is a single '/' so that it properly works with Url::join
let path_url = url::Url::parse(path.as_str()).with_context(|_| InvalidUrlSnafu { path })?;
let path_url = url::Url::parse(path).with_context(|_| InvalidUrlSnafu { path })?;
let metas = HTML_A_TAG_HREF_RE
.captures_iter(text)
.map(|captures| {
// Parse the matched URL into an absolute URL
let matched_url = captures.name("url").unwrap().as_str();

let absolute_path = if let Ok(parsed_matched_url) = url::Url::parse(matched_url) {
// matched_url is already an absolute path
parsed_matched_url
Expand All @@ -91,7 +90,7 @@ fn _get_file_metadata_from_html(path: &str, text: &str) -> super::Result<Vec<Fil
base.join(matched_url)
.with_context(|_| InvalidUrlSnafu { path: matched_url })?
} else {
// matched_url is a path relative to `path`
// matched_url is a path relative to `path` and needs to be joined
path_url
.join(matched_url)
.with_context(|_| InvalidUrlSnafu { path: matched_url })?
Expand Down Expand Up @@ -237,6 +236,15 @@ impl ObjectSource for HttpSource {
.error_for_status()
.with_context(|_| UnableToOpenFileSnafu { path })?;

// Reconstruct the actual path of the request, which may have been redirected via a 301
// This is important because downstream URL joining logic relies on proper trailing-slashes/index.html
let path = response.url().to_string();
let path = if path.ends_with('/') {
format!("{}/", path.trim_end_matches('/'))
} else {
path
};

match response.headers().get("content-type") {
// If the content-type is text/html, we treat the data on this path as a traversable "directory"
Some(header_value) if header_value.to_str().map_or(false, |v| v == "text/html") => {
Expand All @@ -246,7 +254,7 @@ impl ObjectSource for HttpSource {
.with_context(|_| UnableToParseUtf8BodySnafu {
path: path.to_string(),
})?;
let file_metadatas = _get_file_metadata_from_html(path, text.as_str())?;
let file_metadatas = _get_file_metadata_from_html(path.as_str(), text.as_str())?;
Ok(LSResult {
files: file_metadatas,
continuation_token: None,
Expand Down
64 changes: 59 additions & 5 deletions tests/integration/io/test_list_files_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@ def nginx_http_url(nginx_config, tmpdir_factory):
[
f"",
f"/",
f"test_ls",
f"test_ls/",
f"test_ls//",
f"test_ls/paginated-10-files/",
f"/test_ls",
f"/test_ls/",
f"/test_ls//",
f"/test_ls/paginated-10-files/",
],
)
def test_http_flat_directory_listing(path, nginx_http_url):
http_path = f"{nginx_http_url}/{path}"
http_path = f"{nginx_http_url}{path}"
fs = HTTPFileSystem()
fsspec_result = fs.ls(http_path, detail=True)
daft_ls_result = io_list(http_path)
Expand Down Expand Up @@ -90,3 +90,57 @@ def test_http_flat_directory_listing_recursive(path, nginx_http_url):
fsspec_result = list(fs.glob(http_path.rstrip("/") + "/**", detail=True).values())
daft_ls_result = io_list(http_path, recursive=True)
compare_http_result(daft_ls_result, fsspec_result)


@pytest.mark.integration()
def test_http_listing_absolute_urls(nginx_config, tmpdir):
nginx_http_url, _ = nginx_config

tmpdir = Path(tmpdir)
test_manifest_file = tmpdir / "index.html"
test_manifest_file.write_text(
f"""
<a href="{nginx_http_url}/other.html">this is an absolute path to a file</a>
<a href="{nginx_http_url}/dir/">this is an absolute path to a dir</a>
"""
)

with mount_data_nginx(nginx_config, tmpdir):
http_path = f"{nginx_http_url}/index.html"
daft_ls_result = io_list(http_path, recursive=False)

# NOTE: Cannot use fsspec here because they do not correctly find the links
# fsspec_result = fs.ls(http_path, detail=True)
# compare_http_result(daft_ls_result, fsspec_result)

assert daft_ls_result == [
{"type": "File", "path": f"{nginx_http_url}/other.html", "size": None},
{"type": "Directory", "path": f"{nginx_http_url}/dir/", "size": None},
]


@pytest.mark.integration()
def test_http_listing_absolute_base_urls(nginx_config, tmpdir):
nginx_http_url, _ = nginx_config

tmpdir = Path(tmpdir)
test_manifest_file = tmpdir / "index.html"
test_manifest_file.write_text(
f"""
<a href="/other.html">this is an absolute base path to a file</a>
<a href="/dir/">this is an absolute base path to a dir</a>
"""
)

with mount_data_nginx(nginx_config, tmpdir):
http_path = f"{nginx_http_url}/index.html"
daft_ls_result = io_list(http_path, recursive=False)

# NOTE: Cannot use fsspec here because they do not correctly find the links
# fsspec_result = fs.ls(http_path, detail=True)
# compare_http_result(daft_ls_result, fsspec_result)

assert daft_ls_result == [
{"type": "File", "path": f"{nginx_http_url}/other.html", "size": None},
{"type": "Directory", "path": f"{nginx_http_url}/dir/", "size": None},
]

0 comments on commit c459e94

Please sign in to comment.