Add tests for absolute and absolute-base URL cases

Eventual-Inc · Sep 26, 2023 · c459e94 · c459e94
1 parent 966cbee
commit c459e94
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 10 deletions.
diff --git a/src/daft-io/src/http.rs b/src/daft-io/src/http.rs
@@ -75,13 +75,12 @@ enum Error {
 /// This function will look for `<a href=***>` tags and return all the links that it finds as
 /// absolute URLs
 fn _get_file_metadata_from_html(path: &str, text: &str) -> super::Result<Vec<FileMetadata>> {
-    let path = format!("{}/", path.trim_end_matches('/')); // Ensure suffix is a single '/' so that it properly works with Url::join
-    let path_url = url::Url::parse(path.as_str()).with_context(|_| InvalidUrlSnafu { path })?;
+    let path_url = url::Url::parse(path).with_context(|_| InvalidUrlSnafu { path })?;
     let metas = HTML_A_TAG_HREF_RE
         .captures_iter(text)
         .map(|captures| {
+            // Parse the matched URL into an absolute URL
             let matched_url = captures.name("url").unwrap().as_str();
-
             let absolute_path = if let Ok(parsed_matched_url) = url::Url::parse(matched_url) {
                 // matched_url is already an absolute path
                 parsed_matched_url
@@ -91,7 +90,7 @@ fn _get_file_metadata_from_html(path: &str, text: &str) -> super::Result<Vec<Fil
                 base.join(matched_url)
                     .with_context(|_| InvalidUrlSnafu { path: matched_url })?
             } else {
-                // matched_url is a path relative to `path`
+                // matched_url is a path relative to `path` and needs to be joined
                 path_url
                     .join(matched_url)
                     .with_context(|_| InvalidUrlSnafu { path: matched_url })?
@@ -237,6 +236,15 @@ impl ObjectSource for HttpSource {
             .error_for_status()
             .with_context(|_| UnableToOpenFileSnafu { path })?;
 
+        // Reconstruct the actual path of the request, which may have been redirected via a 301
+        // This is important because downstream URL joining logic relies on proper trailing-slashes/index.html
+        let path = response.url().to_string();
+        let path = if path.ends_with('/') {
+            format!("{}/", path.trim_end_matches('/'))
+        } else {
+            path
+        };
+
         match response.headers().get("content-type") {
             // If the content-type is text/html, we treat the data on this path as a traversable "directory"
             Some(header_value) if header_value.to_str().map_or(false, |v| v == "text/html") => {
@@ -246,7 +254,7 @@ impl ObjectSource for HttpSource {
                     .with_context(|_| UnableToParseUtf8BodySnafu {
                         path: path.to_string(),
                     })?;
-                let file_metadatas = _get_file_metadata_from_html(path, text.as_str())?;
+                let file_metadatas = _get_file_metadata_from_html(path.as_str(), text.as_str())?;
                 Ok(LSResult {
                     files: file_metadatas,
                     continuation_token: None,

diff --git a/tests/integration/io/test_list_files_http.py b/tests/integration/io/test_list_files_http.py
@@ -37,14 +37,14 @@ def nginx_http_url(nginx_config, tmpdir_factory):
     [
         f"",
         f"/",
-        f"test_ls",
-        f"test_ls/",
-        f"test_ls//",
-        f"test_ls/paginated-10-files/",
+        f"/test_ls",
+        f"/test_ls/",
+        f"/test_ls//",
+        f"/test_ls/paginated-10-files/",
     ],
 )
 def test_http_flat_directory_listing(path, nginx_http_url):
-    http_path = f"{nginx_http_url}/{path}"
+    http_path = f"{nginx_http_url}{path}"
     fs = HTTPFileSystem()
     fsspec_result = fs.ls(http_path, detail=True)
     daft_ls_result = io_list(http_path)
@@ -90,3 +90,57 @@ def test_http_flat_directory_listing_recursive(path, nginx_http_url):
     fsspec_result = list(fs.glob(http_path.rstrip("/") + "/**", detail=True).values())
     daft_ls_result = io_list(http_path, recursive=True)
     compare_http_result(daft_ls_result, fsspec_result)
+
+
+@pytest.mark.integration()
+def test_http_listing_absolute_urls(nginx_config, tmpdir):
+    nginx_http_url, _ = nginx_config
+
+    tmpdir = Path(tmpdir)
+    test_manifest_file = tmpdir / "index.html"
+    test_manifest_file.write_text(
+        f"""
+        <a href="{nginx_http_url}/other.html">this is an absolute path to a file</a>
+        <a href="{nginx_http_url}/dir/">this is an absolute path to a dir</a>
+    """
+    )
+
+    with mount_data_nginx(nginx_config, tmpdir):
+        http_path = f"{nginx_http_url}/index.html"
+        daft_ls_result = io_list(http_path, recursive=False)
+
+        # NOTE: Cannot use fsspec here because they do not correctly find the links
+        # fsspec_result = fs.ls(http_path, detail=True)
+        # compare_http_result(daft_ls_result, fsspec_result)
+
+        assert daft_ls_result == [
+            {"type": "File", "path": f"{nginx_http_url}/other.html", "size": None},
+            {"type": "Directory", "path": f"{nginx_http_url}/dir/", "size": None},
+        ]
+
+
+@pytest.mark.integration()
+def test_http_listing_absolute_base_urls(nginx_config, tmpdir):
+    nginx_http_url, _ = nginx_config
+
+    tmpdir = Path(tmpdir)
+    test_manifest_file = tmpdir / "index.html"
+    test_manifest_file.write_text(
+        f"""
+        <a href="/other.html">this is an absolute base path to a file</a>
+        <a href="/dir/">this is an absolute base path to a dir</a>
+    """
+    )
+
+    with mount_data_nginx(nginx_config, tmpdir):
+        http_path = f"{nginx_http_url}/index.html"
+        daft_ls_result = io_list(http_path, recursive=False)
+
+        # NOTE: Cannot use fsspec here because they do not correctly find the links
+        # fsspec_result = fs.ls(http_path, detail=True)
+        # compare_http_result(daft_ls_result, fsspec_result)
+
+        assert daft_ls_result == [
+            {"type": "File", "path": f"{nginx_http_url}/other.html", "size": None},
+            {"type": "Directory", "path": f"{nginx_http_url}/dir/", "size": None},
+        ]