Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance UTF-8 support for filename handling in downloads #19161

Merged
merged 6 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 7 additions & 8 deletions lib/galaxy/datatypes/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
FILENAME_VALID_CHARS,
inflector,
iter_start_of_line,
toContentDisposition,
unicodify,
UNKNOWN,
)
Expand Down Expand Up @@ -437,7 +438,7 @@ def _serve_raw(
element_identifier=kwd.get("element_identifier"),
filename_pattern=kwd.get("filename_pattern"),
)
headers["Content-Disposition"] = f'attachment; filename="{filename}"'
headers["Content-Disposition"] = toContentDisposition(filename)
return open(dataset.get_file_name(), mode="rb"), headers

def to_archive(self, dataset: DatasetProtocol, name: str = "") -> Iterable:
Expand Down Expand Up @@ -483,7 +484,7 @@ def _serve_file_download(self, headers, data, trans, to_ext, file_size, **kwd):
headers["content-type"] = (
"application/octet-stream" # force octet-stream so Safari doesn't append mime extensions to filename
)
headers["Content-Disposition"] = f'attachment; filename="{filename}"'
headers["Content-Disposition"] = toContentDisposition(filename)
return open(data.get_file_name(), "rb"), headers

def _serve_binary_file_contents_as_text(self, trans, data, headers, file_size, max_peek_size):
Expand Down Expand Up @@ -660,16 +661,13 @@ def _download_filename(
element_identifier: Optional[str] = None,
filename_pattern: Optional[str] = None,
) -> str:
def escape(raw_identifier):
return "".join(c in FILENAME_VALID_CHARS and c or "_" for c in raw_identifier)[0:150]

if not to_ext or to_ext == "data":
# If a client requests to_ext with the extension 'data', they are
# deferring to the server, set it based on datatype.
to_ext = dataset.extension

template_values = {
"name": escape(dataset.name),
"name": dataset.name,
"ext": to_ext,
"hid": dataset.hid,
}
Expand All @@ -682,8 +680,9 @@ def escape(raw_identifier):

if hdca is not None:
# Use collection context to build up filename.
template_values["element_identifier"] = element_identifier
template_values["hdca_name"] = escape(hdca.name)
if element_identifier is not None:
template_values["element_identifier"] = element_identifier
template_values["hdca_name"] = hdca.name
template_values["hdca_hid"] = hdca.hid

return string.Template(filename_pattern).substitute(**template_values)
Expand Down
7 changes: 7 additions & 0 deletions lib/galaxy/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
Union,
)
from urllib.parse import (
quote,
urlencode,
urlparse,
urlsplit,
Expand Down Expand Up @@ -2006,3 +2007,9 @@ def lowercase_alphanum_to_hex(lowercase_alphanum: str) -> str:
import numpy as np

return np.base_repr(int(lowercase_alphanum, 36), 16).lower()


def toContentDisposition(filename: str) -> str:
sanitized_filename = "".join(c in FILENAME_VALID_CHARS and c or "_" for c in filename)[0:150]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You will also need to apply the file length limit and exclude certain characters inutf8_encoded_filename as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and the name should be to_content_disposition or similar, sorry my brain was in frontend mode when I suggested the change 😄

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, also the limit is applied to 150 characters shouldn't it be 255?

Copy link
Collaborator Author

@arash77 arash77 Nov 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And by exclude certain characters you mean which characters?
Should this be excluded? /\?%*:|"<>

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I couldn't find any specific limitation on characters from the link but I found out that for a typical filename, the characters limitation is 256 characters and /\?%*:|"<> are not allowed.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good. I bet most browsers are putting proper restrictions in place anyway.

utf8_encoded_filename = quote(filename, safe="")
return f"attachment; filename=\"{sanitized_filename}\"; filename*=UTF-8''{utf8_encoded_filename}"
4 changes: 2 additions & 2 deletions lib/galaxy/util/zipstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import zipstream

from galaxy.util import toContentDisposition
from .path import safe_walk

CRC32_MIN = 1444
Expand Down Expand Up @@ -41,8 +42,7 @@ def response(self) -> Iterator[bytes]:
def get_headers(self) -> Dict[str, str]:
headers = {}
if self.archive_name:
archive_name = self.archive_name.encode("latin-1", "replace").decode("latin-1")
headers["Content-Disposition"] = f'attachment; filename="{archive_name}.zip"'
headers["Content-Disposition"] = toContentDisposition(f"{self.archive_name}.zip")
if self.upstream_mod_zip:
headers["X-Archive-Files"] = "zip"
else:
Expand Down
2 changes: 2 additions & 0 deletions lib/galaxy_test/api/test_dataset_collections.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import zipfile
from io import BytesIO
from typing import List
from urllib.parse import quote

from galaxy.util.unittest_utils import skip_if_github_down
from galaxy_test.base.api_asserts import assert_object_id_error
Expand Down Expand Up @@ -189,6 +190,7 @@ def test_download_non_english_characters(self):
hdca_id = self.dataset_populator.fetch(payload, wait=True).json()["outputs"][0]["id"]
create_response = self._download_dataset_collection(history_id=history_id, hdca_id=hdca_id)
self._assert_status_code_is(create_response, 200)
assert quote(name, safe="") in create_response.headers["Content-Disposition"]

@requires_new_user
def test_hda_security(self):
Expand Down
8 changes: 8 additions & 0 deletions lib/galaxy_test/api/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
Dict,
List,
)
from urllib.parse import quote

from galaxy.model.unittest_utils.store_fixtures import (
deferred_hda_model_store_dict,
Expand Down Expand Up @@ -897,3 +898,10 @@ def test_cannot_update_datatype_on_immutable_history(self, history_id):
response = self._put(f"histories/{history_id}/contents/{hda_id}", data={"datatype": "tabular"}, json=True)
self._assert_status_code_is(response, 403)
assert response.json()["err_msg"] == "History is immutable"

def test_download_non_english_characters(self, history_id):
name = "دیتاست"
hda = self.dataset_populator.new_dataset(history_id=history_id, name=name, content="data", wait=True)
response = self._get(f"histories/{history_id}/contents/{hda['id']}/display?to_ext=json")
self._assert_status_code_is(response, 200)
assert quote(name, safe="") in response.headers["Content-Disposition"]
Loading