From 32d83bc4de994e5505324a0d1eca666a8ec3e260 Mon Sep 17 00:00:00 2001 From: Arash Date: Tue, 19 Nov 2024 15:24:48 +0100 Subject: [PATCH 1/6] Enhance filename handling in data download and zipstream headers to support UTF-8 encoding --- lib/galaxy/datatypes/data.py | 21 ++++++++++++++------- lib/galaxy/util/zipstream.py | 5 ++++- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py index 23451220ca58..a88f0d229896 100644 --- a/lib/galaxy/datatypes/data.py +++ b/lib/galaxy/datatypes/data.py @@ -19,6 +19,7 @@ TYPE_CHECKING, Union, ) +from urllib.parse import quote from markupsafe import escape from typing_extensions import Literal @@ -430,14 +431,16 @@ def _serve_raw( headers["content-type"] = ( "application/octet-stream" # force octet-stream so Safari doesn't append mime extensions to filename ) - filename = self._download_filename( + filename, utf8_encoded_filename = self._download_filename( dataset, to_ext, hdca=kwd.get("hdca"), element_identifier=kwd.get("element_identifier"), filename_pattern=kwd.get("filename_pattern"), ) - headers["Content-Disposition"] = f'attachment; filename="{filename}"' + headers["Content-Disposition"] = ( + f"attachment; filename=\"{filename}\"; filename*=UTF-8''{utf8_encoded_filename}" + ) return open(dataset.get_file_name(), mode="rb"), headers def to_archive(self, dataset: DatasetProtocol, name: str = "") -> Iterable: @@ -473,7 +476,7 @@ def _serve_file_download(self, headers, data, trans, to_ext, file_size, **kwd): return self._archive_composite_dataset(trans, data, headers, do_action=kwd.get("do_action", "zip")) else: headers["Content-Length"] = str(file_size) - filename = self._download_filename( + filename, utf8_encoded_filename = self._download_filename( data, to_ext, hdca=kwd.get("hdca"), @@ -483,7 +486,9 @@ def _serve_file_download(self, headers, data, trans, to_ext, file_size, **kwd): headers["content-type"] = ( "application/octet-stream" # force octet-stream so Safari doesn't append mime extensions to filename ) - headers["Content-Disposition"] = f'attachment; filename="{filename}"' + headers["Content-Disposition"] = ( + f"attachment; filename=\"{filename}\"; filename*=UTF-8''{utf8_encoded_filename}" + ) return open(data.get_file_name(), "rb"), headers def _serve_binary_file_contents_as_text(self, trans, data, headers, file_size, max_peek_size): @@ -659,7 +664,7 @@ def _download_filename( hdca: Optional[DatasetHasHidProtocol] = None, element_identifier: Optional[str] = None, filename_pattern: Optional[str] = None, - ) -> str: + ) -> Tuple[str, str]: def escape(raw_identifier): return "".join(c in FILENAME_VALID_CHARS and c or "_" for c in raw_identifier)[0:150] @@ -685,8 +690,10 @@ def escape(raw_identifier): template_values["element_identifier"] = element_identifier template_values["hdca_name"] = escape(hdca.name) template_values["hdca_hid"] = hdca.hid - - return string.Template(filename_pattern).substitute(**template_values) + filename = string.Template(filename_pattern).substitute(**template_values) + template_values["name"] = quote(dataset.name, safe="") + utf8_encoded_filename = string.Template(filename_pattern).substitute(**template_values) + return filename, utf8_encoded_filename def display_name(self, dataset: HasName) -> str: """Returns formatted html of dataset name""" diff --git a/lib/galaxy/util/zipstream.py b/lib/galaxy/util/zipstream.py index 1cd1c77649e7..3ab85ac19666 100644 --- a/lib/galaxy/util/zipstream.py +++ b/lib/galaxy/util/zipstream.py @@ -42,7 +42,10 @@ def get_headers(self) -> Dict[str, str]: headers = {} if self.archive_name: archive_name = self.archive_name.encode("latin-1", "replace").decode("latin-1") - headers["Content-Disposition"] = f'attachment; filename="{archive_name}.zip"' + utf8_encoded_filename = quote(self.archive_name, safe="") + headers["Content-Disposition"] = ( + f"attachment; filename=\"{archive_name}.zip\"; filename*=UTF-8''{utf8_encoded_filename}.zip" + ) if self.upstream_mod_zip: headers["X-Archive-Files"] = "zip" else: From 4bc21f885eceea3606302a6a49a603535b04b464 Mon Sep 17 00:00:00 2001 From: Arash Date: Tue, 19 Nov 2024 16:11:29 +0100 Subject: [PATCH 2/6] Add tests for datasets and collections download with non english characters --- lib/galaxy_test/api/test_dataset_collections.py | 2 ++ lib/galaxy_test/api/test_datasets.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/lib/galaxy_test/api/test_dataset_collections.py b/lib/galaxy_test/api/test_dataset_collections.py index c9428ad84ffe..d7710c57b2fa 100644 --- a/lib/galaxy_test/api/test_dataset_collections.py +++ b/lib/galaxy_test/api/test_dataset_collections.py @@ -1,6 +1,7 @@ import zipfile from io import BytesIO from typing import List +from urllib.parse import quote from galaxy.util.unittest_utils import skip_if_github_down from galaxy_test.base.api_asserts import assert_object_id_error @@ -189,6 +190,7 @@ def test_download_non_english_characters(self): hdca_id = self.dataset_populator.fetch(payload, wait=True).json()["outputs"][0]["id"] create_response = self._download_dataset_collection(history_id=history_id, hdca_id=hdca_id) self._assert_status_code_is(create_response, 200) + assert quote(name, safe="") in create_response.headers["Content-Disposition"] @requires_new_user def test_hda_security(self): diff --git a/lib/galaxy_test/api/test_datasets.py b/lib/galaxy_test/api/test_datasets.py index dd9084c8c4f4..87ec35daf517 100644 --- a/lib/galaxy_test/api/test_datasets.py +++ b/lib/galaxy_test/api/test_datasets.py @@ -6,6 +6,7 @@ Dict, List, ) +from urllib.parse import quote from galaxy.model.unittest_utils.store_fixtures import ( deferred_hda_model_store_dict, @@ -897,3 +898,10 @@ def test_cannot_update_datatype_on_immutable_history(self, history_id): response = self._put(f"histories/{history_id}/contents/{hda_id}", data={"datatype": "tabular"}, json=True) self._assert_status_code_is(response, 403) assert response.json()["err_msg"] == "History is immutable" + + def test_download_non_english_characters(self, history_id): + name = "دیتاست" + hda = self.dataset_populator.new_dataset(history_id=history_id, name=name, content="data", wait=True) + response = self._get(f"histories/{history_id}/contents/{hda['id']}/display?to_ext=json") + self._assert_status_code_is(response, 200) + assert quote(name, safe="") in response.headers["Content-Disposition"] From 18a0b927333b42a882d2bea37682f8e6e188a071 Mon Sep 17 00:00:00 2001 From: Arash Date: Wed, 20 Nov 2024 11:11:00 +0100 Subject: [PATCH 3/6] Refactor filename handling in data download to use a dedicated toContentDisposition function for improved UTF-8 support --- lib/galaxy/datatypes/data.py | 32 ++++++++++++-------------------- lib/galaxy/util/__init__.py | 6 ++++++ lib/galaxy/util/zipstream.py | 7 ++----- 3 files changed, 20 insertions(+), 25 deletions(-) diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py index a88f0d229896..0677498f68c4 100644 --- a/lib/galaxy/datatypes/data.py +++ b/lib/galaxy/datatypes/data.py @@ -19,7 +19,6 @@ TYPE_CHECKING, Union, ) -from urllib.parse import quote from markupsafe import escape from typing_extensions import Literal @@ -51,6 +50,7 @@ FILENAME_VALID_CHARS, inflector, iter_start_of_line, + toContentDisposition, unicodify, UNKNOWN, ) @@ -431,16 +431,14 @@ def _serve_raw( headers["content-type"] = ( "application/octet-stream" # force octet-stream so Safari doesn't append mime extensions to filename ) - filename, utf8_encoded_filename = self._download_filename( + filename = self._download_filename( dataset, to_ext, hdca=kwd.get("hdca"), element_identifier=kwd.get("element_identifier"), filename_pattern=kwd.get("filename_pattern"), ) - headers["Content-Disposition"] = ( - f"attachment; filename=\"{filename}\"; filename*=UTF-8''{utf8_encoded_filename}" - ) + headers["Content-Disposition"] = toContentDisposition(filename) return open(dataset.get_file_name(), mode="rb"), headers def to_archive(self, dataset: DatasetProtocol, name: str = "") -> Iterable: @@ -476,7 +474,7 @@ def _serve_file_download(self, headers, data, trans, to_ext, file_size, **kwd): return self._archive_composite_dataset(trans, data, headers, do_action=kwd.get("do_action", "zip")) else: headers["Content-Length"] = str(file_size) - filename, utf8_encoded_filename = self._download_filename( + filename = self._download_filename( data, to_ext, hdca=kwd.get("hdca"), @@ -486,9 +484,7 @@ def _serve_file_download(self, headers, data, trans, to_ext, file_size, **kwd): headers["content-type"] = ( "application/octet-stream" # force octet-stream so Safari doesn't append mime extensions to filename ) - headers["Content-Disposition"] = ( - f"attachment; filename=\"{filename}\"; filename*=UTF-8''{utf8_encoded_filename}" - ) + headers["Content-Disposition"] = toContentDisposition(filename) return open(data.get_file_name(), "rb"), headers def _serve_binary_file_contents_as_text(self, trans, data, headers, file_size, max_peek_size): @@ -664,17 +660,14 @@ def _download_filename( hdca: Optional[DatasetHasHidProtocol] = None, element_identifier: Optional[str] = None, filename_pattern: Optional[str] = None, - ) -> Tuple[str, str]: - def escape(raw_identifier): - return "".join(c in FILENAME_VALID_CHARS and c or "_" for c in raw_identifier)[0:150] - + ) -> str: if not to_ext or to_ext == "data": # If a client requests to_ext with the extension 'data', they are # deferring to the server, set it based on datatype. to_ext = dataset.extension template_values = { - "name": escape(dataset.name), + "name": dataset.name, "ext": to_ext, "hid": dataset.hid, } @@ -687,13 +680,12 @@ def escape(raw_identifier): if hdca is not None: # Use collection context to build up filename. - template_values["element_identifier"] = element_identifier - template_values["hdca_name"] = escape(hdca.name) + if element_identifier is not None: + template_values["element_identifier"] = element_identifier + template_values["hdca_name"] = hdca.name template_values["hdca_hid"] = hdca.hid - filename = string.Template(filename_pattern).substitute(**template_values) - template_values["name"] = quote(dataset.name, safe="") - utf8_encoded_filename = string.Template(filename_pattern).substitute(**template_values) - return filename, utf8_encoded_filename + + return string.Template(filename_pattern).substitute(**template_values) def display_name(self, dataset: HasName) -> str: """Returns formatted html of dataset name""" diff --git a/lib/galaxy/util/__init__.py b/lib/galaxy/util/__init__.py index 1b0ec302fb71..f0c1938f3021 100644 --- a/lib/galaxy/util/__init__.py +++ b/lib/galaxy/util/__init__.py @@ -49,6 +49,7 @@ Union, ) from urllib.parse import ( + quote, urlencode, urlparse, urlsplit, @@ -2006,3 +2007,8 @@ def lowercase_alphanum_to_hex(lowercase_alphanum: str) -> str: import numpy as np return np.base_repr(int(lowercase_alphanum, 36), 16).lower() + + +def toContentDisposition(filename: str) -> str: + utf8_encoded_filename = quote(filename, safe="") + return f"attachment; filename=\"{utf8_encoded_filename}\"; filename*=UTF-8''{utf8_encoded_filename}" diff --git a/lib/galaxy/util/zipstream.py b/lib/galaxy/util/zipstream.py index 3ab85ac19666..8a643d03d3e1 100644 --- a/lib/galaxy/util/zipstream.py +++ b/lib/galaxy/util/zipstream.py @@ -11,6 +11,7 @@ import zipstream +from galaxy.util import toContentDisposition from .path import safe_walk CRC32_MIN = 1444 @@ -41,11 +42,7 @@ def response(self) -> Iterator[bytes]: def get_headers(self) -> Dict[str, str]: headers = {} if self.archive_name: - archive_name = self.archive_name.encode("latin-1", "replace").decode("latin-1") - utf8_encoded_filename = quote(self.archive_name, safe="") - headers["Content-Disposition"] = ( - f"attachment; filename=\"{archive_name}.zip\"; filename*=UTF-8''{utf8_encoded_filename}.zip" - ) + headers["Content-Disposition"] = toContentDisposition(f"{self.archive_name}.zip") if self.upstream_mod_zip: headers["X-Archive-Files"] = "zip" else: From 1388ea151f8539e143c084dde8b668960ebde356 Mon Sep 17 00:00:00 2001 From: Arash Date: Wed, 20 Nov 2024 13:01:59 +0100 Subject: [PATCH 4/6] Sanitize filenames in toContentDisposition function to ensure valid characters are used --- lib/galaxy/util/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/util/__init__.py b/lib/galaxy/util/__init__.py index f0c1938f3021..e8ce53de1b61 100644 --- a/lib/galaxy/util/__init__.py +++ b/lib/galaxy/util/__init__.py @@ -2010,5 +2010,6 @@ def lowercase_alphanum_to_hex(lowercase_alphanum: str) -> str: def toContentDisposition(filename: str) -> str: + sanitized_filename = "".join(c in FILENAME_VALID_CHARS and c or "_" for c in filename)[0:150] utf8_encoded_filename = quote(filename, safe="") - return f"attachment; filename=\"{utf8_encoded_filename}\"; filename*=UTF-8''{utf8_encoded_filename}" + return f"attachment; filename=\"{sanitized_filename}\"; filename*=UTF-8''{utf8_encoded_filename}" From 77767b3dc19bcea31047236f36a203a0f8c82dc5 Mon Sep 17 00:00:00 2001 From: Arash Date: Wed, 20 Nov 2024 14:40:29 +0100 Subject: [PATCH 5/6] Rename toContentDisposition function to to_content_disposition apply 255 characters limit to filename exclude `/\?%*:|"<>` characters from encoded filename --- lib/galaxy/datatypes/data.py | 6 +++--- lib/galaxy/util/__init__.py | 7 ++++--- lib/galaxy/util/zipstream.py | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py index 0677498f68c4..1cf74989976a 100644 --- a/lib/galaxy/datatypes/data.py +++ b/lib/galaxy/datatypes/data.py @@ -50,7 +50,7 @@ FILENAME_VALID_CHARS, inflector, iter_start_of_line, - toContentDisposition, + to_content_disposition, unicodify, UNKNOWN, ) @@ -438,7 +438,7 @@ def _serve_raw( element_identifier=kwd.get("element_identifier"), filename_pattern=kwd.get("filename_pattern"), ) - headers["Content-Disposition"] = toContentDisposition(filename) + headers["Content-Disposition"] = to_content_disposition(filename) return open(dataset.get_file_name(), mode="rb"), headers def to_archive(self, dataset: DatasetProtocol, name: str = "") -> Iterable: @@ -484,7 +484,7 @@ def _serve_file_download(self, headers, data, trans, to_ext, file_size, **kwd): headers["content-type"] = ( "application/octet-stream" # force octet-stream so Safari doesn't append mime extensions to filename ) - headers["Content-Disposition"] = toContentDisposition(filename) + headers["Content-Disposition"] = to_content_disposition(filename) return open(data.get_file_name(), "rb"), headers def _serve_binary_file_contents_as_text(self, trans, data, headers, file_size, max_peek_size): diff --git a/lib/galaxy/util/__init__.py b/lib/galaxy/util/__init__.py index e8ce53de1b61..9b3044fb429f 100644 --- a/lib/galaxy/util/__init__.py +++ b/lib/galaxy/util/__init__.py @@ -2009,7 +2009,8 @@ def lowercase_alphanum_to_hex(lowercase_alphanum: str) -> str: return np.base_repr(int(lowercase_alphanum, 36), 16).lower() -def toContentDisposition(filename: str) -> str: - sanitized_filename = "".join(c in FILENAME_VALID_CHARS and c or "_" for c in filename)[0:150] - utf8_encoded_filename = quote(filename, safe="") +def to_content_disposition(target: str) -> str: + filename, ext = os.path.splitext(target) + sanitized_filename = "".join(c in FILENAME_VALID_CHARS and c or "_" for c in filename)[0:255] + ext + utf8_encoded_filename = quote(re.sub(r'[\/\\\?%*:|"<>]', "_", filename), safe="")[0:255] + ext return f"attachment; filename=\"{sanitized_filename}\"; filename*=UTF-8''{utf8_encoded_filename}" diff --git a/lib/galaxy/util/zipstream.py b/lib/galaxy/util/zipstream.py index 8a643d03d3e1..7b81d1c33f4b 100644 --- a/lib/galaxy/util/zipstream.py +++ b/lib/galaxy/util/zipstream.py @@ -11,7 +11,7 @@ import zipstream -from galaxy.util import toContentDisposition +from galaxy.util import to_content_disposition from .path import safe_walk CRC32_MIN = 1444 @@ -42,7 +42,7 @@ def response(self) -> Iterator[bytes]: def get_headers(self) -> Dict[str, str]: headers = {} if self.archive_name: - headers["Content-Disposition"] = toContentDisposition(f"{self.archive_name}.zip") + headers["Content-Disposition"] = to_content_disposition(f"{self.archive_name}.zip") if self.upstream_mod_zip: headers["X-Archive-Files"] = "zip" else: From dfafd7a8beb2ebf68c29017d24eaf26279548f68 Mon Sep 17 00:00:00 2001 From: Arash Date: Wed, 20 Nov 2024 14:46:53 +0100 Subject: [PATCH 6/6] exclude ext from character_limit --- lib/galaxy/util/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/galaxy/util/__init__.py b/lib/galaxy/util/__init__.py index 9b3044fb429f..d2c8617c08bd 100644 --- a/lib/galaxy/util/__init__.py +++ b/lib/galaxy/util/__init__.py @@ -2011,6 +2011,7 @@ def lowercase_alphanum_to_hex(lowercase_alphanum: str) -> str: def to_content_disposition(target: str) -> str: filename, ext = os.path.splitext(target) - sanitized_filename = "".join(c in FILENAME_VALID_CHARS and c or "_" for c in filename)[0:255] + ext - utf8_encoded_filename = quote(re.sub(r'[\/\\\?%*:|"<>]', "_", filename), safe="")[0:255] + ext + character_limit = 255 - len(ext) + sanitized_filename = "".join(c in FILENAME_VALID_CHARS and c or "_" for c in filename)[0:character_limit] + ext + utf8_encoded_filename = quote(re.sub(r'[\/\\\?%*:|"<>]', "_", filename), safe="")[0:character_limit] + ext return f"attachment; filename=\"{sanitized_filename}\"; filename*=UTF-8''{utf8_encoded_filename}"