From 18a0b927333b42a882d2bea37682f8e6e188a071 Mon Sep 17 00:00:00 2001
From: Arash
Date: Wed, 20 Nov 2024 11:11:00 +0100
Subject: [PATCH] Refactor filename handling in data download to use a dedicated toContentDisposition function for improved UTF-8 support

---
Review note: toContentDisposition was amended in review so that the legacy
filename= parameter carries a whitelisted ASCII fallback instead of repeating
the percent-encoded name. The hunk header and diffstat for
lib/galaxy/util/__init__.py were updated accordingly; the blob hash on its
index line predates this amendment.

 lib/galaxy/datatypes/data.py | 32 ++++++++++++--------------------
 lib/galaxy/util/__init__.py  | 15 +++++++++++++++
 lib/galaxy/util/zipstream.py |  7 ++-----
 3 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py
index a88f0d229896..0677498f68c4 100644
--- a/lib/galaxy/datatypes/data.py
+++ b/lib/galaxy/datatypes/data.py
@@ -19,7 +19,6 @@
     TYPE_CHECKING,
     Union,
 )
-from urllib.parse import quote
 
 from markupsafe import escape
 from typing_extensions import Literal
@@ -51,6 +50,7 @@
     FILENAME_VALID_CHARS,
     inflector,
     iter_start_of_line,
+    toContentDisposition,
     unicodify,
     UNKNOWN,
 )
@@ -431,16 +431,14 @@ def _serve_raw(
         headers["content-type"] = (
             "application/octet-stream"  # force octet-stream so Safari doesn't append mime extensions to filename
         )
-        filename, utf8_encoded_filename = self._download_filename(
+        filename = self._download_filename(
             dataset,
             to_ext,
             hdca=kwd.get("hdca"),
             element_identifier=kwd.get("element_identifier"),
             filename_pattern=kwd.get("filename_pattern"),
         )
-        headers["Content-Disposition"] = (
-            f"attachment; filename=\"{filename}\"; filename*=UTF-8''{utf8_encoded_filename}"
-        )
+        headers["Content-Disposition"] = toContentDisposition(filename)
         return open(dataset.get_file_name(), mode="rb"), headers
 
     def to_archive(self, dataset: DatasetProtocol, name: str = "") -> Iterable:
@@ -476,7 +474,7 @@ def _serve_file_download(self, headers, data, trans, to_ext, file_size, **kwd):
             return self._archive_composite_dataset(trans, data, headers, do_action=kwd.get("do_action", "zip"))
         else:
             headers["Content-Length"] = str(file_size)
-            filename, utf8_encoded_filename = self._download_filename(
+            filename = self._download_filename(
                 data,
                 to_ext,
                 hdca=kwd.get("hdca"),
@@ -486,9 +484,7 @@ def _serve_file_download(self, headers, data, trans, to_ext, file_size, **kwd):
             headers["content-type"] = (
                 "application/octet-stream"  # force octet-stream so Safari doesn't append mime extensions to filename
             )
-            headers["Content-Disposition"] = (
-                f"attachment; filename=\"{filename}\"; filename*=UTF-8''{utf8_encoded_filename}"
-            )
+            headers["Content-Disposition"] = toContentDisposition(filename)
             return open(data.get_file_name(), "rb"), headers
 
     def _serve_binary_file_contents_as_text(self, trans, data, headers, file_size, max_peek_size):
@@ -664,17 +660,14 @@ def _download_filename(
         hdca: Optional[DatasetHasHidProtocol] = None,
         element_identifier: Optional[str] = None,
         filename_pattern: Optional[str] = None,
-    ) -> Tuple[str, str]:
-        def escape(raw_identifier):
-            return "".join(c in FILENAME_VALID_CHARS and c or "_" for c in raw_identifier)[0:150]
-
+    ) -> str:
         if not to_ext or to_ext == "data":
             # If a client requests to_ext with the extension 'data', they are
             # deferring to the server, set it based on datatype.
             to_ext = dataset.extension
 
         template_values = {
-            "name": escape(dataset.name),
+            "name": dataset.name,
             "ext": to_ext,
             "hid": dataset.hid,
         }
@@ -687,13 +680,12 @@ def escape(raw_identifier):
 
         if hdca is not None:
             # Use collection context to build up filename.
-            template_values["element_identifier"] = element_identifier
-            template_values["hdca_name"] = escape(hdca.name)
+            if element_identifier is not None:
+                template_values["element_identifier"] = element_identifier
+            template_values["hdca_name"] = hdca.name
             template_values["hdca_hid"] = hdca.hid
-        filename = string.Template(filename_pattern).substitute(**template_values)
-        template_values["name"] = quote(dataset.name, safe="")
-        utf8_encoded_filename = string.Template(filename_pattern).substitute(**template_values)
-        return filename, utf8_encoded_filename
+
+        return string.Template(filename_pattern).substitute(**template_values)
 
     def display_name(self, dataset: HasName) -> str:
         """Returns formatted html of dataset name"""
diff --git a/lib/galaxy/util/__init__.py b/lib/galaxy/util/__init__.py
index 1b0ec302fb71..f0c1938f3021 100644
--- a/lib/galaxy/util/__init__.py
+++ b/lib/galaxy/util/__init__.py
@@ -49,6 +49,7 @@
     Union,
 )
 from urllib.parse import (
+    quote,
     urlencode,
     urlparse,
     urlsplit,
@@ -2006,3 +2007,17 @@ def lowercase_alphanum_to_hex(lowercase_alphanum: str) -> str:
     import numpy as np
 
     return np.base_repr(int(lowercase_alphanum, 36), 16).lower()
+
+
+def toContentDisposition(filename: str) -> str:
+    """Return an ``attachment`` Content-Disposition header value for ``filename``.
+
+    The legacy ``filename`` parameter carries an ASCII fallback: every
+    character outside ``FILENAME_VALID_CHARS`` becomes ``_``, so the quoted
+    string only ever contains whitelisted characters. The ``filename*``
+    parameter (RFC 6266 / RFC 5987) carries the real name, percent-encoded
+    as UTF-8, for clients that understand it.
+    """
+    ascii_fallback = "".join(c if c in FILENAME_VALID_CHARS else "_" for c in filename)
+    utf8_encoded_filename = quote(filename, safe="")
+    return f"attachment; filename=\"{ascii_fallback}\"; filename*=UTF-8''{utf8_encoded_filename}"
diff --git a/lib/galaxy/util/zipstream.py b/lib/galaxy/util/zipstream.py
index 3ab85ac19666..8a643d03d3e1 100644
--- a/lib/galaxy/util/zipstream.py
+++ b/lib/galaxy/util/zipstream.py
@@ -11,6 +11,7 @@
 
 import zipstream
 
+from galaxy.util import toContentDisposition
 from .path import safe_walk
 
 CRC32_MIN = 1444
@@ -41,11 +42,7 @@ def response(self) -> Iterator[bytes]:
     def get_headers(self) -> Dict[str, str]:
         headers = {}
         if self.archive_name:
-            archive_name = self.archive_name.encode("latin-1", "replace").decode("latin-1")
-            utf8_encoded_filename = quote(self.archive_name, safe="")
-            headers["Content-Disposition"] = (
-                f"attachment; filename=\"{archive_name}.zip\"; filename*=UTF-8''{utf8_encoded_filename}.zip"
-            )
+            headers["Content-Disposition"] = toContentDisposition(f"{self.archive_name}.zip")
         if self.upstream_mod_zip:
             headers["X-Archive-Files"] = "zip"
         else: