From 2a083faf75b94169efe10a60487f4aa990037a92 Mon Sep 17 00:00:00 2001 From: Dannon Baker Date: Mon, 22 Apr 2024 17:32:26 -0400 Subject: [PATCH 01/12] Adds logging of messageExceptions in the fastapi exception handler. We're intentionally not doing a full log.exception with the traceback here, though we could. This exception will also be dispatched to Sentry if configured, this just makes logs less opaque when one sees a 500. --- lib/galaxy/webapps/base/api.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/galaxy/webapps/base/api.py b/lib/galaxy/webapps/base/api.py index df779127f0c3..725d30000b80 100644 --- a/lib/galaxy/webapps/base/api.py +++ b/lib/galaxy/webapps/base/api.py @@ -198,6 +198,10 @@ async def validate_exception_middleware(request: Request, exc: RequestValidation @app.exception_handler(MessageException) async def message_exception_middleware(request: Request, exc: MessageException) -> Response: + # Intentionally not logging traceback here as the full context will be + # dispatched to Sentry if configured. This just makes logs less opaque + # when one sees a 500. + log.error(f"MessageException: {exc}") return get_error_response_for_request(request, exc) From 114803bc9d8ca0947ed4e15b3da05f4fd846b5c0 Mon Sep 17 00:00:00 2001 From: Dannon Baker Date: Mon, 22 Apr 2024 21:32:31 -0400 Subject: [PATCH 02/12] Force MessageExceptions used in workflow modules to have a string error message. --- lib/galaxy/workflow/modules.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/galaxy/workflow/modules.py b/lib/galaxy/workflow/modules.py index ba0692c77d52..6113473c95f4 100644 --- a/lib/galaxy/workflow/modules.py +++ b/lib/galaxy/workflow/modules.py @@ -2557,11 +2557,13 @@ def populate_module_and_state( step_args = param_map.get(step.id, {}) step_errors = module_injector.compute_runtime_state(step, step_args=step_args) if step_errors: - raise exceptions.MessageException(step_errors, err_data={step.order_index: step_errors}) + raise exceptions.MessageException( + "Error computing workflow step runtime state", err_data={step.order_index: step_errors} + ) if step.upgrade_messages: if allow_tool_state_corrections: log.debug('Workflow step "%i" had upgrade messages: %s', step.id, step.upgrade_messages) else: raise exceptions.MessageException( - step.upgrade_messages, err_data={step.order_index: step.upgrade_messages} + "Workflow step has upgrade messages", err_data={step.order_index: step.upgrade_messages} ) From 28f0fc843f1d6773085ac4c1494b5bbbcf7a460e Mon Sep 17 00:00:00 2001 From: Dannon Baker Date: Tue, 23 Apr 2024 08:52:16 -0400 Subject: [PATCH 03/12] Chage MessageException typing to only accept str as err_msg. --- lib/galaxy/exceptions/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/galaxy/exceptions/__init__.py b/lib/galaxy/exceptions/__init__.py index 5806ed157c21..88513f318a8e 100644 --- a/lib/galaxy/exceptions/__init__.py +++ b/lib/galaxy/exceptions/__init__.py @@ -30,7 +30,7 @@ class MessageException(Exception): # Error code information embedded into API json responses. err_code: ErrorCode = error_codes_by_name["UNKNOWN"] - def __init__(self, err_msg=None, type="info", **extra_error_info): + def __init__(self, err_msg: str = "", type="info", **extra_error_info): self.err_msg = err_msg or self.err_code.default_error_message self.type = type self.extra_error_info = extra_error_info @@ -64,7 +64,7 @@ class AcceptedRetryLater(MessageException): err_code = error_codes_by_name["ACCEPTED_RETRY_LATER"] retry_after: int - def __init__(self, msg, retry_after=60): + def __init__(self, msg: str, retry_after=60): super().__init__(msg) self.retry_after = retry_after @@ -136,7 +136,7 @@ class ToolMissingException(MessageException): status_code = 400 err_code = error_codes_by_name["USER_TOOL_MISSING_PROBLEM"] - def __init__(self, err_msg=None, type="info", tool_id=None, **extra_error_info): + def __init__(self, err_msg: str = "", type="info", tool_id=None, **extra_error_info): super().__init__(err_msg, type, **extra_error_info) self.tool_id = tool_id @@ -152,7 +152,7 @@ class ToolInputsNotReadyException(MessageException): class ToolInputsNotOKException(MessageException): - def __init__(self, err_msg=None, type="info", *, src: str, id: int, **extra_error_info): + def __init__(self, err_msg: str = "", type="info", *, src: str, id: int, **extra_error_info): super().__init__(err_msg, type, **extra_error_info) self.src = src self.id = id From 52a7a144809efed79ec8f32bf100865f640a5dac Mon Sep 17 00:00:00 2001 From: Dannon Baker Date: Tue, 23 Apr 2024 13:39:15 -0400 Subject: [PATCH 04/12] Downgrade message to log.info to avoid sending redundantly to sentry --- lib/galaxy/webapps/base/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/webapps/base/api.py b/lib/galaxy/webapps/base/api.py index 725d30000b80..22793c85550b 100644 --- a/lib/galaxy/webapps/base/api.py +++ b/lib/galaxy/webapps/base/api.py @@ -201,7 +201,7 @@ async def message_exception_middleware(request: Request, exc: MessageException) # Intentionally not logging traceback here as the full context will be # dispatched to Sentry if configured. This just makes logs less opaque # when one sees a 500. - log.error(f"MessageException: {exc}") + log.info(f"MessageException: {exc}") return get_error_response_for_request(request, exc) From 1c1317be24ec24c2395986820592c186b2467bd9 Mon Sep 17 00:00:00 2001 From: Dannon Baker Date: Tue, 23 Apr 2024 13:41:43 -0400 Subject: [PATCH 05/12] Only log if it's a 500 --- lib/galaxy/webapps/base/api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/webapps/base/api.py b/lib/galaxy/webapps/base/api.py index 22793c85550b..3da3ba3d3900 100644 --- a/lib/galaxy/webapps/base/api.py +++ b/lib/galaxy/webapps/base/api.py @@ -201,7 +201,8 @@ async def message_exception_middleware(request: Request, exc: MessageException) # Intentionally not logging traceback here as the full context will be # dispatched to Sentry if configured. This just makes logs less opaque # when one sees a 500. - log.info(f"MessageException: {exc}") + if exc.status_code >= 500: + log.info(f"MessageException: {exc}") return get_error_response_for_request(request, exc) From e2798c9ca8c435850e47b773fb9aa90e7827635e Mon Sep 17 00:00:00 2001 From: Dannon Date: Wed, 24 Apr 2024 11:33:31 -0400 Subject: [PATCH 06/12] Apply suggestions from code review Co-authored-by: Marius van den Beek --- lib/galaxy/exceptions/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/galaxy/exceptions/__init__.py b/lib/galaxy/exceptions/__init__.py index 88513f318a8e..d167a2e1fd00 100644 --- a/lib/galaxy/exceptions/__init__.py +++ b/lib/galaxy/exceptions/__init__.py @@ -30,7 +30,7 @@ class MessageException(Exception): # Error code information embedded into API json responses. err_code: ErrorCode = error_codes_by_name["UNKNOWN"] - def __init__(self, err_msg: str = "", type="info", **extra_error_info): + def __init__(self, err_msg: Optional[str] = None, type="info", **extra_error_info): self.err_msg = err_msg or self.err_code.default_error_message self.type = type self.extra_error_info = extra_error_info @@ -64,7 +64,7 @@ class AcceptedRetryLater(MessageException): err_code = error_codes_by_name["ACCEPTED_RETRY_LATER"] retry_after: int - def __init__(self, msg: str, retry_after=60): + def __init__(self, msg: Optional[str] = None, retry_after=60): super().__init__(msg) self.retry_after = retry_after @@ -136,7 +136,7 @@ class ToolMissingException(MessageException): status_code = 400 err_code = error_codes_by_name["USER_TOOL_MISSING_PROBLEM"] - def __init__(self, err_msg: str = "", type="info", tool_id=None, **extra_error_info): + def __init__(self, err_msg: Optional[str] = None, type="info", tool_id=None, **extra_error_info): super().__init__(err_msg, type, **extra_error_info) self.tool_id = tool_id @@ -152,7 +152,7 @@ class ToolInputsNotReadyException(MessageException): class ToolInputsNotOKException(MessageException): - def __init__(self, err_msg: str = "", type="info", *, src: str, id: int, **extra_error_info): + def __init__(self, err_msg: Optional[str] = None, type="info", *, src: str, id: int, **extra_error_info): super().__init__(err_msg, type, **extra_error_info) self.src = src self.id = id From e5c59e1e5eb6d656e25a1f072a5fc48d7f5507e2 Mon Sep 17 00:00:00 2001 From: Dannon Baker Date: Wed, 24 Apr 2024 11:34:52 -0400 Subject: [PATCH 07/12] Fix imports after suggestions from code review. --- lib/galaxy/exceptions/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/galaxy/exceptions/__init__.py b/lib/galaxy/exceptions/__init__.py index d167a2e1fd00..8deab77bc17b 100644 --- a/lib/galaxy/exceptions/__init__.py +++ b/lib/galaxy/exceptions/__init__.py @@ -16,6 +16,8 @@ and messages. """ +from typing import Optional + from .error_codes import ( error_codes_by_name, ErrorCode, From acec4f6de36f08158ad36e63cd220f880fe4e675 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Tue, 23 Apr 2024 11:54:47 +0200 Subject: [PATCH 08/12] Improve error message for __EXTRACT_DATASETS__ tool Provides a reasonable error message to users, instead of the internal error https://sentry.galaxyproject.org/share/issue/a197159192b64b9db065a762ee5dbbfc/: ``` KeyError: 'Dataset collection has no element_identifier with key 2.' File "galaxy/tools/__init__.py", line 1972, in handle_single_execution rval = self.execute( File "galaxy/tools/__init__.py", line 2069, in execute return self.tool_action.execute( File "galaxy/tools/actions/model_operations.py", line 88, in execute self._produce_outputs( File "galaxy/tools/actions/model_operations.py", line 119, in _produce_outputs tool.produce_outputs( File "galaxy/tools/__init__.py", line 3351, in produce_outputs extracted_element = collection[incoming["which"]["identifier"]] File "galaxy/model/__init__.py", line 6434, in __getitem__ raise KeyError(error_message) ``` Also makes the `identifier` parameter explicitly required (it is now inferred to be optional because text parameters are by default optional if no validator raises an exception upon validating an empty string). That prevents ``` KeyError: 'Dataset collection has no element_identifier with key None.' File "galaxy/tools/__init__.py", line 1972, in handle_single_execution rval = self.execute( File "galaxy/tools/__init__.py", line 2069, in execute return self.tool_action.execute( File "galaxy/tools/actions/model_operations.py", line 88, in execute self._produce_outputs( File "galaxy/tools/actions/model_operations.py", line 119, in _produce_outputs tool.produce_outputs( File "galaxy/tools/__init__.py", line 3351, in produce_outputs extracted_element = collection[incoming["which"]["identifier"]] File "galaxy/model/__init__.py", line 6434, in __getitem__ raise KeyError(error_message) ``` --- lib/galaxy/tools/__init__.py | 10 ++++++++-- lib/galaxy/tools/extract_dataset.xml | 6 +++--- lib/galaxy_test/api/test_tools.py | 23 +++++++++++++++++++++-- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py index 57aa93d8000e..20d7e4f47608 100644 --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -3356,9 +3356,15 @@ def produce_outputs(self, trans, out_data, output_collections, incoming, history if how == "first": extracted_element = collection.first_dataset_element elif how == "by_identifier": - extracted_element = collection[incoming["which"]["identifier"]] + try: + extracted_element = collection[incoming["which"]["identifier"]] + except KeyError as e: + raise exceptions.MessageException(e.args[0]) elif how == "by_index": - extracted_element = collection[int(incoming["which"]["index"])] + try: + extracted_element = collection[int(incoming["which"]["index"])] + except KeyError as e: + raise exceptions.MessageException(e.args[0]) else: raise exceptions.MessageException("Invalid tool parameters.") extracted = extracted_element.element_object diff --git a/lib/galaxy/tools/extract_dataset.xml b/lib/galaxy/tools/extract_dataset.xml index 3e066ebebec5..37986eb722f9 100644 --- a/lib/galaxy/tools/extract_dataset.xml +++ b/lib/galaxy/tools/extract_dataset.xml @@ -19,7 +19,7 @@ - + @@ -52,9 +52,9 @@ Description The tool allow extracting datasets based on position (**The first dataset** and **Select by index** options) or name (**Select by element identifier** option). This tool effectively collapses the inner-most collection into a dataset. For nested collections (e.g a list of lists of lists: outer:middle:inner, extracting the inner dataset element) a new list is created where the selected element takes the position of the inner-most collection (so outer:middle, where middle is not a collection but the inner dataset element). -.. class:: warningmark +.. class:: warningmark -**Note**: Dataset index (numbering) begins with 0 (zero). +**Note**: Dataset index (numbering) begins with 0 (zero). .. class:: infomark diff --git a/lib/galaxy_test/api/test_tools.py b/lib/galaxy_test/api/test_tools.py index 7befe7f221b1..3e99d0115ac1 100644 --- a/lib/galaxy_test/api/test_tools.py +++ b/lib/galaxy_test/api/test_tools.py @@ -701,7 +701,7 @@ def test_database_operation_tool_with_pending_inputs(self): hdca1_id = self.dataset_collection_populator.create_list_in_history( history_id, contents=["a\nb\nc\nd", "e\nf\ng\nh"], wait=True ).json()["outputs"][0]["id"] - self.dataset_populator.run_tool( + run_response = self.dataset_populator.run_tool( tool_id="cat_data_and_sleep", inputs={ "sleep_time": 15, @@ -709,15 +709,34 @@ def test_database_operation_tool_with_pending_inputs(self): }, history_id=history_id, ) + output_hdca_id = run_response["implicit_collections"][0]["id"] run_response = self.dataset_populator.run_tool( tool_id="__EXTRACT_DATASET__", inputs={ - "data_collection": {"src": "hdca", "id": hdca1_id}, + "data_collection": {"src": "hdca", "id": output_hdca_id}, }, history_id=history_id, ) assert run_response["outputs"][0]["state"] != "ok" + @skip_without_tool("__EXTRACT_DATASET__") + def test_extract_dataset_invalid_element_identifier(self): + with self.dataset_populator.test_history(require_new=False) as history_id: + hdca1_id = self.dataset_collection_populator.create_list_in_history( + history_id, contents=["a\nb\nc\nd", "e\nf\ng\nh"], wait=True + ).json()["outputs"][0]["id"] + run_response = self.dataset_populator.run_tool_raw( + tool_id="__EXTRACT_DATASET__", + inputs={ + "data_collection": {"src": "hdca", "id": hdca1_id}, + "which": {"which_dataset": "by_index", "index": 100}, + }, + history_id=history_id, + input_format="21.01", + ) + assert run_response.status_code == 400 + assert run_response.json()["err_msg"] == "Dataset collection has no element_index with key 100." + @skip_without_tool("__FILTER_FAILED_DATASETS__") def test_filter_failed_list(self): with self.dataset_populator.test_history(require_new=False) as history_id: From ab868b3e76ed750acab337d83969d9c4039fed4f Mon Sep 17 00:00:00 2001 From: davelopez <46503462+davelopez@users.noreply.github.com> Date: Thu, 25 Apr 2024 11:23:26 +0200 Subject: [PATCH 09/12] Use dataset's encoded_id instead of hid for export filenames --- lib/galaxy/model/store/__init__.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/lib/galaxy/model/store/__init__.py b/lib/galaxy/model/store/__init__.py index 1a38ebfd4ae4..1c8b7eb7b740 100644 --- a/lib/galaxy/model/store/__init__.py +++ b/lib/galaxy/model/store/__init__.py @@ -1961,8 +1961,6 @@ def add(src, dest): dir_name = "datasets" dir_path = os.path.join(export_directory, dir_name) - dataset_hid = as_dict["hid"] - assert dataset_hid, as_dict if dataset.dataset.id in self.dataset_id_to_path: file_name, extra_files_path = self.dataset_id_to_path[dataset.dataset.id] @@ -1981,7 +1979,7 @@ def add(src, dest): self.serialization_options.get_identifier(self.security, conversion) if conversion else None ) target_filename = get_export_dataset_filename( - as_dict["name"], as_dict["extension"], dataset_hid, conversion_key=conversion_key + as_dict["name"], as_dict["extension"], as_dict["encoded_id"], conversion_key=conversion_key ) arcname = os.path.join(dir_name, target_filename) src = file_name @@ -1997,7 +1995,7 @@ def add(src, dest): if len(file_list): extra_files_target_filename = get_export_dataset_extra_files_dir_name( - as_dict["name"], as_dict["extension"], dataset_hid, conversion_key=conversion_key + as_dict["name"], as_dict["extension"], as_dict["encoded_id"], conversion_key=conversion_key ) arcname = os.path.join(dir_name, extra_files_target_filename) add(extra_files_path, os.path.join(export_directory, arcname)) @@ -2967,22 +2965,22 @@ def tar_export_directory(export_directory: StrPath, out_file: StrPath, gzip: boo store_archive.add(os.path.join(export_directory, export_path), arcname=export_path) -def get_export_dataset_filename(name: str, ext: str, hid: int, conversion_key: Optional[str]) -> str: +def get_export_dataset_filename(name: str, ext: str, encoded_id: str, conversion_key: Optional[str]) -> str: """ Builds a filename for a dataset using its name an extension. """ base = "".join(c in FILENAME_VALID_CHARS and c or "_" for c in name) if not conversion_key: - return f"{base}_{hid}.{ext}" + return f"{base}_{encoded_id}.{ext}" else: - return f"{base}_{hid}_conversion_{conversion_key}.{ext}" + return f"{base}_{encoded_id}_conversion_{conversion_key}.{ext}" -def get_export_dataset_extra_files_dir_name(name: str, ext: str, hid: int, conversion_key: Optional[str]) -> str: +def get_export_dataset_extra_files_dir_name(name: str, ext: str, encoded_id: str, conversion_key: Optional[str]) -> str: if not conversion_key: - return f"extra_files_path_{hid}" + return f"extra_files_path_{encoded_id}" else: - return f"extra_files_path_{hid}_conversion_{conversion_key}" + return f"extra_files_path_{encoded_id}_conversion_{conversion_key}" def imported_store_for_metadata( From 78bf048b3a7f53b4ae56f4a88a0a1eaacb1b79f6 Mon Sep 17 00:00:00 2001 From: davelopez <46503462+davelopez@users.noreply.github.com> Date: Thu, 25 Apr 2024 11:24:56 +0200 Subject: [PATCH 10/12] Remove unused parameters in get_export_dataset_extra_files_dir_name --- lib/galaxy/model/store/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/galaxy/model/store/__init__.py b/lib/galaxy/model/store/__init__.py index 1c8b7eb7b740..a855633fe81b 100644 --- a/lib/galaxy/model/store/__init__.py +++ b/lib/galaxy/model/store/__init__.py @@ -1995,7 +1995,7 @@ def add(src, dest): if len(file_list): extra_files_target_filename = get_export_dataset_extra_files_dir_name( - as_dict["name"], as_dict["extension"], as_dict["encoded_id"], conversion_key=conversion_key + as_dict["encoded_id"], conversion_key=conversion_key ) arcname = os.path.join(dir_name, extra_files_target_filename) add(extra_files_path, os.path.join(export_directory, arcname)) @@ -2976,7 +2976,7 @@ def get_export_dataset_filename(name: str, ext: str, encoded_id: str, conversion return f"{base}_{encoded_id}_conversion_{conversion_key}.{ext}" -def get_export_dataset_extra_files_dir_name(name: str, ext: str, encoded_id: str, conversion_key: Optional[str]) -> str: +def get_export_dataset_extra_files_dir_name(encoded_id: str, conversion_key: Optional[str]) -> str: if not conversion_key: return f"extra_files_path_{encoded_id}" else: From 419e4954f760e38b99715c353c4c3b346b105c25 Mon Sep 17 00:00:00 2001 From: davelopez <46503462+davelopez@users.noreply.github.com> Date: Thu, 25 Apr 2024 12:06:28 +0200 Subject: [PATCH 11/12] Add test to verify export dataset without hid works --- test/unit/data/model/test_model_store.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/unit/data/model/test_model_store.py b/test/unit/data/model/test_model_store.py index b2a3b0739103..f7c4ea1ce140 100644 --- a/test/unit/data/model/test_model_store.py +++ b/test/unit/data/model/test_model_store.py @@ -544,6 +544,21 @@ def validate_invocation_collection_crate_directory(crate_directory): assert dataset in root["hasPart"] +def test_export_history_with_missing_hid(): + # The dataset's hid was used to compose the file name during the export but it + # can be missing sometimes. We now use the dataset's encoded id instead. + app = _mock_app() + u, history, d1, d2, j = _setup_simple_cat_job(app) + + # Remove hid from d1 + d1.hid = None + app.commit() + + temp_directory = mkdtemp() + with store.DirectoryModelExportStore(temp_directory, app=app, export_files="copy") as export_store: + export_store.export_history(history) + + def test_export_history_to_ro_crate(tmp_path): app = _mock_app() u, history, d1, d2, j = _setup_simple_cat_job(app) From 1bea8a71f7f3aabb256ed4654b9f345fdcc4bc3d Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Thu, 2 May 2024 17:31:21 +0200 Subject: [PATCH 12/12] Cancel ProcessPoolFuture when aborting job Fixes the occasional timeout error in jobs that follow the `test_abort_fetch_job` test. --- lib/galaxy/celery/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/galaxy/celery/tasks.py b/lib/galaxy/celery/tasks.py index fc60f6921327..657ae3ef909b 100644 --- a/lib/galaxy/celery/tasks.py +++ b/lib/galaxy/celery/tasks.py @@ -288,6 +288,7 @@ def abort_when_job_stops(function: Callable, session: galaxy_scoped_session, job return future.result(timeout=1) except TimeoutError: if is_aborted(session, job_id): + future.cancel() return