From 30760746af9ad9f401216a2d7c78d8452bcccbec Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Sat, 1 Apr 2023 02:59:57 -0400 Subject: [PATCH 1/5] security fixes and improvements to handle file paths --- CHANGES.rst | 5 +- requirements-dev.txt | 1 + tests/functional/test_builtin.py | 50 ++++++++++++++--- tests/functional/test_cli.py | 4 +- tests/functional/test_workflow.py | 16 ++++-- tests/functional/utils.py | 2 +- tests/test_utils.py | 15 ++++++ tests/utils.py | 25 +++++++++ weaver/cli.py | 2 +- weaver/processes/builtin/__init__.py | 10 +++- weaver/processes/builtin/file2string_array.py | 17 ++++-- .../processes/builtin/file_index_selector.py | 17 ++++-- weaver/processes/builtin/jsonarray2netcdf.py | 35 +++++++----- weaver/processes/builtin/metalink2netcdf.cwl | 3 ++ weaver/processes/builtin/metalink2netcdf.py | 24 +++++---- weaver/processes/builtin/utils.py | 34 ++++++++++-- weaver/utils.py | 54 ++++++++++++++----- weaver/vault/utils.py | 9 ++-- weaver/vault/views.py | 2 +- weaver/wps_restapi/jobs/utils.py | 2 + 20 files changed, 260 insertions(+), 67 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index a81358e2b..c6d388e26 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -12,7 +12,10 @@ Changes Changes: -------- -- No change. +- Disallow ``builtin`` `Processes` expecting a user-provided input path to run with local file references such that + they must respect any configured server-side remote file access rules instead of bypassing security validations + through resolved local paths. +- Add multiple validation checks for more secure file paths handling when retrieving contents from remote locations. Fixes: ------ diff --git a/requirements-dev.txt b/requirements-dev.txt index 6ef229beb..4ec0c7e43 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -22,6 +22,7 @@ parameterized pluggy>=0.7 # FIXME: bad interpolation of 'setup.cfg' for pytest 'log_format' (https://github.com/pytest-dev/pytest/issues/10019) pytest<7 +pytest-server-fixtures pydocstyle pylint>=2.11,!=2.12,<2.14; python_version <= "3.6" pylint>=2.15.4; python_version >= "3.7" diff --git a/tests/functional/test_builtin.py b/tests/functional/test_builtin.py index 2e4067473..696ae2f66 100644 --- a/tests/functional/test_builtin.py +++ b/tests/functional/test_builtin.py @@ -7,7 +7,7 @@ import pytest from tests.functional.utils import WpsConfigBase -from tests.utils import get_settings_from_testapp, mocked_execute_celery, mocked_sub_requests +from tests.utils import FileServer, get_settings_from_testapp, mocked_execute_celery, mocked_sub_requests from weaver.execute import ExecuteControlOption, ExecuteMode, ExecuteResponse, ExecuteTransmissionMode from weaver.formats import ContentType from weaver.processes.builtin import register_builtin_processes @@ -32,6 +32,13 @@ def setUpClass(cls): cls.json_headers = {"Accept": ContentType.APP_JSON, "Content-Type": ContentType.APP_JSON} super(BuiltinAppTest, cls).setUpClass() + cls.file_server = FileServer() + cls.file_server.start() + + @classmethod + def tearDownClass(cls): + cls.file_server.teardown() + def setUp(self): # register builtin processes from scratch to have clean state self.process_store.clear_processes() @@ -86,18 +93,23 @@ def test_jsonarray2netcdf_describe_ogc_schema(self): assert body["jobControlOptions"] == [ExecuteControlOption.ASYNC, ExecuteControlOption.SYNC] assert body["outputTransmission"] == [ExecuteTransmissionMode.REFERENCE, ExecuteTransmissionMode.VALUE] - def setup_inputs(self, stack): - dirname = tempfile.gettempdir() + def setup_inputs(self, stack, use_temp_file=False): + if use_temp_file: + dir_path = tempfile.gettempdir() + url_path = f"file://{dir_path}" + else: + dir_path = self.file_server.document_root + url_path = self.file_server.uri nc_data = "Hello NetCDF!" - tmp_ncdf = tempfile.NamedTemporaryFile(dir=dirname, mode="w", suffix=".nc") # pylint: disable=R1732 - tmp_json = tempfile.NamedTemporaryFile(dir=dirname, mode="w", suffix=".json") # pylint: disable=R1732 + tmp_ncdf = tempfile.NamedTemporaryFile(dir=dir_path, mode="w", suffix=".nc") # pylint: disable=R1732 + tmp_json = tempfile.NamedTemporaryFile(dir=dir_path, mode="w", suffix=".json") # pylint: disable=R1732 tmp_ncdf = stack.enter_context(tmp_ncdf) # noqa tmp_json = stack.enter_context(tmp_json) # noqa tmp_ncdf.write(nc_data) tmp_ncdf.seek(0) - tmp_json.write(json.dumps([f"file://{os.path.join(dirname, tmp_ncdf.name)}"])) + tmp_json.write(json.dumps([f"{url_path}/{os.path.basename(tmp_ncdf.name)}"])) tmp_json.seek(0) - body = {"inputs": [{"id": "input", "href": os.path.join(dirname, tmp_json.name)}]} + body = {"inputs": [{"id": "input", "href": f"{url_path}/{os.path.basename(tmp_json.name)}"}]} return body, nc_data def validate_results(self, results, outputs, data, links): @@ -147,6 +159,30 @@ def validate_results(self, results, outputs, data, links): nc_real_path = nc_href.replace(wps_out, wps_dir) assert os.path.split(nc_real_path)[-1] == os.path.split(nc_href)[-1] + def test_jsonarray2netcdf_execute_invalid_file_local(self): + """ + Validate that local file path as input is not permitted anymore. + """ + with contextlib.ExitStack() as stack_exec: + body, nc_data = self.setup_inputs(stack_exec, use_temp_file=True) + body.update({ + "mode": ExecuteMode.ASYNC, + "response": ExecuteResponse.DOCUMENT, + "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.VALUE}], + }) + for mock_exec in mocked_execute_celery(): + stack_exec.enter_context(mock_exec) + path = "/processes/jsonarray2netcdf/jobs" + resp = mocked_sub_requests(self.app, "post_json", path, + data=body, headers=self.json_headers, only_local=True) + assert resp.status_code == 201 + + job_url = resp.json["location"] + job_res = self.monitor_job(job_url, expect_failed=True) + assert job_res["status"] == Status.FAILED + job_logs = self.app.get(f"{job_url}/logs").json + assert any("ValueError: Not a valid file URL reference" in log for log in job_logs) + def test_jsonarray2netcdf_execute_async(self): with contextlib.ExitStack() as stack_exec: body, nc_data = self.setup_inputs(stack_exec) diff --git a/tests/functional/test_cli.py b/tests/functional/test_cli.py index a80df5cd6..5f345396a 100644 --- a/tests/functional/test_cli.py +++ b/tests/functional/test_cli.py @@ -459,7 +459,7 @@ def test_execute_manual_monitor_status_and_download_results(self): Test a typical case of :term:`Job` execution, result retrieval and download, but with manual monitoring. Manual monitoring can be valid in cases where a *very* long :term:`Job` must be executed, and the user does - not intend to wait after it. This avoids leaving some shell/notebook/etc. open of a long time and provide a + not intend to wait for it. This avoids leaving some shell/notebook/etc. open of a long time and provide a massive ``timeout`` value. Instead, the user can simply re-call :meth:`WeaverClient.monitor` at a later time to resume monitoring. Other situation can be if the connection was dropped or script runner crashed, and the want to pick up monitoring again. @@ -480,7 +480,7 @@ def test_execute_manual_monitor_status_and_download_results(self): assert result.body.get("status") == Status.SUCCEEDED links = result.body.get("links") assert isinstance(links, list) - assert len(list(filter(lambda _link: _link["rel"].endswith("results"), links))) == 1 + assert len([_link for _link in links if _link["rel"].endswith("results")]) == 1 # first test to get job results details, but not downloading yet result = mocked_sub_requests(self.app, self.client.results, job_id) diff --git a/tests/functional/test_workflow.py b/tests/functional/test_workflow.py index 453ff5bfd..a73a9bfe7 100644 --- a/tests/functional/test_workflow.py +++ b/tests/functional/test_workflow.py @@ -15,6 +15,7 @@ import mock import pytest +from pytest_server_fixtures.http import SimpleHTTPTestServer from pyramid import testing from pyramid.httpexceptions import HTTPConflict, HTTPCreated, HTTPNotFound, HTTPOk from pyramid.settings import asbool @@ -992,14 +993,21 @@ def test_workflow_mixed_rest_builtin_wps1_docker_select_requirements(self): with contextlib.ExitStack() as stack: tmp_host = "https://mocked-file-server.com" # must match in 'Execute_WorkflowSelectCopyNestedOutDir.json' - tmp_dir = stack.enter_context(tempfile.TemporaryDirectory()) + # jsonarray2netcdf cannot use local paths anymore for nested NetCDF, provide them through tmp file server + file_server = stack.enter_context(SimpleHTTPTestServer()) + file_server.start() + nc_dir = file_server.document_root + nc_url = file_server.uri nc_refs = [] for i in range(3): nc_name = f"test-file-{i}.nc" - nc_path = os.path.join(tmp_dir, nc_name) - nc_refs.append(f"file://{nc_path}") - with open(os.path.join(tmp_dir, nc_name), mode="w", encoding="utf-8") as tmp_file: + nc_path = os.path.join(nc_dir, nc_name) + nc_href = f"{nc_url}/{nc_name}" + nc_refs.append(nc_href) + with open(nc_path, mode="w", encoding="utf-8") as tmp_file: tmp_file.write(f"DUMMY NETCDF DATA #{i}") + + tmp_dir = stack.enter_context(tempfile.TemporaryDirectory()) with open(os.path.join(tmp_dir, "netcdf-array.json"), mode="w", encoding="utf-8") as tmp_file: json.dump(nc_refs, tmp_file) # must match execution body diff --git a/tests/functional/utils.py b/tests/functional/utils.py index b11c00e40..f6be09217 100644 --- a/tests/functional/utils.py +++ b/tests/functional/utils.py @@ -164,7 +164,7 @@ def retrieve_payload(cls, process, ref_type=None, ref_name=None, ref_found=False if local: var_locations = [APP_PKG_ROOT] else: - base_url = "https://raw.githubusercontent.com/" + base_url = "https://raw.githubusercontent.com" var_locations = list(dict.fromkeys([ # don't use set to preserve this prioritized order APP_PKG_ROOT, os.getenv("TEST_GITHUB_SOURCE_URL"), diff --git a/tests/test_utils.py b/tests/test_utils.py index c22432f1d..7c4b325af 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -67,6 +67,7 @@ get_sane_name, get_secure_directory_name, get_secure_filename, + get_secure_path, get_ssl_verify_option, get_url_without_query, is_update_version, @@ -805,6 +806,20 @@ def mock_uuid(): assert result == fake_uuid +@pytest.mark.parametrize( + ["test_path", "expect_path"], + [ + ("/tmp/././/../.././test.txt", "/tmp/test.txt"), + ("./../../../../../tmp/test.txt", "tmp/test.txt"), + ("file://./../../../../../tmp/test.txt", "file://tmp/test.txt"), + ] +) +def test_get_secure_path(test_path, expect_path): + # type: (str, str) -> None + result = get_secure_path(test_path) + assert result == expect_path + + @pytest.mark.parametrize( [ "include_dir_heading", diff --git a/tests/utils.py b/tests/utils.py index 810071dcd..389f4e2f7 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -32,6 +32,7 @@ from pyramid.httpexceptions import HTTPException, HTTPNotFound, HTTPUnprocessableEntity from pyramid.registry import Registry from pyramid.testing import DummyRequest +from pytest_server_fixtures.http import HTTPTestServer, SimpleHTTPTestServer from requests import Response from webtest import TestApp, TestResponse @@ -845,6 +846,30 @@ def mocked_dir_listing(local_directory, # type: str return dir_html +class FileServer(SimpleHTTPTestServer): + """ + Generate a file server that can host files under :attr:`document_root`. + + Contrary to :func:`mocked_file_server` where requests are captured and redirected to the corresponding files, + this server receives *real* requests (via a socket) and returns matched files. This is particularly important + during tests that call subprocesses independently of the main Python test process + (e.g.: :term:`CWL` ``CommandLineTool` for a ``Builtin`` :term:`Process`), because mocks applied in the main + process are not reflected in the other ones. + + .. warning:: + This server takes more time to start than usual mocks. Use it sparingly, and consider maintaining a single + instance over multiple tests of a complete test suite rather than recreating a server for each test. + """ + def __init__(self): + self._port = self.get_port() + self._uri = f"http://0.0.0.0:{self._port}" + + # purposely call 'HTTPTestServer' instead of 'SimpleHTTPTestServer' to enforce the URI + # otherwise, 'socket.gethostname()' is used (machine name), and the obtained URI fail our schema validation + HTTPTestServer.__init__(self, hostname="0.0.0.0", port=self._port, uri=self._uri) + self.cwd = self.document_root + + def mocked_file_server(directory, # type: str url, # type: str settings, # type: SettingsType diff --git a/weaver/cli.py b/weaver/cli.py index a908da28c..3e9baa13a 100644 --- a/weaver/cli.py +++ b/weaver/cli.py @@ -1010,7 +1010,7 @@ def execute(self, Execution requests are always accomplished asynchronously. To obtain the final :term:`Job` status as if they were executed synchronously, provide the :paramref:`monitor` argument. This offers more flexibility over servers that could decide to ignore sync/async preferences, and avoids closing/timeout connection - errors that could occur for long running processes, since status is pooled iteratively rather than waiting. + errors that could occur for long-running processes, since status is pooled iteratively rather than waiting. :param process_id: Identifier of the local or remote process to execute. :param provider_id: Identifier of the provider from which to locate a remote process to execute. diff --git a/weaver/processes/builtin/__init__.py b/weaver/processes/builtin/__init__.py index 3932aad24..00225813a 100644 --- a/weaver/processes/builtin/__init__.py +++ b/weaver/processes/builtin/__init__.py @@ -32,7 +32,7 @@ from cwltool.context import RuntimeContext from cwltool.pathmapper import PathMapper - from weaver.typedefs import AnyRegistryContainer, CWL, CWL_RequirementsList, JSON, TypedDict + from weaver.typedefs import AnySettingsContainer, CWL, CWL_RequirementsList, JSON, TypedDict BuiltinResourceMap = TypedDict("BuiltinResourceMap", { "package": str, @@ -123,11 +123,17 @@ def _get_builtin_package(process_id, package): def register_builtin_processes(container): - # type: (AnyRegistryContainer) -> None + # type: (AnySettingsContainer) -> None """ Registers every ``builtin`` CWL package to the processes database. CWL definitions must be located within the :mod:`weaver.processes.builtin` module. + + .. note:: + Although any settings can be provided as input, it is better to specify a + :class:`pyramid.registry.Registry` or :class:`pyramid.request.Request` instance + in order to retrieve any pre-established database connection stored as reference. + Specifying configuration settings will force recreation of a new database connection. """ restapi_url = get_wps_restapi_base_url(container) builtin_apps_mapping = _get_builtin_reference_mapping(os.path.abspath(os.path.dirname(__file__))) diff --git a/weaver/processes/builtin/file2string_array.py b/weaver/processes/builtin/file2string_array.py index 2e9cbdad3..5bbc7e5c4 100644 --- a/weaver/processes/builtin/file2string_array.py +++ b/weaver/processes/builtin/file2string_array.py @@ -9,16 +9,26 @@ import sys CUR_DIR = os.path.abspath(os.path.dirname(__file__)) +sys.path.insert(0, CUR_DIR) +# root to allow 'from weaver import <...>' +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(CUR_DIR)))) + +# place weaver specific imports after sys path fixing to ensure they are found from external call +# pylint: disable=C0413,wrong-import-order +from weaver import WEAVER_ROOT_DIR # isort:skip # noqa: E402 +from weaver.processes.builtin.utils import validate_file_reference # isort:skip # noqa: E402 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] +PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0] +PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".") # setup logger since it is not run from the main 'weaver' app -LOGGER = logging.getLogger(__name__) +LOGGER = logging.getLogger(PACKAGE_MODULE) LOGGER.addHandler(logging.StreamHandler(sys.stdout)) LOGGER.setLevel(logging.INFO) # process details -__version__ = "1.2" +__version__ = "1.3" __title__ = "File to String-Array" __abstract__ = __doc__ # NOTE: '__doc__' is fetched directly, this is mostly to be informative @@ -26,8 +36,9 @@ def process(input_file, output_dir): - # type: (argparse.FileType, str) -> None + # type: (str, str) -> None LOGGER.info("Got arguments: input_file=%s output_dir=%s", input_file, output_dir) + validate_file_reference(input_file) output_data = {"output": [input_file]} with open(os.path.join(output_dir, OUTPUT_CWL_JSON), mode="w", encoding="utf-8") as file: return json.dump(output_data, file) diff --git a/weaver/processes/builtin/file_index_selector.py b/weaver/processes/builtin/file_index_selector.py index 37b6fcc22..93fb96bb6 100644 --- a/weaver/processes/builtin/file_index_selector.py +++ b/weaver/processes/builtin/file_index_selector.py @@ -6,7 +6,6 @@ import argparse import logging import os -import shutil import sys from typing import TYPE_CHECKING @@ -18,15 +17,23 @@ # root to allow 'from weaver import <...>' sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(CUR_DIR)))) +# place weaver specific imports after sys path fixing to ensure they are found from external call +# pylint: disable=C0413,wrong-import-order +from weaver import WEAVER_ROOT_DIR # isort:skip # noqa: E402 +from weaver.processes.builtin.utils import validate_file_reference # isort:skip # noqa: E402 +from weaver.utils import OutputMethod, fetch_file # isort:skip # noqa: E402 + PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] +PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0] +PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".") # setup logger since it is not run from the main 'weaver' app -LOGGER = logging.getLogger(__name__) +LOGGER = logging.getLogger(PACKAGE_MODULE) LOGGER.addHandler(logging.StreamHandler(sys.stdout)) LOGGER.setLevel(logging.INFO) # process details -__version__ = "1.1" +__version__ = "1.2" __title__ = "File Index Selector" __abstract__ = __doc__ # NOTE: '__doc__' is fetched directly, this is mostly to be informative @@ -38,7 +45,9 @@ def select(files, index, output_dir): try: if not os.path.isdir(output_dir): raise ValueError(f"Output dir [{output_dir}] does not exist.") - shutil.copy(files[index], output_dir) + file_path = files[index] + validate_file_reference(file_path) + fetch_file(file_path, output_dir, out_method=OutputMethod.COPY) except Exception as exc: # log only debug for tracking, re-raise and actual error wil be logged by top process monitor LOGGER.debug("Process '%s' raised an exception: [%s]", PACKAGE_NAME, exc) diff --git a/weaver/processes/builtin/jsonarray2netcdf.py b/weaver/processes/builtin/jsonarray2netcdf.py index 0b45985de..366d8d926 100644 --- a/weaver/processes/builtin/jsonarray2netcdf.py +++ b/weaver/processes/builtin/jsonarray2netcdf.py @@ -17,20 +17,24 @@ # place weaver specific imports after sys path fixing to ensure they are found from external call # pylint: disable=C0413,wrong-import-order -from weaver.processes.builtin.utils import is_netcdf_url # isort:skip # noqa: E402 -from weaver.utils import fetch_file # isort:skip # noqa: E402 +from weaver import WEAVER_ROOT_DIR # isort:skip # noqa: E402 +from weaver.formats import repr_json # isort:skip # noqa: E402 +from weaver.processes.builtin.utils import is_netcdf_url, validate_file_reference # isort:skip # noqa: E402 +from weaver.utils import fetch_file, get_secure_path # isort:skip # noqa: E402 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] +PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0] +PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".") # setup logger since it is not run from the main 'weaver' app -LOGGER = logging.getLogger(PACKAGE_NAME) +LOGGER = logging.getLogger(PACKAGE_MODULE) _handler = logging.StreamHandler(sys.stdout) # noqa _handler.setFormatter(logging.Formatter(fmt="[%(name)s] %(levelname)-8s %(message)s")) LOGGER.addHandler(_handler) -LOGGER.setLevel(logging.DEBUG) +LOGGER.setLevel(logging.INFO) # process details -__version__ = "2.2" +__version__ = "2.3" __title__ = "JSON array to NetCDF" __abstract__ = __doc__ # NOTE: '__doc__' is fetched directly, this is mostly to be informative @@ -38,24 +42,31 @@ def j2n(json_reference, output_dir): # type: (str, str) -> None LOGGER.info("Process '%s' execution starting...", PACKAGE_NAME) - LOGGER.debug("Process '%s' output directory: [%s].", PACKAGE_NAME, output_dir) + LOGGER.info("Process '%s' output directory: [%s].", PACKAGE_NAME, output_dir) try: if not os.path.isdir(output_dir): raise ValueError(f"Output dir [{output_dir}] does not exist.") with TemporaryDirectory(prefix=f"wps_process_{PACKAGE_NAME}_") as tmp_dir: - LOGGER.debug("Fetching JSON file: [%s]", json_reference) + LOGGER.info("Verify URL reference: [%s]", json_reference) + validate_file_reference(json_reference) + LOGGER.info("Fetching JSON file: [%s]", json_reference) json_path = fetch_file(json_reference, tmp_dir, timeout=10, retry=3) - LOGGER.debug("Reading JSON file: [%s]", json_path) + json_path = get_secure_path(json_path) + LOGGER.info("Reading JSON file: [%s]", json_path) with open(json_path, mode="r", encoding="utf-8") as json_file: json_content = json.load(json_file) - if not isinstance(json_content, list) or any(not is_netcdf_url(f) for f in json_content): + if not isinstance(json_content, list) or any(not isinstance(item, str) for item in json_content): LOGGER.error("Invalid JSON: [%s]", json_content) raise ValueError("Invalid JSON file format, expected a plain array of NetCDF file URL strings.") - LOGGER.debug("Parsing JSON file references.") + LOGGER.info("Parsing JSON file references from file contents:\n%s", repr_json(json_content)) for file_url in json_content: - LOGGER.debug("Fetching NetCDF reference from JSON file: [%s]", file_url) + LOGGER.info("Validate NetCDF reference from JSON file: [%s]", file_url) + validate_file_reference(file_url) + if not is_netcdf_url(file_url): + raise ValueError(f"Invalid file format for [{file_url}], expected a NetCDF file URL.") + LOGGER.info("Fetching NetCDF reference from JSON file: [%s]", file_url) fetched_nc = fetch_file(file_url, output_dir, timeout=10, retry=3) - LOGGER.debug("Fetched NetCDF output location: [%s]", fetched_nc) + LOGGER.info("Fetched NetCDF output location: [%s]", fetched_nc) except Exception as exc: LOGGER.error("Process '%s' raised an exception: [%s]", PACKAGE_NAME, exc) raise diff --git a/weaver/processes/builtin/metalink2netcdf.cwl b/weaver/processes/builtin/metalink2netcdf.cwl index e962ff71d..e433e4c63 100644 --- a/weaver/processes/builtin/metalink2netcdf.cwl +++ b/weaver/processes/builtin/metalink2netcdf.cwl @@ -10,6 +10,9 @@ inputs: inputBinding: position: 1 prefix: "-i" + format: + - iana:application/metalink+xml + - iana:application/metalink4+xml index: type: int inputBinding: diff --git a/weaver/processes/builtin/metalink2netcdf.py b/weaver/processes/builtin/metalink2netcdf.py index dc3994c4b..df3000ed0 100644 --- a/weaver/processes/builtin/metalink2netcdf.py +++ b/weaver/processes/builtin/metalink2netcdf.py @@ -16,18 +16,21 @@ # place weaver specific imports after sys path fixing to ensure they are found from external call # pylint: disable=C0413,wrong-import-order -from weaver import xml_util # isort:skip # noqa: E402 +from weaver import WEAVER_ROOT_DIR, xml_util # isort:skip # noqa: E402 +from weaver.processes.builtin.utils import validate_file_reference # isort:skip # noqa: E402 from weaver.utils import fetch_file # isort:skip # noqa: E402 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] +PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0] +PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".") # setup logger since it is not run from the main 'weaver' app -LOGGER = logging.getLogger(__name__) +LOGGER = logging.getLogger(PACKAGE_MODULE) LOGGER.addHandler(logging.StreamHandler(sys.stdout)) LOGGER.setLevel(logging.INFO) # process details -__version__ = "1.2" +__version__ = "1.3" __title__ = "Metalink to NetCDF" __abstract__ = __doc__ # NOTE: '__doc__' is fetched directly, this is mostly to be informative @@ -38,23 +41,24 @@ def m2n(metalink_reference, index, output_dir): "Got arguments: metalink_reference=%s index=%s output_dir=%s", metalink_reference, index, output_dir ) LOGGER.info("Process '%s' execution starting...", PACKAGE_NAME) - LOGGER.debug("Process '%s' output directory: [%s].", PACKAGE_NAME, output_dir) + LOGGER.info("Process '%s' output directory: [%s].", PACKAGE_NAME, output_dir) try: if not os.path.isdir(output_dir): raise ValueError(f"Output dir [{output_dir}] does not exist.") + validate_file_reference(metalink_reference) with TemporaryDirectory(prefix=f"wps_process_{PACKAGE_NAME}_") as tmp_dir: - LOGGER.debug("Fetching Metalink file: [%s]", metalink_reference) + LOGGER.info("Fetching Metalink file: [%s]", metalink_reference) metalink_path = fetch_file(metalink_reference, tmp_dir, timeout=10, retry=3) - LOGGER.debug("Reading Metalink file: [%s]", metalink_path) + LOGGER.info("Reading Metalink file: [%s]", metalink_path) xml_data = xml_util.parse(metalink_path) - LOGGER.debug("Parsing Metalink file references.") + LOGGER.info("Parsing Metalink file references.") nc_file_url = xml_data.xpath(f"string(//metalink/file[{index}]/metaurl)") - LOGGER.debug("Fetching NetCDF reference from Metalink file: [%s]", metalink_reference) - LOGGER.debug("NetCDF file URL : %s", nc_file_url) + LOGGER.info("Fetching NetCDF reference from Metalink file: [%s]", metalink_reference) + LOGGER.info("NetCDF file URL : %s", nc_file_url) fetch_file(nc_file_url, output_dir) except Exception as exc: # log only debug for tracking, re-raise and actual error wil be logged by top process monitor - LOGGER.debug("Process '%s' raised an exception: [%s]", PACKAGE_NAME, exc) + LOGGER.info("Process '%s' raised an exception: [%s]", PACKAGE_NAME, exc) raise LOGGER.info("Process '%s' execution completed.", PACKAGE_NAME) diff --git a/weaver/processes/builtin/utils.py b/weaver/processes/builtin/utils.py index b552ba094..7ec9a863f 100644 --- a/weaver/processes/builtin/utils.py +++ b/weaver/processes/builtin/utils.py @@ -7,8 +7,36 @@ def is_netcdf_url(url): # type: (Any) -> bool - if not isinstance(url, str): - return False - if urlparse(url).scheme == "": + """ + Validates that the reference is a remote NetCDF file reference. + """ + try: + validate_file_reference(url) + except (TypeError, ValueError): return False return os.path.splitext(url)[-1] == get_extension(ContentType.APP_NETCDF) + + +def validate_file_reference(url): + # type: (str) -> None + """ + Ensures that the provided reference points to a valide remote file or a temporary :term:`CWL` intermediate file. + + In order to avoid bypassing security validation of server file access between jobs, remote locations must be + enforced. However, :term:`CWL` temporary files must be allowed through for intermediate locations passed around + between :term:`Workflow` steps. + """ + if not isinstance(url, str): + raise TypeError(f"Not a valid URL: [{url!s}]") + if url.endswith("/"): + raise ValueError(f"Not a valid file URL reference [{url}]. Directory not supported.") + cwl_files = [ + "file:///tmp/cwltool_out_", + "file:///tmp/cwltool_tmp_", + "/tmp/cwltool_out_", + "/tmp/cwltool_tmp_", + ] + if any(url.startswith(path) for path in cwl_files): + return + if urlparse(url).scheme not in ["http", "https", "s3"]: + raise ValueError(f"Not a valid file URL reference [{url}]. Scheme not supported.") diff --git a/weaver/utils.py b/weaver/utils.py index 18cc4cfbe..c0911c0cd 100644 --- a/weaver/utils.py +++ b/weaver/utils.py @@ -1146,6 +1146,7 @@ def get_href_headers(path, # type: str """ Obtain headers applicable for the provided file or directory reference. + :rtype: object :param path: File to describe. Either a local path or remote URL. :param download_headers: If enabled, add the ``Content-Disposition`` header with attachment filename for downloading the file. @@ -2044,6 +2045,29 @@ def get_secure_directory_name(location): return unique_directory_name +def get_secure_path(location): + # type: (str) -> str + """ + Obtain a secure path location with validation of each nested component. + """ + # consider path with potential scheme + parts = location.split("://", 1) + if len(parts) > 1: + scheme, ref = parts + else: + scheme, ref = None, parts[0] + + # validate parts + parts = ref.split("/") + for i, part in enumerate(parts): + parts[i] = get_secure_filename(part) + start = "/" if ref.startswith("/") else "" + trail = "/" if ref.endswith("/") and ref != "/" else "" + secure_ref = start + "/".join(path for path in parts if path) + trail + secure_loc = f"{scheme}://{secure_ref}" if scheme else secure_ref + return secure_loc + + def download_file_http(file_reference, file_outdir, settings=None, callback=None, **request_kwargs): # type: (str, str, Optional[AnySettingsContainer], Optional[Callable[[str], None]], **Any) -> str """ @@ -2421,37 +2445,38 @@ def fetch_file(file_reference, # type: str :raises HTTPException: applicable HTTP-based exception if any occurred during the operation. :raises ValueError: when the reference scheme cannot be identified. """ - if file_reference.startswith("file://"): - file_reference = file_reference[7:] - file_href = file_reference - file_name = os.path.basename(os.path.realpath(file_reference)) # resolve any different name to use the original + file_href = file_reference # keep original for reporting in case of error + if file_href.startswith("file://"): + file_href = file_href[7:] + file_name = os.path.basename(os.path.realpath(file_href)) # resolve any different name to use the original file_name = get_secure_filename(file_name) file_path = os.path.join(file_outdir, file_name) LOGGER.debug("Fetching file reference: [%s] using options:\n%s", file_href, repr_json(option_kwargs)) options, kwargs = resolve_scheme_options(**option_kwargs) - if os.path.isfile(file_reference): + if os.path.isfile(file_href): LOGGER.debug("Fetch file resolved as local reference.") + file_href = get_secure_path(file_href) file_path = adjust_file_local(file_href, file_outdir, out_method) - elif file_reference.startswith("s3://"): + elif file_href.startswith("s3://"): LOGGER.debug("Fetch file resolved as S3 bucket reference.") s3_params = resolve_s3_http_options(**options["http"], **kwargs) s3_region = options["s3"].pop("region_name", None) - s3_bucket, file_key, s3_region_ref = resolve_s3_reference(file_reference) + s3_bucket, file_key, s3_region_ref = resolve_s3_reference(file_href) if s3_region and s3_region_ref and s3_region != s3_region_ref: raise ValueError("Invalid AWS S3 reference. " f"Input region name [{s3_region}] mismatches reference region [{s3_region_ref}].") s3_region = s3_region_ref or s3_region s3_client = boto3.client("s3", region_name=s3_region, **s3_params) # type: S3Client s3_client.download_file(s3_bucket, file_key, file_path, Callback=callback) - elif file_reference.startswith("http"): + elif file_href.startswith("http"): # pseudo-http URL referring to S3 bucket, try to redirect to above S3 handling method if applicable - if file_reference.startswith("https://s3.") or urlparse(file_reference).hostname.endswith(".amazonaws.com"): + if file_href.startswith("https://s3.") or urlparse(file_href).hostname.endswith(".amazonaws.com"): LOGGER.debug("Detected HTTP-like S3 bucket file reference. Retrying file fetching with S3 reference.") - s3_ref, s3_region = resolve_s3_from_http(file_reference) + s3_ref, s3_region = resolve_s3_from_http(file_href) option_kwargs.pop("s3_region", None) return fetch_file(s3_ref, file_outdir, settings=settings, s3_region_name=s3_region, **option_kwargs) file_path = download_file_http( - file_reference, + file_href, file_outdir, settings=settings, callback=callback, @@ -2459,7 +2484,7 @@ def fetch_file(file_reference, # type: str **kwargs ) else: - scheme = file_reference.split("://") + scheme = file_reference.split("://", 1) scheme = "" if len(scheme) < 2 else scheme[0] raise ValueError( f"Unresolved location and/or fetch file scheme: '{scheme!s}', " @@ -3122,13 +3147,16 @@ def adjust_directory_local(location, # type: Path LOGGER.debug("Local directory reference [%s] matches output, but desired listing differs. " "Removing additional items:\n%s", loc_dir, repr_json(extras)) for file_path in extras: - os.remove(file_path) + file_path = get_secure_path(file_path) + if os.path.isfile(file_path): + os.remove(file_path) return filtered # Any operation (islink, remove, etc.) that must operate on the link itself rather than the directory it points # to must not have the final '/' in the path. Otherwise, the link path (without final '/') is resolved before # evaluating the operation, which make them attempt their call on the real directory itself. link_dir = location.rstrip("/") + link_dir = get_secure_path(link_dir) if (os.path.exists(out_dir) and not os.path.isdir(out_dir)) or (os.path.isdir(out_dir) and os.listdir(out_dir)): LOGGER.debug("References under [%s] cannot be placed under target path [%s] " diff --git a/weaver/vault/utils.py b/weaver/vault/utils.py index 8e9b9efca..29cce8663 100644 --- a/weaver/vault/utils.py +++ b/weaver/vault/utils.py @@ -13,7 +13,7 @@ from weaver.datatype import VaultFile from weaver.formats import repr_json from weaver.store.base import StoreVault -from weaver.utils import get_header, get_settings, get_weaver_url, is_uuid +from weaver.utils import get_header, get_secure_path, get_settings, get_weaver_url, is_uuid from weaver.wps_restapi import swagger_definitions as sd if TYPE_CHECKING: @@ -54,7 +54,9 @@ def get_vault_path(file, container=None): Get the full path of the vault file. """ vault_dir = get_vault_dir(container) - return os.path.join(vault_dir, file.name) + vault_path = os.path.join(vault_dir, file.name) + vault_path = get_secure_path(vault_path) + return vault_path def get_vault_url(file, container=None): @@ -269,4 +271,5 @@ def decrypt_from_vault(vault_file, path, out_dir=None, delete_encrypted=False): out_file.seek(0) if delete_encrypted: os.remove(path) - return out_file.name + vault_path = get_secure_path(out_file.name) + return vault_path diff --git a/weaver/vault/views.py b/weaver/vault/views.py index 1a410696b..a80b8de96 100644 --- a/weaver/vault/views.py +++ b/weaver/vault/views.py @@ -127,7 +127,7 @@ def describe_file(request): ) headers["Content-Location"] = get_vault_url(vault_file, request) finally: - if os.path.isfile(tmp_file): + if tmp_file and os.path.isfile(tmp_file): os.remove(tmp_file) return HTTPHeadFileResponse(code=200, headers=headers) diff --git a/weaver/wps_restapi/jobs/utils.py b/weaver/wps_restapi/jobs/utils.py index 7accb0a33..445a7fe35 100644 --- a/weaver/wps_restapi/jobs/utils.py +++ b/weaver/wps_restapi/jobs/utils.py @@ -44,6 +44,7 @@ get_href_headers, get_path_kvp, get_sane_name, + get_secure_path, get_settings, get_weaver_url, is_uuid @@ -295,6 +296,7 @@ def make_result_link(result_id, result, job_id, settings): loc = os.path.join(str(job_id), f"{result_id}{suffix}.txt") url = f"{wps_url}/{loc}" path = os.path.join(out, loc) + path = get_secure_path(path) with open(path, mode="w", encoding=enc) as out_file: out_file.write(val) else: From b91dd8431cbe405639c949eb0318920e89c09ac0 Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Fri, 3 Nov 2023 13:10:22 -0400 Subject: [PATCH 2/5] fix linting + update versions of modified builtin processes --- tests/functional/test_builtin.py | 2 +- tests/utils.py | 6 +++--- weaver/processes/builtin/file2string_array.py | 2 +- weaver/processes/builtin/file_index_selector.py | 4 ++-- weaver/processes/builtin/jsonarray2netcdf.py | 2 +- weaver/processes/builtin/metalink2netcdf.py | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/functional/test_builtin.py b/tests/functional/test_builtin.py index 54f990e2a..95b22e95d 100644 --- a/tests/functional/test_builtin.py +++ b/tests/functional/test_builtin.py @@ -165,7 +165,7 @@ def test_jsonarray2netcdf_execute_invalid_file_local(self): Validate that local file path as input is not permitted anymore. """ with contextlib.ExitStack() as stack_exec: - body, nc_data = self.setup_inputs(stack_exec, use_temp_file=True) + body, _ = self.setup_inputs(stack_exec, use_temp_file=True) body.update({ "mode": ExecuteMode.ASYNC, "response": ExecuteResponse.DOCUMENT, diff --git a/tests/utils.py b/tests/utils.py index dc081697b..31d3edcc9 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -884,13 +884,13 @@ class FileServer(SimpleHTTPTestServer): This server takes more time to start than usual mocks. Use it sparingly, and consider maintaining a single instance over multiple tests of a complete test suite rather than recreating a server for each test. """ - def __init__(self): + def __init__(self): # pylint: disable=W0231 self._port = self.get_port() self._uri = f"http://0.0.0.0:{self._port}" # purposely call 'HTTPTestServer' instead of 'SimpleHTTPTestServer' to enforce the URI - # otherwise, 'socket.gethostname()' is used (machine name), and the obtained URI fail our schema validation - HTTPTestServer.__init__(self, hostname="0.0.0.0", port=self._port, uri=self._uri) + # otherwise, 'socket.gethostname()' is used (machine name), and the obtained URI fails our schema validation + HTTPTestServer.__init__(self, hostname="0.0.0.0", port=self._port, uri=self._uri) # pylint: disable=W0233 self.cwd = self.document_root diff --git a/weaver/processes/builtin/file2string_array.py b/weaver/processes/builtin/file2string_array.py index 5bbc7e5c4..858384843 100644 --- a/weaver/processes/builtin/file2string_array.py +++ b/weaver/processes/builtin/file2string_array.py @@ -19,7 +19,7 @@ from weaver.processes.builtin.utils import validate_file_reference # isort:skip # noqa: E402 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] -PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0] +PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0] PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".") # setup logger since it is not run from the main 'weaver' app diff --git a/weaver/processes/builtin/file_index_selector.py b/weaver/processes/builtin/file_index_selector.py index 6a8d7b2f0..37b245426 100644 --- a/weaver/processes/builtin/file_index_selector.py +++ b/weaver/processes/builtin/file_index_selector.py @@ -24,7 +24,7 @@ from weaver.utils import OutputMethod, fetch_file # isort:skip # noqa: E402 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] -PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0] +PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0] PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".") # setup logger since it is not run from the main 'weaver' app @@ -33,7 +33,7 @@ LOGGER.setLevel(logging.INFO) # process details -__version__ = "1.2" +__version__ = "1.3" __title__ = "File Index Selector" __abstract__ = __doc__ # NOTE: '__doc__' is fetched directly, this is mostly to be informative diff --git a/weaver/processes/builtin/jsonarray2netcdf.py b/weaver/processes/builtin/jsonarray2netcdf.py index 5a022a69c..ce9edc2f8 100644 --- a/weaver/processes/builtin/jsonarray2netcdf.py +++ b/weaver/processes/builtin/jsonarray2netcdf.py @@ -23,7 +23,7 @@ from weaver.utils import fetch_file, get_secure_path # isort:skip # noqa: E402 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] -PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0] +PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0] PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".") # setup logger since it is not run from the main 'weaver' app diff --git a/weaver/processes/builtin/metalink2netcdf.py b/weaver/processes/builtin/metalink2netcdf.py index 106ec11a0..f31c49a4f 100644 --- a/weaver/processes/builtin/metalink2netcdf.py +++ b/weaver/processes/builtin/metalink2netcdf.py @@ -22,7 +22,7 @@ from weaver.processes.builtin.utils import is_netcdf_url # isort:skip # noqa: E402 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] -PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0] +PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0] PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".") # setup logger since it is not run from the main 'weaver' app @@ -31,7 +31,7 @@ LOGGER.setLevel(logging.INFO) # process details -__version__ = "1.3" +__version__ = "1.4" __title__ = "Metalink to NetCDF" __abstract__ = __doc__ # NOTE: '__doc__' is fetched directly, this is mostly to be informative From 0c4abe3b2a6fe4b6a0c84a973f605ead1ff249cf Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Fri, 3 Nov 2023 18:00:57 -0400 Subject: [PATCH 3/5] fix broken tests due to new builtin input file path validation --- CHANGES.rst | 1 + tests/functional/test_builtin.py | 9 ++- tests/functional/test_cli.py | 2 +- tests/functional/test_workflow.py | 57 ++++++++++++------- tests/utils.py | 8 +-- weaver/processes/builtin/file2string_array.py | 4 +- .../processes/builtin/file_index_selector.py | 4 +- weaver/processes/builtin/jsonarray2netcdf.py | 6 +- weaver/processes/builtin/metalink2netcdf.py | 6 +- weaver/processes/builtin/utils.py | 27 ++++++--- 10 files changed, 76 insertions(+), 48 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 2180e3df3..5c0e31782 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -12,6 +12,7 @@ Changes Changes: -------- +- Add more secure path validations steps before fetching contents. - Disallow ``builtin`` processes expecting a user-provided input path to run with local file references such that they must respect any configured server-side remote file access rules instead of bypassing security validations through resolved local paths. diff --git a/tests/functional/test_builtin.py b/tests/functional/test_builtin.py index 95b22e95d..bbe85886a 100644 --- a/tests/functional/test_builtin.py +++ b/tests/functional/test_builtin.py @@ -448,7 +448,7 @@ def test_jsonarray2netcdf_process(): [ "/tmp/does-not-exist/fake-file.txt", # noqa "/tmp/does-not-exist/fake-file.nc", # noqa - ] + ], ] ) def test_jsonarray2netcdf_invalid_json(test_data): @@ -461,7 +461,12 @@ def test_jsonarray2netcdf_invalid_json(test_data): with pytest.raises(ValueError) as err: jsonarray2netcdf.main("-i", tmp_file.name, "-o", tmp_out_dir) - assert str(err.value) == "Invalid JSON file format, expected a plain array of NetCDF file URL strings." + valid_errors = [ + "Invalid JSON file format, expected a plain array of NetCDF file URL strings.", + "Invalid file format", + "Not a valid file URL reference", + ] + assert any(error in str(err.value) for error in valid_errors), f"Raised error [{err.value}] was not expected" @pytest.mark.parametrize( diff --git a/tests/functional/test_cli.py b/tests/functional/test_cli.py index be888f9fa..e629f104c 100644 --- a/tests/functional/test_cli.py +++ b/tests/functional/test_cli.py @@ -521,7 +521,7 @@ def test_execute_manual_monitor_status_and_download_results(self): """ result = self.run_execute_inputs_schema_variant("Execute_Echo_cwl_schema.yml", mock_exec=False) job_id = result.body["jobID"] - result = mocked_sub_requests(self.app, self.client.monitor, job_id, timeout=1, interval=1) + result = mocked_sub_requests(self.app, self.client.monitor, job_id, timeout=5, interval=1) assert result.success, result.text assert "undefined" not in result.message assert result.body.get("status") == Status.SUCCEEDED diff --git a/tests/functional/test_workflow.py b/tests/functional/test_workflow.py index 16cdfaede..f82e52a44 100644 --- a/tests/functional/test_workflow.py +++ b/tests/functional/test_workflow.py @@ -15,7 +15,6 @@ import mock import pytest -from pytest_server_fixtures.http import SimpleHTTPTestServer from pyramid import testing from pyramid.httpexceptions import HTTPConflict, HTTPCreated, HTTPNotFound, HTTPOk from pyramid.settings import asbool @@ -25,6 +24,7 @@ from tests.functional.utils import ResourcesUtil from tests.utils import ( + FileServer, get_settings_from_config_ini, get_settings_from_testapp, get_test_weaver_app, @@ -127,7 +127,7 @@ def __init__(self, @pytest.mark.workflow class WorkflowTestRunnerBase(ResourcesUtil, TestCase): """ - Runs an end-2-end test procedure on weaver configured as EMS located on specified `WEAVER_TEST_SERVER_HOSTNAME`. + Runs an end-2-end test procedure on weaver configured as EMS located on specified ``WEAVER_TEST_SERVER_HOSTNAME``. """ __settings__ = None test_processes_info = {} # type: Dict[WorkflowProcesses, ProcessInfo] @@ -166,6 +166,11 @@ class WorkflowTestRunnerBase(ResourcesUtil, TestCase): logger_field_indent = 2 # type: int log_full_trace = True # type: bool + file_server = None # type: FileServer + """ + File server made available to tests for emulating a remote HTTP location. + """ + WEAVER_URL = None # type: Optional[str] WEAVER_RESTAPI_URL = None # type: Optional[str] @@ -215,15 +220,15 @@ def setUpClass(cls): cls.WEAVER_TEST_JOB_GET_STATUS_INTERVAL = int(cls.get_option("WEAVER_TEST_JOB_GET_STATUS_INTERVAL", 5)) # server settings - cls.WEAVER_TEST_CONFIGURATION = cls.get_option("WEAVER_TEST_CONFIGURATION", WeaverConfiguration.EMS) - cls.WEAVER_TEST_SERVER_HOSTNAME = cls.get_option("WEAVER_TEST_SERVER_HOSTNAME", "") + cls.WEAVER_TEST_CONFIGURATION = conf = cls.get_option("WEAVER_TEST_CONFIGURATION", WeaverConfiguration.EMS) + cls.WEAVER_TEST_SERVER_HOSTNAME = host = cls.get_option("WEAVER_TEST_SERVER_HOSTNAME", "") cls.WEAVER_TEST_SERVER_BASE_PATH = cls.get_option("WEAVER_TEST_SERVER_BASE_PATH", "/weaver") cls.WEAVER_TEST_SERVER_API_PATH = cls.get_option("WEAVER_TEST_SERVER_API_PATH", "/") cls.WEAVER_TEST_CONFIG_INI_PATH = cls.get_option("WEAVER_TEST_CONFIG_INI_PATH") # none uses default path - if cls.WEAVER_TEST_SERVER_HOSTNAME in [None, ""]: + if host in [None, ""]: # running with a local-only Web Test application config = setup_config_with_mongodb(settings={ - "weaver.configuration": cls.WEAVER_TEST_CONFIGURATION, + "weaver.configuration": conf, # NOTE: # Because everything is running locally in this case, all processes should automatically map between # the two following dir/URL as equivalents locations, accordingly to what they require for execution. @@ -239,10 +244,10 @@ def setUpClass(cls): os.makedirs(cls.__settings__["weaver.wps_output_dir"], exist_ok=True) else: # running on a remote service (remote server or can be "localhost", but in parallel application) - if cls.WEAVER_TEST_SERVER_HOSTNAME.startswith("http"): - url = cls.WEAVER_TEST_SERVER_HOSTNAME + if host.startswith("http"): + url = host else: - url = f"http://{cls.WEAVER_TEST_SERVER_HOSTNAME}" + url = f"http://{host}" cls.app = WebTestApp(url) cls.WEAVER_URL = get_weaver_url(cls.settings.fget(cls)) cls.WEAVER_RESTAPI_URL = get_wps_restapi_base_url(cls.settings.fget(cls)) @@ -252,6 +257,9 @@ def setUpClass(cls): cls.setup_test_processes() cls.setup_test_processes_after() + cls.file_server = FileServer() + cls.file_server.start() + @property def settings(self): # type: (...) -> SettingsType @@ -276,6 +284,7 @@ def settings(self): @classmethod def tearDownClass(cls): cls.clean_test_processes() + cls.file_server.teardown() testing.tearDown() cls.log("%sEnd of '%s': %s\n%s", cls.logger_separator_cases, cls.current_case_name(), now(), cls.logger_separator_cases) @@ -952,7 +961,7 @@ def test_workflow_mixed_wps1_builtin_rest_docker_select_requirements(self): .. note:: Because ``jsonarray2netcdf`` is running in subprocess instantiated by :mod:`cwltool`, file-server - location cannot be mocked by the test suite. Employ local test paths as if they where already fetched. + location cannot be mocked by the test suite. Employ local test paths as if they were already fetched. """ with contextlib.ExitStack() as stack: @@ -1007,10 +1016,8 @@ def test_workflow_mixed_rest_builtin_wps1_docker_select_requirements(self): with contextlib.ExitStack() as stack: tmp_host = "https://mocked-file-server.com" # must match in 'Execute_WorkflowSelectCopyNestedOutDir.json' # jsonarray2netcdf cannot use local paths anymore for nested NetCDF, provide them through tmp file server - file_server = stack.enter_context(SimpleHTTPTestServer()) - file_server.start() - nc_dir = file_server.document_root - nc_url = file_server.uri + nc_dir = self.file_server.document_root + nc_url = self.file_server.uri nc_refs = [] for i in range(3): nc_name = f"test-file-{i}.nc" @@ -1059,7 +1066,9 @@ def test_workflow_mixed_rest_builtin_wps1_docker_scatter_requirements(self): .. note:: Because ``jsonarray2netcdf`` is running in subprocess instantiated by :mod:`cwltool`, file-server - location cannot be mocked by the test suite. Employ local test paths as if they where already fetched. + location cannot be mocked by the test suite. With security checks, they cannot be locally defined either, + since the :term:`CWL` temporary directory is not known in advance for respective ``CommandLineTool`` steps. + Employ a *real* file-server for test paths to emulate a remote location to be fetched. .. seealso:: Inverse :term:`WPS` / :term:`OGC API - Processes` process references from @@ -1072,9 +1081,10 @@ def test_workflow_mixed_rest_builtin_wps1_docker_scatter_requirements(self): nc_refs = [] for i in range(3): nc_name = f"test-file-{i}.nc" - nc_path = os.path.join(tmp_dir, nc_name) - nc_refs.append(f"file://{nc_path}") - with open(os.path.join(tmp_dir, nc_name), mode="w", encoding="utf-8") as tmp_file: + nc_path = os.path.join(self.file_server.document_root, nc_name) + nc_href = os.path.join(self.file_server.uri, nc_name) + nc_refs.append(nc_href) + with open(nc_path, mode="w", encoding="utf-8") as tmp_file: tmp_file.write(f"DUMMY NETCDF DATA #{i}") with open(os.path.join(tmp_dir, "netcdf-array.json"), mode="w", encoding="utf-8") as tmp_file: json.dump(nc_refs, tmp_file) # must match execution body @@ -1099,7 +1109,9 @@ def test_workflow_mixed_wps1_builtin_rest_docker_scatter_requirements(self): .. note:: Because ``jsonarray2netcdf`` is running in subprocess instantiated by :mod:`cwltool`, file-server - location cannot be mocked by the test suite. Employ local test paths as if they where already fetched. + location cannot be mocked by the test suite. With security checks, they cannot be locally defined either, + since the :term:`CWL` temporary directory is not known in advance for respective ``CommandLineTool`` steps. + Employ a *real* file-server for test paths to emulate a remote location to be fetched. .. seealso:: Inverse :term:`WPS` / :term:`OGC API - Processes` process references from @@ -1112,9 +1124,10 @@ def test_workflow_mixed_wps1_builtin_rest_docker_scatter_requirements(self): nc_refs = [] for i in range(3): nc_name = f"test-file-{i}.nc" - nc_path = os.path.join(tmp_dir, nc_name) - nc_refs.append(f"file://{nc_path}") - with open(os.path.join(tmp_dir, nc_name), mode="w", encoding="utf-8") as tmp_file: + nc_path = os.path.join(self.file_server.document_root, nc_name) + nc_href = os.path.join(self.file_server.uri, nc_name) + nc_refs.append(nc_href) + with open(nc_path, mode="w", encoding="utf-8") as tmp_file: tmp_file.write(f"DUMMY NETCDF DATA #{i}") with open(os.path.join(tmp_dir, "netcdf-array.json"), mode="w", encoding="utf-8") as tmp_file: json.dump(nc_refs, tmp_file) # must match execution body diff --git a/tests/utils.py b/tests/utils.py index 31d3edcc9..f9eb4c7b8 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -877,18 +877,18 @@ class FileServer(SimpleHTTPTestServer): Contrary to :func:`mocked_file_server` where requests are captured and redirected to the corresponding files, this server receives *real* requests (via a socket) and returns matched files. This is particularly important during tests that call subprocesses independently of the main Python test process - (e.g.: :term:`CWL` ``CommandLineTool` for a ``Builtin`` :term:`Process`), because mocks applied in the main - process are not reflected in the other ones. + (e.g.: :term:`CWL` ``CommandLineTool` for a ``Builtin`` :term:`Process`), because mocks applied in the main test + process are not reflected in the other subprocesses. .. warning:: This server takes more time to start than usual mocks. Use it sparingly, and consider maintaining a single - instance over multiple tests of a complete test suite rather than recreating a server for each test. + instance over multiple tests of a complete test suite rather than recreating a server for each test. """ def __init__(self): # pylint: disable=W0231 self._port = self.get_port() self._uri = f"http://0.0.0.0:{self._port}" - # purposely call 'HTTPTestServer' instead of 'SimpleHTTPTestServer' to enforce the URI + # purposely call 'HTTPTestServer' instead of 'SimpleHTTPTestServer' to enforce the URI hostname # otherwise, 'socket.gethostname()' is used (machine name), and the obtained URI fails our schema validation HTTPTestServer.__init__(self, hostname="0.0.0.0", port=self._port, uri=self._uri) # pylint: disable=W0233 self.cwd = self.document_root diff --git a/weaver/processes/builtin/file2string_array.py b/weaver/processes/builtin/file2string_array.py index 858384843..e5977462c 100644 --- a/weaver/processes/builtin/file2string_array.py +++ b/weaver/processes/builtin/file2string_array.py @@ -16,7 +16,7 @@ # place weaver specific imports after sys path fixing to ensure they are found from external call # pylint: disable=C0413,wrong-import-order from weaver import WEAVER_ROOT_DIR # isort:skip # noqa: E402 -from weaver.processes.builtin.utils import validate_file_reference # isort:skip # noqa: E402 +from weaver.processes.builtin.utils import validate_reference # isort:skip # noqa: E402 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0] @@ -38,7 +38,7 @@ def process(input_file, output_dir): # type: (str, str) -> None LOGGER.info("Got arguments: input_file=%s output_dir=%s", input_file, output_dir) - validate_file_reference(input_file) + validate_reference(input_file, is_file=True) output_data = {"output": [input_file]} with open(os.path.join(output_dir, OUTPUT_CWL_JSON), mode="w", encoding="utf-8") as file: return json.dump(output_data, file) diff --git a/weaver/processes/builtin/file_index_selector.py b/weaver/processes/builtin/file_index_selector.py index 37b245426..6396a0814 100644 --- a/weaver/processes/builtin/file_index_selector.py +++ b/weaver/processes/builtin/file_index_selector.py @@ -20,7 +20,7 @@ # place weaver specific imports after sys path fixing to ensure they are found from external call # pylint: disable=C0413,wrong-import-order from weaver import WEAVER_ROOT_DIR # isort:skip # noqa: E402 -from weaver.processes.builtin.utils import validate_file_reference # isort:skip # noqa: E402 +from weaver.processes.builtin.utils import validate_reference # isort:skip # noqa: E402 from weaver.utils import OutputMethod, fetch_file # isort:skip # noqa: E402 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] @@ -46,7 +46,7 @@ def select(files, index, output_dir): if not os.path.isdir(output_dir): raise ValueError(f"Output dir [{output_dir}] does not exist.") file_path = files[index] - validate_file_reference(file_path) + validate_reference(file_path, is_file=True) fetch_file(file_path, output_dir, out_method=OutputMethod.COPY) except Exception as exc: # log only debug for tracking, re-raise and actual error wil be logged by top process monitor diff --git a/weaver/processes/builtin/jsonarray2netcdf.py b/weaver/processes/builtin/jsonarray2netcdf.py index ce9edc2f8..1d97e951b 100644 --- a/weaver/processes/builtin/jsonarray2netcdf.py +++ b/weaver/processes/builtin/jsonarray2netcdf.py @@ -19,7 +19,7 @@ # pylint: disable=C0413,wrong-import-order from weaver import WEAVER_ROOT_DIR # isort:skip # noqa: E402 from weaver.formats import repr_json # isort:skip # noqa: E402 -from weaver.processes.builtin.utils import is_netcdf_url, validate_file_reference # isort:skip # noqa: E402 +from weaver.processes.builtin.utils import is_netcdf_url, validate_reference # isort:skip # noqa: E402 from weaver.utils import fetch_file, get_secure_path # isort:skip # noqa: E402 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] @@ -48,7 +48,7 @@ def j2n(json_reference, output_dir): raise ValueError(f"Output directory [{output_dir}] does not exist.") with TemporaryDirectory(prefix=f"wps_process_{PACKAGE_NAME}_") as tmp_dir: LOGGER.info("Verify URL reference: [%s]", json_reference) - validate_file_reference(json_reference) + validate_reference(json_reference, is_file=True) LOGGER.info("Fetching JSON file: [%s]", json_reference) json_path = fetch_file(json_reference, tmp_dir, timeout=10, retry=3) json_path = get_secure_path(json_path) @@ -61,7 +61,7 @@ def j2n(json_reference, output_dir): LOGGER.info("Parsing JSON file references from file contents:\n%s", repr_json(json_content)) for file_url in json_content: LOGGER.info("Validate NetCDF reference from JSON file: [%s]", file_url) - validate_file_reference(file_url) + validate_reference(file_url, is_file=True) if not is_netcdf_url(file_url): raise ValueError(f"Invalid file format for [{file_url}], expected a NetCDF file URL.") LOGGER.info("Fetching NetCDF reference from JSON file: [%s]", file_url) diff --git a/weaver/processes/builtin/metalink2netcdf.py b/weaver/processes/builtin/metalink2netcdf.py index f31c49a4f..3173a7bbe 100644 --- a/weaver/processes/builtin/metalink2netcdf.py +++ b/weaver/processes/builtin/metalink2netcdf.py @@ -17,7 +17,7 @@ # place weaver specific imports after sys path fixing to ensure they are found from external call # pylint: disable=C0413,wrong-import-order from weaver import WEAVER_ROOT_DIR, xml_util # isort:skip # noqa: E402 -from weaver.processes.builtin.utils import validate_file_reference # isort:skip # noqa: E402 +from weaver.processes.builtin.utils import validate_reference # isort:skip # noqa: E402 from weaver.utils import fetch_file # isort:skip # noqa: E402 from weaver.processes.builtin.utils import is_netcdf_url # isort:skip # noqa: E402 @@ -47,7 +47,7 @@ def m2n(metalink_reference, index, output_dir): if not os.path.isdir(output_dir): raise ValueError(f"Output dir [{output_dir}] does not exist.") LOGGER.info("Validating Metalink file: [%s]", metalink_reference) - validate_file_reference(metalink_reference) + validate_reference(metalink_reference, is_file=True) with TemporaryDirectory(prefix=f"wps_process_{PACKAGE_NAME}_") as tmp_dir: LOGGER.info("Fetching Metalink file: [%s]", metalink_reference) metalink_path = fetch_file(metalink_reference, tmp_dir, timeout=10, retry=3) @@ -67,7 +67,7 @@ def m2n(metalink_reference, index, output_dir): if not is_netcdf_url(nc_file_url): raise ValueError(f"Resolved file URL [{nc_file_url}] is not a valid NetCDF reference.") LOGGER.info("Validating NetCDF reference: [%s]", nc_file_url) - validate_file_reference(nc_file_url) + validate_reference(nc_file_url, is_file=True) LOGGER.info("Fetching NetCDF reference [%s] from Metalink file [%s]", nc_file_url, metalink_reference) fetch_file(nc_file_url, output_dir) except Exception as exc: diff --git a/weaver/processes/builtin/utils.py b/weaver/processes/builtin/utils.py index 7ec9a863f..fba144a5e 100644 --- a/weaver/processes/builtin/utils.py +++ b/weaver/processes/builtin/utils.py @@ -1,4 +1,5 @@ import os +import tempfile from typing import Any from urllib.parse import urlparse @@ -11,32 +12,40 @@ def is_netcdf_url(url): Validates that the reference is a remote NetCDF file reference. """ try: - validate_file_reference(url) + validate_reference(url, is_file=True) except (TypeError, ValueError): return False return os.path.splitext(url)[-1] == get_extension(ContentType.APP_NETCDF) -def validate_file_reference(url): - # type: (str) -> None +def validate_reference(url, is_file): + # type: (str, bool) -> None """ - Ensures that the provided reference points to a valide remote file or a temporary :term:`CWL` intermediate file. + Ensures that the provided reference points to a valid remote file or a temporary intermediate file. In order to avoid bypassing security validation of server file access between jobs, remote locations must be enforced. However, :term:`CWL` temporary files must be allowed through for intermediate locations passed around - between :term:`Workflow` steps. + between :term:`Workflow` steps or employed as temporary writing locations for file extraction purposes. """ if not isinstance(url, str): raise TypeError(f"Not a valid URL: [{url!s}]") - if url.endswith("/"): - raise ValueError(f"Not a valid file URL reference [{url}]. Directory not supported.") - cwl_files = [ + if (is_file and url.endswith("/")) or (not is_file and not url.endswith("/")): + dir_msg = "not supported" if is_file else "required" + raise ValueError(f"Not a valid file URL reference [{url}]. Directory path {dir_msg}.") + # When in a CWL step, tempdir will return the `/tmp/cwltool_tmp_...' path (since enforced by the tool). + # When executed in other situations, it will map to the environment variable or platform-specific tmp path. + # Although CWL will set TMPDIR for the current step, the source file could be coming from a previous step. + # Therefore, the random part of the path after 'cwltool_tmp_'/'cwltool_out_' could differ from the current ones. + tmp_dir = tempfile.gettempdir() + tmp_paths = [ + f"file://{tmp_dir}/", + f"{tmp_dir}/", "file:///tmp/cwltool_out_", "file:///tmp/cwltool_tmp_", "/tmp/cwltool_out_", "/tmp/cwltool_tmp_", ] - if any(url.startswith(path) for path in cwl_files): + if any(url.startswith(path) for path in tmp_paths): return if urlparse(url).scheme not in ["http", "https", "s3"]: raise ValueError(f"Not a valid file URL reference [{url}]. Scheme not supported.") From 31b82afe0f83dca52e1bbba30a10cee30c6929f7 Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Fri, 3 Nov 2023 18:14:43 -0400 Subject: [PATCH 4/5] ignore purposely set temp paths --- weaver/processes/builtin/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weaver/processes/builtin/utils.py b/weaver/processes/builtin/utils.py index fba144a5e..1db2a1e6e 100644 --- a/weaver/processes/builtin/utils.py +++ b/weaver/processes/builtin/utils.py @@ -42,8 +42,8 @@ def validate_reference(url, is_file): f"{tmp_dir}/", "file:///tmp/cwltool_out_", "file:///tmp/cwltool_tmp_", - "/tmp/cwltool_out_", - "/tmp/cwltool_tmp_", + "/tmp/cwltool_out_", # nosec: B108 + "/tmp/cwltool_tmp_", # nosec: B108 ] if any(url.startswith(path) for path in tmp_paths): return From b4f0fa1e46eaa1394ce7e2bb3b16134aec68df8b Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Fri, 3 Nov 2023 18:41:41 -0400 Subject: [PATCH 5/5] fix lint and reduce builtin process duplicated logic --- weaver/processes/builtin/file2string_array.py | 7 ++----- .../processes/builtin/file_index_selector.py | 7 ++----- weaver/processes/builtin/jsonarray2netcdf.py | 11 ++++++----- weaver/processes/builtin/metalink2netcdf.py | 8 +++----- weaver/processes/builtin/utils.py | 18 +++++++++++++++++- 5 files changed, 30 insertions(+), 21 deletions(-) diff --git a/weaver/processes/builtin/file2string_array.py b/weaver/processes/builtin/file2string_array.py index e5977462c..6c0b90eff 100644 --- a/weaver/processes/builtin/file2string_array.py +++ b/weaver/processes/builtin/file2string_array.py @@ -15,12 +15,9 @@ # place weaver specific imports after sys path fixing to ensure they are found from external call # pylint: disable=C0413,wrong-import-order -from weaver import WEAVER_ROOT_DIR # isort:skip # noqa: E402 -from weaver.processes.builtin.utils import validate_reference # isort:skip # noqa: E402 +from weaver.processes.builtin.utils import get_package_details, validate_reference # isort:skip # noqa: E402 -PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] -PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0] -PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".") +PACKAGE_NAME, PACKAGE_BASE, PACKAGE_MODULE = get_package_details(__file__) # setup logger since it is not run from the main 'weaver' app LOGGER = logging.getLogger(PACKAGE_MODULE) diff --git a/weaver/processes/builtin/file_index_selector.py b/weaver/processes/builtin/file_index_selector.py index 6396a0814..947f3bdcf 100644 --- a/weaver/processes/builtin/file_index_selector.py +++ b/weaver/processes/builtin/file_index_selector.py @@ -19,13 +19,10 @@ # place weaver specific imports after sys path fixing to ensure they are found from external call # pylint: disable=C0413,wrong-import-order -from weaver import WEAVER_ROOT_DIR # isort:skip # noqa: E402 -from weaver.processes.builtin.utils import validate_reference # isort:skip # noqa: E402 +from weaver.processes.builtin.utils import get_package_details, validate_reference # isort:skip # noqa: E402 from weaver.utils import OutputMethod, fetch_file # isort:skip # noqa: E402 -PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] -PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0] -PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".") +PACKAGE_NAME, PACKAGE_BASE, PACKAGE_MODULE = get_package_details(__file__) # setup logger since it is not run from the main 'weaver' app LOGGER = logging.getLogger(PACKAGE_MODULE) diff --git a/weaver/processes/builtin/jsonarray2netcdf.py b/weaver/processes/builtin/jsonarray2netcdf.py index 1d97e951b..012dbb84f 100644 --- a/weaver/processes/builtin/jsonarray2netcdf.py +++ b/weaver/processes/builtin/jsonarray2netcdf.py @@ -17,14 +17,15 @@ # place weaver specific imports after sys path fixing to ensure they are found from external call # pylint: disable=C0413,wrong-import-order -from weaver import WEAVER_ROOT_DIR # isort:skip # noqa: E402 from weaver.formats import repr_json # isort:skip # noqa: E402 -from weaver.processes.builtin.utils import is_netcdf_url, validate_reference # isort:skip # noqa: E402 +from weaver.processes.builtin.utils import ( # isort:skip # noqa: E402 + get_package_details, + is_netcdf_url, + validate_reference +) from weaver.utils import fetch_file, get_secure_path # isort:skip # noqa: E402 -PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] -PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0] -PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".") +PACKAGE_NAME, PACKAGE_BASE, PACKAGE_MODULE = get_package_details(__file__) # setup logger since it is not run from the main 'weaver' app LOGGER = logging.getLogger(PACKAGE_MODULE) diff --git a/weaver/processes/builtin/metalink2netcdf.py b/weaver/processes/builtin/metalink2netcdf.py index 3173a7bbe..44819bdf4 100644 --- a/weaver/processes/builtin/metalink2netcdf.py +++ b/weaver/processes/builtin/metalink2netcdf.py @@ -16,14 +16,12 @@ # place weaver specific imports after sys path fixing to ensure they are found from external call # pylint: disable=C0413,wrong-import-order -from weaver import WEAVER_ROOT_DIR, xml_util # isort:skip # noqa: E402 -from weaver.processes.builtin.utils import validate_reference # isort:skip # noqa: E402 +from weaver import xml_util # isort:skip # noqa: E402 +from weaver.processes.builtin.utils import get_package_details, validate_reference # isort:skip # noqa: E402 from weaver.utils import fetch_file # isort:skip # noqa: E402 from weaver.processes.builtin.utils import is_netcdf_url # isort:skip # noqa: E402 -PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1] -PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0] -PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".") +PACKAGE_NAME, PACKAGE_BASE, PACKAGE_MODULE = get_package_details(__file__) # setup logger since it is not run from the main 'weaver' app LOGGER = logging.getLogger(PACKAGE_MODULE) diff --git a/weaver/processes/builtin/utils.py b/weaver/processes/builtin/utils.py index 1db2a1e6e..c7c52ebf4 100644 --- a/weaver/processes/builtin/utils.py +++ b/weaver/processes/builtin/utils.py @@ -1,10 +1,14 @@ import os import tempfile -from typing import Any +from typing import TYPE_CHECKING from urllib.parse import urlparse +from weaver import WEAVER_ROOT_DIR from weaver.formats import ContentType, get_extension +if TYPE_CHECKING: + from typing import Any, Tuple + def is_netcdf_url(url): # type: (Any) -> bool @@ -49,3 +53,15 @@ def validate_reference(url, is_file): return if urlparse(url).scheme not in ["http", "https", "s3"]: raise ValueError(f"Not a valid file URL reference [{url}]. Scheme not supported.") + + +def get_package_details(file): + # type: (os.PathLike[str]) -> Tuple[str, str, str] + """ + Obtains the ``builtin`` process details from its file reference. + """ + name = os.path.split(os.path.splitext(file)[0])[-1] + root = WEAVER_ROOT_DIR.rstrip("/") # avoid double // + path = str(file).rsplit(f"{root}/", 1)[-1].rsplit(name)[0] + mod = f"{path}{name}".replace("/", ".") + return name, path, mod