From 30760746af9ad9f401216a2d7c78d8452bcccbec Mon Sep 17 00:00:00 2001
From: Francis Charette Migneault <francis.charette.migneault@gmail.com>
Date: Sat, 1 Apr 2023 02:59:57 -0400
Subject: [PATCH 1/5] security fixes and improvements to handle file paths

---
 CHANGES.rst                                   |  5 +-
 requirements-dev.txt                          |  1 +
 tests/functional/test_builtin.py              | 50 ++++++++++++++---
 tests/functional/test_cli.py                  |  4 +-
 tests/functional/test_workflow.py             | 16 ++++--
 tests/functional/utils.py                     |  2 +-
 tests/test_utils.py                           | 15 ++++++
 tests/utils.py                                | 25 +++++++++
 weaver/cli.py                                 |  2 +-
 weaver/processes/builtin/__init__.py          | 10 +++-
 weaver/processes/builtin/file2string_array.py | 17 ++++--
 .../processes/builtin/file_index_selector.py  | 17 ++++--
 weaver/processes/builtin/jsonarray2netcdf.py  | 35 +++++++-----
 weaver/processes/builtin/metalink2netcdf.cwl  |  3 ++
 weaver/processes/builtin/metalink2netcdf.py   | 24 +++++----
 weaver/processes/builtin/utils.py             | 34 ++++++++++--
 weaver/utils.py                               | 54 ++++++++++++++-----
 weaver/vault/utils.py                         |  9 ++--
 weaver/vault/views.py                         |  2 +-
 weaver/wps_restapi/jobs/utils.py              |  2 +
 20 files changed, 260 insertions(+), 67 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index a81358e2b..c6d388e26 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -12,7 +12,10 @@ Changes
 
 Changes:
 --------
-- No change.
+- Disallow ``builtin`` `Processes` expecting a user-provided input path to run with local file references such that
+  they must respect any configured server-side remote file access rules instead of bypassing security validations
+  through resolved local paths.
+- Add multiple validation checks for more secure file paths handling when retrieving contents from remote locations.
 
 Fixes:
 ------
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 6ef229beb..4ec0c7e43 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -22,6 +22,7 @@ parameterized
 pluggy>=0.7
 # FIXME: bad interpolation of 'setup.cfg' for pytest 'log_format' (https://github.com/pytest-dev/pytest/issues/10019)
 pytest<7
+pytest-server-fixtures
 pydocstyle
 pylint>=2.11,!=2.12,<2.14; python_version <= "3.6"
 pylint>=2.15.4; python_version >= "3.7"
diff --git a/tests/functional/test_builtin.py b/tests/functional/test_builtin.py
index 2e4067473..696ae2f66 100644
--- a/tests/functional/test_builtin.py
+++ b/tests/functional/test_builtin.py
@@ -7,7 +7,7 @@
 import pytest
 
 from tests.functional.utils import WpsConfigBase
-from tests.utils import get_settings_from_testapp, mocked_execute_celery, mocked_sub_requests
+from tests.utils import FileServer, get_settings_from_testapp, mocked_execute_celery, mocked_sub_requests
 from weaver.execute import ExecuteControlOption, ExecuteMode, ExecuteResponse, ExecuteTransmissionMode
 from weaver.formats import ContentType
 from weaver.processes.builtin import register_builtin_processes
@@ -32,6 +32,13 @@ def setUpClass(cls):
         cls.json_headers = {"Accept": ContentType.APP_JSON, "Content-Type": ContentType.APP_JSON}
         super(BuiltinAppTest, cls).setUpClass()
 
+        cls.file_server = FileServer()
+        cls.file_server.start()
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.file_server.teardown()
+
     def setUp(self):
         # register builtin processes from scratch to have clean state
         self.process_store.clear_processes()
@@ -86,18 +93,23 @@ def test_jsonarray2netcdf_describe_ogc_schema(self):
         assert body["jobControlOptions"] == [ExecuteControlOption.ASYNC, ExecuteControlOption.SYNC]
         assert body["outputTransmission"] == [ExecuteTransmissionMode.REFERENCE, ExecuteTransmissionMode.VALUE]
 
-    def setup_inputs(self, stack):
-        dirname = tempfile.gettempdir()
+    def setup_inputs(self, stack, use_temp_file=False):
+        if use_temp_file:
+            dir_path = tempfile.gettempdir()
+            url_path = f"file://{dir_path}"
+        else:
+            dir_path = self.file_server.document_root
+            url_path = self.file_server.uri
         nc_data = "Hello NetCDF!"
-        tmp_ncdf = tempfile.NamedTemporaryFile(dir=dirname, mode="w", suffix=".nc")     # pylint: disable=R1732
-        tmp_json = tempfile.NamedTemporaryFile(dir=dirname, mode="w", suffix=".json")   # pylint: disable=R1732
+        tmp_ncdf = tempfile.NamedTemporaryFile(dir=dir_path, mode="w", suffix=".nc")     # pylint: disable=R1732
+        tmp_json = tempfile.NamedTemporaryFile(dir=dir_path, mode="w", suffix=".json")   # pylint: disable=R1732
         tmp_ncdf = stack.enter_context(tmp_ncdf)  # noqa
         tmp_json = stack.enter_context(tmp_json)  # noqa
         tmp_ncdf.write(nc_data)
         tmp_ncdf.seek(0)
-        tmp_json.write(json.dumps([f"file://{os.path.join(dirname, tmp_ncdf.name)}"]))
+        tmp_json.write(json.dumps([f"{url_path}/{os.path.basename(tmp_ncdf.name)}"]))
         tmp_json.seek(0)
-        body = {"inputs": [{"id": "input", "href": os.path.join(dirname, tmp_json.name)}]}
+        body = {"inputs": [{"id": "input", "href": f"{url_path}/{os.path.basename(tmp_json.name)}"}]}
         return body, nc_data
 
     def validate_results(self, results, outputs, data, links):
@@ -147,6 +159,30 @@ def validate_results(self, results, outputs, data, links):
         nc_real_path = nc_href.replace(wps_out, wps_dir)
         assert os.path.split(nc_real_path)[-1] == os.path.split(nc_href)[-1]
 
+    def test_jsonarray2netcdf_execute_invalid_file_local(self):
+        """
+        Validate that local file path as input is not permitted anymore.
+        """
+        with contextlib.ExitStack() as stack_exec:
+            body, nc_data = self.setup_inputs(stack_exec, use_temp_file=True)
+            body.update({
+                "mode": ExecuteMode.ASYNC,
+                "response": ExecuteResponse.DOCUMENT,
+                "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.VALUE}],
+            })
+            for mock_exec in mocked_execute_celery():
+                stack_exec.enter_context(mock_exec)
+            path = "/processes/jsonarray2netcdf/jobs"
+            resp = mocked_sub_requests(self.app, "post_json", path,
+                                       data=body, headers=self.json_headers, only_local=True)
+        assert resp.status_code == 201
+
+        job_url = resp.json["location"]
+        job_res = self.monitor_job(job_url, expect_failed=True)
+        assert job_res["status"] == Status.FAILED
+        job_logs = self.app.get(f"{job_url}/logs").json
+        assert any("ValueError: Not a valid file URL reference" in log for log in job_logs)
+
     def test_jsonarray2netcdf_execute_async(self):
         with contextlib.ExitStack() as stack_exec:
             body, nc_data = self.setup_inputs(stack_exec)
diff --git a/tests/functional/test_cli.py b/tests/functional/test_cli.py
index a80df5cd6..5f345396a 100644
--- a/tests/functional/test_cli.py
+++ b/tests/functional/test_cli.py
@@ -459,7 +459,7 @@ def test_execute_manual_monitor_status_and_download_results(self):
         Test a typical case of :term:`Job` execution, result retrieval and download, but with manual monitoring.
 
         Manual monitoring can be valid in cases where a *very* long :term:`Job` must be executed, and the user does
-        not intend to wait after it. This avoids leaving some shell/notebook/etc. open of a long time and provide a
+        not intend to wait for it. This avoids leaving some shell/notebook/etc. open of a long time and provide a
         massive ``timeout`` value. Instead, the user can simply re-call :meth:`WeaverClient.monitor` at a later time
         to resume monitoring. Other situation can be if the connection was dropped or script runner crashed, and the
         want to pick up monitoring again.
@@ -480,7 +480,7 @@ def test_execute_manual_monitor_status_and_download_results(self):
         assert result.body.get("status") == Status.SUCCEEDED
         links = result.body.get("links")
         assert isinstance(links, list)
-        assert len(list(filter(lambda _link: _link["rel"].endswith("results"), links))) == 1
+        assert len([_link for _link in links if _link["rel"].endswith("results")]) == 1
 
         # first test to get job results details, but not downloading yet
         result = mocked_sub_requests(self.app, self.client.results, job_id)
diff --git a/tests/functional/test_workflow.py b/tests/functional/test_workflow.py
index 453ff5bfd..a73a9bfe7 100644
--- a/tests/functional/test_workflow.py
+++ b/tests/functional/test_workflow.py
@@ -15,6 +15,7 @@
 
 import mock
 import pytest
+from pytest_server_fixtures.http import SimpleHTTPTestServer
 from pyramid import testing
 from pyramid.httpexceptions import HTTPConflict, HTTPCreated, HTTPNotFound, HTTPOk
 from pyramid.settings import asbool
@@ -992,14 +993,21 @@ def test_workflow_mixed_rest_builtin_wps1_docker_select_requirements(self):
 
         with contextlib.ExitStack() as stack:
             tmp_host = "https://mocked-file-server.com"  # must match in 'Execute_WorkflowSelectCopyNestedOutDir.json'
-            tmp_dir = stack.enter_context(tempfile.TemporaryDirectory())
+            # jsonarray2netcdf cannot use local paths anymore for nested NetCDF, provide them through tmp file server
+            file_server = stack.enter_context(SimpleHTTPTestServer())
+            file_server.start()
+            nc_dir = file_server.document_root
+            nc_url = file_server.uri
             nc_refs = []
             for i in range(3):
                 nc_name = f"test-file-{i}.nc"
-                nc_path = os.path.join(tmp_dir, nc_name)
-                nc_refs.append(f"file://{nc_path}")
-                with open(os.path.join(tmp_dir, nc_name), mode="w", encoding="utf-8") as tmp_file:
+                nc_path = os.path.join(nc_dir, nc_name)
+                nc_href = f"{nc_url}/{nc_name}"
+                nc_refs.append(nc_href)
+                with open(nc_path, mode="w", encoding="utf-8") as tmp_file:
                     tmp_file.write(f"DUMMY NETCDF DATA #{i}")
+
+            tmp_dir = stack.enter_context(tempfile.TemporaryDirectory())
             with open(os.path.join(tmp_dir, "netcdf-array.json"), mode="w", encoding="utf-8") as tmp_file:
                 json.dump(nc_refs, tmp_file)  # must match execution body
 
diff --git a/tests/functional/utils.py b/tests/functional/utils.py
index b11c00e40..f6be09217 100644
--- a/tests/functional/utils.py
+++ b/tests/functional/utils.py
@@ -164,7 +164,7 @@ def retrieve_payload(cls, process, ref_type=None, ref_name=None, ref_found=False
             if local:
                 var_locations = [APP_PKG_ROOT]
             else:
-                base_url = "https://raw.githubusercontent.com/"
+                base_url = "https://raw.githubusercontent.com"
                 var_locations = list(dict.fromkeys([  # don't use set to preserve this prioritized order
                     APP_PKG_ROOT,
                     os.getenv("TEST_GITHUB_SOURCE_URL"),
diff --git a/tests/test_utils.py b/tests/test_utils.py
index c22432f1d..7c4b325af 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -67,6 +67,7 @@
     get_sane_name,
     get_secure_directory_name,
     get_secure_filename,
+    get_secure_path,
     get_ssl_verify_option,
     get_url_without_query,
     is_update_version,
@@ -805,6 +806,20 @@ def mock_uuid():
         assert result == fake_uuid
 
 
+@pytest.mark.parametrize(
+    ["test_path", "expect_path"],
+    [
+        ("/tmp/././/../.././test.txt", "/tmp/test.txt"),
+        ("./../../../../../tmp/test.txt", "tmp/test.txt"),
+        ("file://./../../../../../tmp/test.txt", "file://tmp/test.txt"),
+    ]
+)
+def test_get_secure_path(test_path, expect_path):
+    # type: (str, str) -> None
+    result = get_secure_path(test_path)
+    assert result == expect_path
+
+
 @pytest.mark.parametrize(
     [
         "include_dir_heading",
diff --git a/tests/utils.py b/tests/utils.py
index 810071dcd..389f4e2f7 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -32,6 +32,7 @@
 from pyramid.httpexceptions import HTTPException, HTTPNotFound, HTTPUnprocessableEntity
 from pyramid.registry import Registry
 from pyramid.testing import DummyRequest
+from pytest_server_fixtures.http import HTTPTestServer, SimpleHTTPTestServer
 from requests import Response
 from webtest import TestApp, TestResponse
 
@@ -845,6 +846,30 @@ def mocked_dir_listing(local_directory,             # type: str
     return dir_html
 
 
+class FileServer(SimpleHTTPTestServer):
+    """
+    Generate a file server that can host files under :attr:`document_root`.
+
+    Contrary to :func:`mocked_file_server` where requests are captured and redirected to the corresponding files,
+    this server receives *real* requests (via a socket) and returns matched files. This is particularly important
+    during tests that call subprocesses independently of the main Python test process
+    (e.g.: :term:`CWL` ``CommandLineTool` for a ``Builtin`` :term:`Process`), because mocks applied in the main
+    process are not reflected in the other ones.
+
+    .. warning::
+        This server takes more time to start than usual mocks. Use it sparingly, and consider maintaining a single
+         instance over multiple tests of a complete test suite rather than recreating a server for each test.
+    """
+    def __init__(self):
+        self._port = self.get_port()
+        self._uri = f"http://0.0.0.0:{self._port}"
+
+        # purposely call 'HTTPTestServer' instead of 'SimpleHTTPTestServer' to enforce the URI
+        # otherwise, 'socket.gethostname()' is used (machine name), and the obtained URI fail our schema validation
+        HTTPTestServer.__init__(self, hostname="0.0.0.0", port=self._port, uri=self._uri)
+        self.cwd = self.document_root
+
+
 def mocked_file_server(directory,                   # type: str
                        url,                         # type: str
                        settings,                    # type: SettingsType
diff --git a/weaver/cli.py b/weaver/cli.py
index a908da28c..3e9baa13a 100644
--- a/weaver/cli.py
+++ b/weaver/cli.py
@@ -1010,7 +1010,7 @@ def execute(self,
             Execution requests are always accomplished asynchronously. To obtain the final :term:`Job` status as if
             they were executed synchronously, provide the :paramref:`monitor` argument. This offers more flexibility
             over servers that could decide to ignore sync/async preferences, and avoids closing/timeout connection
-            errors that could occur for long running processes, since status is pooled iteratively rather than waiting.
+            errors that could occur for long-running processes, since status is pooled iteratively rather than waiting.
 
         :param process_id: Identifier of the local or remote process to execute.
         :param provider_id: Identifier of the provider from which to locate a remote process to execute.
diff --git a/weaver/processes/builtin/__init__.py b/weaver/processes/builtin/__init__.py
index 3932aad24..00225813a 100644
--- a/weaver/processes/builtin/__init__.py
+++ b/weaver/processes/builtin/__init__.py
@@ -32,7 +32,7 @@
     from cwltool.context import RuntimeContext
     from cwltool.pathmapper import PathMapper
 
-    from weaver.typedefs import AnyRegistryContainer, CWL, CWL_RequirementsList, JSON, TypedDict
+    from weaver.typedefs import AnySettingsContainer, CWL, CWL_RequirementsList, JSON, TypedDict
 
     BuiltinResourceMap = TypedDict("BuiltinResourceMap", {
         "package": str,
@@ -123,11 +123,17 @@ def _get_builtin_package(process_id, package):
 
 
 def register_builtin_processes(container):
-    # type: (AnyRegistryContainer) -> None
+    # type: (AnySettingsContainer) -> None
     """
     Registers every ``builtin`` CWL package to the processes database.
 
     CWL definitions must be located within the :mod:`weaver.processes.builtin` module.
+
+    .. note::
+        Although any settings can be provided as input, it is better to specify a
+        :class:`pyramid.registry.Registry` or :class:`pyramid.request.Request` instance
+        in order to retrieve any pre-established database connection stored as reference.
+        Specifying configuration settings will force recreation of a new database connection.
     """
     restapi_url = get_wps_restapi_base_url(container)
     builtin_apps_mapping = _get_builtin_reference_mapping(os.path.abspath(os.path.dirname(__file__)))
diff --git a/weaver/processes/builtin/file2string_array.py b/weaver/processes/builtin/file2string_array.py
index 2e9cbdad3..5bbc7e5c4 100644
--- a/weaver/processes/builtin/file2string_array.py
+++ b/weaver/processes/builtin/file2string_array.py
@@ -9,16 +9,26 @@
 import sys
 
 CUR_DIR = os.path.abspath(os.path.dirname(__file__))
+sys.path.insert(0, CUR_DIR)
+# root to allow 'from weaver import <...>'
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(CUR_DIR))))
+
+# place weaver specific imports after sys path fixing to ensure they are found from external call
+# pylint: disable=C0413,wrong-import-order
+from weaver import WEAVER_ROOT_DIR  # isort:skip # noqa: E402
+from weaver.processes.builtin.utils import validate_file_reference  # isort:skip # noqa: E402
 
 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
+PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0]
+PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".")
 
 # setup logger since it is not run from the main 'weaver' app
-LOGGER = logging.getLogger(__name__)
+LOGGER = logging.getLogger(PACKAGE_MODULE)
 LOGGER.addHandler(logging.StreamHandler(sys.stdout))
 LOGGER.setLevel(logging.INFO)
 
 # process details
-__version__ = "1.2"
+__version__ = "1.3"
 __title__ = "File to String-Array"
 __abstract__ = __doc__  # NOTE: '__doc__' is fetched directly, this is mostly to be informative
 
@@ -26,8 +36,9 @@
 
 
 def process(input_file, output_dir):
-    # type: (argparse.FileType, str) -> None
+    # type: (str, str) -> None
     LOGGER.info("Got arguments: input_file=%s output_dir=%s", input_file, output_dir)
+    validate_file_reference(input_file)
     output_data = {"output": [input_file]}
     with open(os.path.join(output_dir, OUTPUT_CWL_JSON), mode="w", encoding="utf-8") as file:
         return json.dump(output_data, file)
diff --git a/weaver/processes/builtin/file_index_selector.py b/weaver/processes/builtin/file_index_selector.py
index 37b6fcc22..93fb96bb6 100644
--- a/weaver/processes/builtin/file_index_selector.py
+++ b/weaver/processes/builtin/file_index_selector.py
@@ -6,7 +6,6 @@
 import argparse
 import logging
 import os
-import shutil
 import sys
 from typing import TYPE_CHECKING
 
@@ -18,15 +17,23 @@
 # root to allow 'from weaver import <...>'
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(CUR_DIR))))
 
+# place weaver specific imports after sys path fixing to ensure they are found from external call
+# pylint: disable=C0413,wrong-import-order
+from weaver import WEAVER_ROOT_DIR  # isort:skip # noqa: E402
+from weaver.processes.builtin.utils import validate_file_reference  # isort:skip # noqa: E402
+from weaver.utils import OutputMethod, fetch_file  # isort:skip # noqa: E402
+
 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
+PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0]
+PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".")
 
 # setup logger since it is not run from the main 'weaver' app
-LOGGER = logging.getLogger(__name__)
+LOGGER = logging.getLogger(PACKAGE_MODULE)
 LOGGER.addHandler(logging.StreamHandler(sys.stdout))
 LOGGER.setLevel(logging.INFO)
 
 # process details
-__version__ = "1.1"
+__version__ = "1.2"
 __title__ = "File Index Selector"
 __abstract__ = __doc__  # NOTE: '__doc__' is fetched directly, this is mostly to be informative
 
@@ -38,7 +45,9 @@ def select(files, index, output_dir):
     try:
         if not os.path.isdir(output_dir):
             raise ValueError(f"Output dir [{output_dir}] does not exist.")
-        shutil.copy(files[index], output_dir)
+        file_path = files[index]
+        validate_file_reference(file_path)
+        fetch_file(file_path, output_dir, out_method=OutputMethod.COPY)
     except Exception as exc:
         # log only debug for tracking, re-raise and actual error wil be logged by top process monitor
         LOGGER.debug("Process '%s' raised an exception: [%s]", PACKAGE_NAME, exc)
diff --git a/weaver/processes/builtin/jsonarray2netcdf.py b/weaver/processes/builtin/jsonarray2netcdf.py
index 0b45985de..366d8d926 100644
--- a/weaver/processes/builtin/jsonarray2netcdf.py
+++ b/weaver/processes/builtin/jsonarray2netcdf.py
@@ -17,20 +17,24 @@
 
 # place weaver specific imports after sys path fixing to ensure they are found from external call
 # pylint: disable=C0413,wrong-import-order
-from weaver.processes.builtin.utils import is_netcdf_url  # isort:skip # noqa: E402
-from weaver.utils import fetch_file  # isort:skip # noqa: E402
+from weaver import WEAVER_ROOT_DIR  # isort:skip # noqa: E402
+from weaver.formats import repr_json  # isort:skip # noqa: E402
+from weaver.processes.builtin.utils import is_netcdf_url, validate_file_reference  # isort:skip # noqa: E402
+from weaver.utils import fetch_file, get_secure_path  # isort:skip # noqa: E402
 
 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
+PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0]
+PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".")
 
 # setup logger since it is not run from the main 'weaver' app
-LOGGER = logging.getLogger(PACKAGE_NAME)
+LOGGER = logging.getLogger(PACKAGE_MODULE)
 _handler = logging.StreamHandler(sys.stdout)  # noqa
 _handler.setFormatter(logging.Formatter(fmt="[%(name)s] %(levelname)-8s %(message)s"))
 LOGGER.addHandler(_handler)
-LOGGER.setLevel(logging.DEBUG)
+LOGGER.setLevel(logging.INFO)
 
 # process details
-__version__ = "2.2"
+__version__ = "2.3"
 __title__ = "JSON array to NetCDF"
 __abstract__ = __doc__  # NOTE: '__doc__' is fetched directly, this is mostly to be informative
 
@@ -38,24 +42,31 @@
 def j2n(json_reference, output_dir):
     # type: (str, str) -> None
     LOGGER.info("Process '%s' execution starting...", PACKAGE_NAME)
-    LOGGER.debug("Process '%s' output directory: [%s].", PACKAGE_NAME, output_dir)
+    LOGGER.info("Process '%s' output directory: [%s].", PACKAGE_NAME, output_dir)
     try:
         if not os.path.isdir(output_dir):
             raise ValueError(f"Output dir [{output_dir}] does not exist.")
         with TemporaryDirectory(prefix=f"wps_process_{PACKAGE_NAME}_") as tmp_dir:
-            LOGGER.debug("Fetching JSON file: [%s]", json_reference)
+            LOGGER.info("Verify URL reference: [%s]", json_reference)
+            validate_file_reference(json_reference)
+            LOGGER.info("Fetching JSON file: [%s]", json_reference)
             json_path = fetch_file(json_reference, tmp_dir, timeout=10, retry=3)
-            LOGGER.debug("Reading JSON file: [%s]", json_path)
+            json_path = get_secure_path(json_path)
+            LOGGER.info("Reading JSON file: [%s]", json_path)
             with open(json_path, mode="r", encoding="utf-8") as json_file:
                 json_content = json.load(json_file)
-            if not isinstance(json_content, list) or any(not is_netcdf_url(f) for f in json_content):
+            if not isinstance(json_content, list) or any(not isinstance(item, str) for item in json_content):
                 LOGGER.error("Invalid JSON: [%s]", json_content)
                 raise ValueError("Invalid JSON file format, expected a plain array of NetCDF file URL strings.")
-            LOGGER.debug("Parsing JSON file references.")
+            LOGGER.info("Parsing JSON file references from file contents:\n%s", repr_json(json_content))
             for file_url in json_content:
-                LOGGER.debug("Fetching NetCDF reference from JSON file: [%s]", file_url)
+                LOGGER.info("Validate NetCDF reference from JSON file: [%s]", file_url)
+                validate_file_reference(file_url)
+                if not is_netcdf_url(file_url):
+                    raise ValueError(f"Invalid file format for [{file_url}], expected a NetCDF file URL.")
+                LOGGER.info("Fetching NetCDF reference from JSON file: [%s]", file_url)
                 fetched_nc = fetch_file(file_url, output_dir, timeout=10, retry=3)
-                LOGGER.debug("Fetched NetCDF output location: [%s]", fetched_nc)
+                LOGGER.info("Fetched NetCDF output location: [%s]", fetched_nc)
     except Exception as exc:
         LOGGER.error("Process '%s' raised an exception: [%s]", PACKAGE_NAME, exc)
         raise
diff --git a/weaver/processes/builtin/metalink2netcdf.cwl b/weaver/processes/builtin/metalink2netcdf.cwl
index e962ff71d..e433e4c63 100644
--- a/weaver/processes/builtin/metalink2netcdf.cwl
+++ b/weaver/processes/builtin/metalink2netcdf.cwl
@@ -10,6 +10,9 @@ inputs:
    inputBinding:
      position: 1
      prefix: "-i"
+   format:
+     - iana:application/metalink+xml
+     - iana:application/metalink4+xml
  index:
    type: int
    inputBinding:
diff --git a/weaver/processes/builtin/metalink2netcdf.py b/weaver/processes/builtin/metalink2netcdf.py
index dc3994c4b..df3000ed0 100644
--- a/weaver/processes/builtin/metalink2netcdf.py
+++ b/weaver/processes/builtin/metalink2netcdf.py
@@ -16,18 +16,21 @@
 
 # place weaver specific imports after sys path fixing to ensure they are found from external call
 # pylint: disable=C0413,wrong-import-order
-from weaver import xml_util  # isort:skip # noqa: E402
+from weaver import WEAVER_ROOT_DIR, xml_util  # isort:skip # noqa: E402
+from weaver.processes.builtin.utils import validate_file_reference  # isort:skip # noqa: E402
 from weaver.utils import fetch_file  # isort:skip # noqa: E402
 
 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
+PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0]
+PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".")
 
 # setup logger since it is not run from the main 'weaver' app
-LOGGER = logging.getLogger(__name__)
+LOGGER = logging.getLogger(PACKAGE_MODULE)
 LOGGER.addHandler(logging.StreamHandler(sys.stdout))
 LOGGER.setLevel(logging.INFO)
 
 # process details
-__version__ = "1.2"
+__version__ = "1.3"
 __title__ = "Metalink to NetCDF"
 __abstract__ = __doc__  # NOTE: '__doc__' is fetched directly, this is mostly to be informative
 
@@ -38,23 +41,24 @@ def m2n(metalink_reference, index, output_dir):
         "Got arguments: metalink_reference=%s index=%s output_dir=%s", metalink_reference, index, output_dir
     )
     LOGGER.info("Process '%s' execution starting...", PACKAGE_NAME)
-    LOGGER.debug("Process '%s' output directory: [%s].", PACKAGE_NAME, output_dir)
+    LOGGER.info("Process '%s' output directory: [%s].", PACKAGE_NAME, output_dir)
     try:
         if not os.path.isdir(output_dir):
             raise ValueError(f"Output dir [{output_dir}] does not exist.")
+        validate_file_reference(metalink_reference)
         with TemporaryDirectory(prefix=f"wps_process_{PACKAGE_NAME}_") as tmp_dir:
-            LOGGER.debug("Fetching Metalink file: [%s]", metalink_reference)
+            LOGGER.info("Fetching Metalink file: [%s]", metalink_reference)
             metalink_path = fetch_file(metalink_reference, tmp_dir, timeout=10, retry=3)
-            LOGGER.debug("Reading Metalink file: [%s]", metalink_path)
+            LOGGER.info("Reading Metalink file: [%s]", metalink_path)
             xml_data = xml_util.parse(metalink_path)
-            LOGGER.debug("Parsing Metalink file references.")
+            LOGGER.info("Parsing Metalink file references.")
             nc_file_url = xml_data.xpath(f"string(//metalink/file[{index}]/metaurl)")
-            LOGGER.debug("Fetching NetCDF reference from Metalink file: [%s]", metalink_reference)
-            LOGGER.debug("NetCDF file URL : %s", nc_file_url)
+            LOGGER.info("Fetching NetCDF reference from Metalink file: [%s]", metalink_reference)
+            LOGGER.info("NetCDF file URL : %s", nc_file_url)
             fetch_file(nc_file_url, output_dir)
     except Exception as exc:
         # log only debug for tracking, re-raise and actual error wil be logged by top process monitor
-        LOGGER.debug("Process '%s' raised an exception: [%s]", PACKAGE_NAME, exc)
+        LOGGER.info("Process '%s' raised an exception: [%s]", PACKAGE_NAME, exc)
         raise
     LOGGER.info("Process '%s' execution completed.", PACKAGE_NAME)
 
diff --git a/weaver/processes/builtin/utils.py b/weaver/processes/builtin/utils.py
index b552ba094..7ec9a863f 100644
--- a/weaver/processes/builtin/utils.py
+++ b/weaver/processes/builtin/utils.py
@@ -7,8 +7,36 @@
 
 def is_netcdf_url(url):
     # type: (Any) -> bool
-    if not isinstance(url, str):
-        return False
-    if urlparse(url).scheme == "":
+    """
+    Validates that the reference is a remote NetCDF file reference.
+    """
+    try:
+        validate_file_reference(url)
+    except (TypeError, ValueError):
         return False
     return os.path.splitext(url)[-1] == get_extension(ContentType.APP_NETCDF)
+
+
+def validate_file_reference(url):
+    # type: (str) -> None
+    """
+    Ensures that the provided reference points to a valide remote file or a temporary :term:`CWL` intermediate file.
+
+    In order to avoid bypassing security validation of server file access between jobs, remote locations must be
+    enforced. However, :term:`CWL` temporary files must be allowed through for intermediate locations passed around
+    between :term:`Workflow` steps.
+    """
+    if not isinstance(url, str):
+        raise TypeError(f"Not a valid URL: [{url!s}]")
+    if url.endswith("/"):
+        raise ValueError(f"Not a valid file URL reference [{url}]. Directory not supported.")
+    cwl_files = [
+        "file:///tmp/cwltool_out_",
+        "file:///tmp/cwltool_tmp_",
+        "/tmp/cwltool_out_",
+        "/tmp/cwltool_tmp_",
+    ]
+    if any(url.startswith(path) for path in cwl_files):
+        return
+    if urlparse(url).scheme not in ["http", "https", "s3"]:
+        raise ValueError(f"Not a valid file URL reference [{url}]. Scheme not supported.")
diff --git a/weaver/utils.py b/weaver/utils.py
index 18cc4cfbe..c0911c0cd 100644
--- a/weaver/utils.py
+++ b/weaver/utils.py
@@ -1146,6 +1146,7 @@ def get_href_headers(path,                      # type: str
     """
     Obtain headers applicable for the provided file or directory reference.
 
+    :rtype: object
     :param path: File to describe. Either a local path or remote URL.
     :param download_headers:
         If enabled, add the ``Content-Disposition`` header with attachment filename for downloading the file.
@@ -2044,6 +2045,29 @@ def get_secure_directory_name(location):
     return unique_directory_name
 
 
+def get_secure_path(location):
+    # type: (str) -> str
+    """
+    Obtain a secure path location with validation of each nested component.
+    """
+    # consider path with potential scheme
+    parts = location.split("://", 1)
+    if len(parts) > 1:
+        scheme, ref = parts
+    else:
+        scheme, ref = None, parts[0]
+
+    # validate parts
+    parts = ref.split("/")
+    for i, part in enumerate(parts):
+        parts[i] = get_secure_filename(part)
+    start = "/" if ref.startswith("/") else ""
+    trail = "/" if ref.endswith("/") and ref != "/" else ""
+    secure_ref = start + "/".join(path for path in parts if path) + trail
+    secure_loc = f"{scheme}://{secure_ref}" if scheme else secure_ref
+    return secure_loc
+
+
 def download_file_http(file_reference, file_outdir, settings=None, callback=None, **request_kwargs):
     # type: (str, str, Optional[AnySettingsContainer], Optional[Callable[[str], None]], **Any) -> str
     """
@@ -2421,37 +2445,38 @@ def fetch_file(file_reference,                      # type: str
     :raises HTTPException: applicable HTTP-based exception if any occurred during the operation.
     :raises ValueError: when the reference scheme cannot be identified.
     """
-    if file_reference.startswith("file://"):
-        file_reference = file_reference[7:]
-    file_href = file_reference
-    file_name = os.path.basename(os.path.realpath(file_reference))  # resolve any different name to use the original
+    file_href = file_reference  # keep original for reporting in case of error
+    if file_href.startswith("file://"):
+        file_href = file_href[7:]
+    file_name = os.path.basename(os.path.realpath(file_href))  # resolve any different name to use the original
     file_name = get_secure_filename(file_name)
     file_path = os.path.join(file_outdir, file_name)
     LOGGER.debug("Fetching file reference: [%s] using options:\n%s", file_href, repr_json(option_kwargs))
     options, kwargs = resolve_scheme_options(**option_kwargs)
-    if os.path.isfile(file_reference):
+    if os.path.isfile(file_href):
         LOGGER.debug("Fetch file resolved as local reference.")
+        file_href = get_secure_path(file_href)
         file_path = adjust_file_local(file_href, file_outdir, out_method)
-    elif file_reference.startswith("s3://"):
+    elif file_href.startswith("s3://"):
         LOGGER.debug("Fetch file resolved as S3 bucket reference.")
         s3_params = resolve_s3_http_options(**options["http"], **kwargs)
         s3_region = options["s3"].pop("region_name", None)
-        s3_bucket, file_key, s3_region_ref = resolve_s3_reference(file_reference)
+        s3_bucket, file_key, s3_region_ref = resolve_s3_reference(file_href)
         if s3_region and s3_region_ref and s3_region != s3_region_ref:
             raise ValueError("Invalid AWS S3 reference. "
                              f"Input region name [{s3_region}] mismatches reference region [{s3_region_ref}].")
         s3_region = s3_region_ref or s3_region
         s3_client = boto3.client("s3", region_name=s3_region, **s3_params)  # type: S3Client
         s3_client.download_file(s3_bucket, file_key, file_path, Callback=callback)
-    elif file_reference.startswith("http"):
+    elif file_href.startswith("http"):
         # pseudo-http URL referring to S3 bucket, try to redirect to above S3 handling method if applicable
-        if file_reference.startswith("https://s3.") or urlparse(file_reference).hostname.endswith(".amazonaws.com"):
+        if file_href.startswith("https://s3.") or urlparse(file_href).hostname.endswith(".amazonaws.com"):
             LOGGER.debug("Detected HTTP-like S3 bucket file reference. Retrying file fetching with S3 reference.")
-            s3_ref, s3_region = resolve_s3_from_http(file_reference)
+            s3_ref, s3_region = resolve_s3_from_http(file_href)
             option_kwargs.pop("s3_region", None)
             return fetch_file(s3_ref, file_outdir, settings=settings, s3_region_name=s3_region, **option_kwargs)
         file_path = download_file_http(
-            file_reference,
+            file_href,
             file_outdir,
             settings=settings,
             callback=callback,
@@ -2459,7 +2484,7 @@ def fetch_file(file_reference,                      # type: str
             **kwargs
         )
     else:
-        scheme = file_reference.split("://")
+        scheme = file_reference.split("://", 1)
         scheme = "<none>" if len(scheme) < 2 else scheme[0]
         raise ValueError(
             f"Unresolved location and/or fetch file scheme: '{scheme!s}', "
@@ -3122,13 +3147,16 @@ def adjust_directory_local(location,                            # type: Path
         LOGGER.debug("Local directory reference [%s] matches output, but desired listing differs. "
                      "Removing additional items:\n%s", loc_dir, repr_json(extras))
         for file_path in extras:
-            os.remove(file_path)
+            file_path = get_secure_path(file_path)
+            if os.path.isfile(file_path):
+                os.remove(file_path)
         return filtered
 
     # Any operation (islink, remove, etc.) that must operate on the link itself rather than the directory it points
     # to must not have the final '/' in the path. Otherwise, the link path (without final '/') is resolved before
     # evaluating the operation, which make them attempt their call on the real directory itself.
     link_dir = location.rstrip("/")
+    link_dir = get_secure_path(link_dir)
 
     if (os.path.exists(out_dir) and not os.path.isdir(out_dir)) or (os.path.isdir(out_dir) and os.listdir(out_dir)):
         LOGGER.debug("References under [%s] cannot be placed under target path [%s] "
diff --git a/weaver/vault/utils.py b/weaver/vault/utils.py
index 8e9b9efca..29cce8663 100644
--- a/weaver/vault/utils.py
+++ b/weaver/vault/utils.py
@@ -13,7 +13,7 @@
 from weaver.datatype import VaultFile
 from weaver.formats import repr_json
 from weaver.store.base import StoreVault
-from weaver.utils import get_header, get_settings, get_weaver_url, is_uuid
+from weaver.utils import get_header, get_secure_path, get_settings, get_weaver_url, is_uuid
 from weaver.wps_restapi import swagger_definitions as sd
 
 if TYPE_CHECKING:
@@ -54,7 +54,9 @@ def get_vault_path(file, container=None):
     Get the full path of the vault file.
     """
     vault_dir = get_vault_dir(container)
-    return os.path.join(vault_dir, file.name)
+    vault_path = os.path.join(vault_dir, file.name)
+    vault_path = get_secure_path(vault_path)
+    return vault_path
 
 
 def get_vault_url(file, container=None):
@@ -269,4 +271,5 @@ def decrypt_from_vault(vault_file, path, out_dir=None, delete_encrypted=False):
         out_file.seek(0)
         if delete_encrypted:
             os.remove(path)
-    return out_file.name
+    vault_path = get_secure_path(out_file.name)
+    return vault_path
diff --git a/weaver/vault/views.py b/weaver/vault/views.py
index 1a410696b..a80b8de96 100644
--- a/weaver/vault/views.py
+++ b/weaver/vault/views.py
@@ -127,7 +127,7 @@ def describe_file(request):
         )
         headers["Content-Location"] = get_vault_url(vault_file, request)
     finally:
-        if os.path.isfile(tmp_file):
+        if tmp_file and os.path.isfile(tmp_file):
             os.remove(tmp_file)
     return HTTPHeadFileResponse(code=200, headers=headers)
 
diff --git a/weaver/wps_restapi/jobs/utils.py b/weaver/wps_restapi/jobs/utils.py
index 7accb0a33..445a7fe35 100644
--- a/weaver/wps_restapi/jobs/utils.py
+++ b/weaver/wps_restapi/jobs/utils.py
@@ -44,6 +44,7 @@
     get_href_headers,
     get_path_kvp,
     get_sane_name,
+    get_secure_path,
     get_settings,
     get_weaver_url,
     is_uuid
@@ -295,6 +296,7 @@ def make_result_link(result_id, result, job_id, settings):
             loc = os.path.join(str(job_id), f"{result_id}{suffix}.txt")
             url = f"{wps_url}/{loc}"
             path = os.path.join(out, loc)
+            path = get_secure_path(path)
             with open(path, mode="w", encoding=enc) as out_file:
                 out_file.write(val)
         else:

From b91dd8431cbe405639c949eb0318920e89c09ac0 Mon Sep 17 00:00:00 2001
From: Francis Charette Migneault <francis.charette.migneault@gmail.com>
Date: Fri, 3 Nov 2023 13:10:22 -0400
Subject: [PATCH 2/5] fix linting + update versions of modified builtin
 processes

---
 tests/functional/test_builtin.py                | 2 +-
 tests/utils.py                                  | 6 +++---
 weaver/processes/builtin/file2string_array.py   | 2 +-
 weaver/processes/builtin/file_index_selector.py | 4 ++--
 weaver/processes/builtin/jsonarray2netcdf.py    | 2 +-
 weaver/processes/builtin/metalink2netcdf.py     | 4 ++--
 6 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/functional/test_builtin.py b/tests/functional/test_builtin.py
index 54f990e2a..95b22e95d 100644
--- a/tests/functional/test_builtin.py
+++ b/tests/functional/test_builtin.py
@@ -165,7 +165,7 @@ def test_jsonarray2netcdf_execute_invalid_file_local(self):
         Validate that local file path as input is not permitted anymore.
         """
         with contextlib.ExitStack() as stack_exec:
-            body, nc_data = self.setup_inputs(stack_exec, use_temp_file=True)
+            body, _ = self.setup_inputs(stack_exec, use_temp_file=True)
             body.update({
                 "mode": ExecuteMode.ASYNC,
                 "response": ExecuteResponse.DOCUMENT,
diff --git a/tests/utils.py b/tests/utils.py
index dc081697b..31d3edcc9 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -884,13 +884,13 @@ class FileServer(SimpleHTTPTestServer):
         This server takes more time to start than usual mocks. Use it sparingly, and consider maintaining a single
          instance over multiple tests of a complete test suite rather than recreating a server for each test.
     """
-    def __init__(self):
+    def __init__(self):  # pylint: disable=W0231
         self._port = self.get_port()
         self._uri = f"http://0.0.0.0:{self._port}"
 
         # purposely call 'HTTPTestServer' instead of 'SimpleHTTPTestServer' to enforce the URI
-        # otherwise, 'socket.gethostname()' is used (machine name), and the obtained URI fail our schema validation
-        HTTPTestServer.__init__(self, hostname="0.0.0.0", port=self._port, uri=self._uri)
+        # otherwise, 'socket.gethostname()' is used (machine name), and the obtained URI fails our schema validation
+        HTTPTestServer.__init__(self, hostname="0.0.0.0", port=self._port, uri=self._uri)  # pylint: disable=W0233
         self.cwd = self.document_root
 
 
diff --git a/weaver/processes/builtin/file2string_array.py b/weaver/processes/builtin/file2string_array.py
index 5bbc7e5c4..858384843 100644
--- a/weaver/processes/builtin/file2string_array.py
+++ b/weaver/processes/builtin/file2string_array.py
@@ -19,7 +19,7 @@
 from weaver.processes.builtin.utils import validate_file_reference  # isort:skip # noqa: E402
 
 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
-PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0]
+PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0]
 PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".")
 
 # setup logger since it is not run from the main 'weaver' app
diff --git a/weaver/processes/builtin/file_index_selector.py b/weaver/processes/builtin/file_index_selector.py
index 6a8d7b2f0..37b245426 100644
--- a/weaver/processes/builtin/file_index_selector.py
+++ b/weaver/processes/builtin/file_index_selector.py
@@ -24,7 +24,7 @@
 from weaver.utils import OutputMethod, fetch_file  # isort:skip # noqa: E402
 
 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
-PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0]
+PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0]
 PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".")
 
 # setup logger since it is not run from the main 'weaver' app
@@ -33,7 +33,7 @@
 LOGGER.setLevel(logging.INFO)
 
 # process details
-__version__ = "1.2"
+__version__ = "1.3"
 __title__ = "File Index Selector"
 __abstract__ = __doc__  # NOTE: '__doc__' is fetched directly, this is mostly to be informative
 
diff --git a/weaver/processes/builtin/jsonarray2netcdf.py b/weaver/processes/builtin/jsonarray2netcdf.py
index 5a022a69c..ce9edc2f8 100644
--- a/weaver/processes/builtin/jsonarray2netcdf.py
+++ b/weaver/processes/builtin/jsonarray2netcdf.py
@@ -23,7 +23,7 @@
 from weaver.utils import fetch_file, get_secure_path  # isort:skip # noqa: E402
 
 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
-PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0]
+PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0]
 PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".")
 
 # setup logger since it is not run from the main 'weaver' app
diff --git a/weaver/processes/builtin/metalink2netcdf.py b/weaver/processes/builtin/metalink2netcdf.py
index 106ec11a0..f31c49a4f 100644
--- a/weaver/processes/builtin/metalink2netcdf.py
+++ b/weaver/processes/builtin/metalink2netcdf.py
@@ -22,7 +22,7 @@
 from weaver.processes.builtin.utils import is_netcdf_url  # isort:skip # noqa: E402
 
 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
-PACKAGE_BASE = __file__.split(WEAVER_ROOT_DIR.rstrip("/") + "/")[-1].rsplit(PACKAGE_NAME)[0]
+PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0]
 PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".")
 
 # setup logger since it is not run from the main 'weaver' app
@@ -31,7 +31,7 @@
 LOGGER.setLevel(logging.INFO)
 
 # process details
-__version__ = "1.3"
+__version__ = "1.4"
 __title__ = "Metalink to NetCDF"
 __abstract__ = __doc__  # NOTE: '__doc__' is fetched directly, this is mostly to be informative
 

From 0c4abe3b2a6fe4b6a0c84a973f605ead1ff249cf Mon Sep 17 00:00:00 2001
From: Francis Charette Migneault <francis.charette.migneault@gmail.com>
Date: Fri, 3 Nov 2023 18:00:57 -0400
Subject: [PATCH 3/5] fix broken tests due to new builtin input file path
 validation

---
 CHANGES.rst                                   |  1 +
 tests/functional/test_builtin.py              |  9 ++-
 tests/functional/test_cli.py                  |  2 +-
 tests/functional/test_workflow.py             | 57 ++++++++++++-------
 tests/utils.py                                |  8 +--
 weaver/processes/builtin/file2string_array.py |  4 +-
 .../processes/builtin/file_index_selector.py  |  4 +-
 weaver/processes/builtin/jsonarray2netcdf.py  |  6 +-
 weaver/processes/builtin/metalink2netcdf.py   |  6 +-
 weaver/processes/builtin/utils.py             | 27 ++++++---
 10 files changed, 76 insertions(+), 48 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 2180e3df3..5c0e31782 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -12,6 +12,7 @@ Changes
 
 Changes:
 --------
+- Add more secure path validations steps before fetching contents.
 - Disallow ``builtin`` processes expecting a user-provided input path to run with local file references such that
   they must respect any configured server-side remote file access rules instead of bypassing security validations
   through resolved local paths.
diff --git a/tests/functional/test_builtin.py b/tests/functional/test_builtin.py
index 95b22e95d..bbe85886a 100644
--- a/tests/functional/test_builtin.py
+++ b/tests/functional/test_builtin.py
@@ -448,7 +448,7 @@ def test_jsonarray2netcdf_process():
         [
             "/tmp/does-not-exist/fake-file.txt",  # noqa
             "/tmp/does-not-exist/fake-file.nc",  # noqa
-        ]
+        ],
     ]
 )
 def test_jsonarray2netcdf_invalid_json(test_data):
@@ -461,7 +461,12 @@ def test_jsonarray2netcdf_invalid_json(test_data):
 
         with pytest.raises(ValueError) as err:
             jsonarray2netcdf.main("-i", tmp_file.name, "-o", tmp_out_dir)
-        assert str(err.value) == "Invalid JSON file format, expected a plain array of NetCDF file URL strings."
+        valid_errors = [
+            "Invalid JSON file format, expected a plain array of NetCDF file URL strings.",
+            "Invalid file format",
+            "Not a valid file URL reference",
+        ]
+        assert any(error in str(err.value) for error in valid_errors), f"Raised error [{err.value}] was not expected"
 
 
 @pytest.mark.parametrize(
diff --git a/tests/functional/test_cli.py b/tests/functional/test_cli.py
index be888f9fa..e629f104c 100644
--- a/tests/functional/test_cli.py
+++ b/tests/functional/test_cli.py
@@ -521,7 +521,7 @@ def test_execute_manual_monitor_status_and_download_results(self):
         """
         result = self.run_execute_inputs_schema_variant("Execute_Echo_cwl_schema.yml", mock_exec=False)
         job_id = result.body["jobID"]
-        result = mocked_sub_requests(self.app, self.client.monitor, job_id, timeout=1, interval=1)
+        result = mocked_sub_requests(self.app, self.client.monitor, job_id, timeout=5, interval=1)
         assert result.success, result.text
         assert "undefined" not in result.message
         assert result.body.get("status") == Status.SUCCEEDED
diff --git a/tests/functional/test_workflow.py b/tests/functional/test_workflow.py
index 16cdfaede..f82e52a44 100644
--- a/tests/functional/test_workflow.py
+++ b/tests/functional/test_workflow.py
@@ -15,7 +15,6 @@
 
 import mock
 import pytest
-from pytest_server_fixtures.http import SimpleHTTPTestServer
 from pyramid import testing
 from pyramid.httpexceptions import HTTPConflict, HTTPCreated, HTTPNotFound, HTTPOk
 from pyramid.settings import asbool
@@ -25,6 +24,7 @@
 
 from tests.functional.utils import ResourcesUtil
 from tests.utils import (
+    FileServer,
     get_settings_from_config_ini,
     get_settings_from_testapp,
     get_test_weaver_app,
@@ -127,7 +127,7 @@ def __init__(self,
 @pytest.mark.workflow
 class WorkflowTestRunnerBase(ResourcesUtil, TestCase):
     """
-    Runs an end-2-end test procedure on weaver configured as EMS located on specified `WEAVER_TEST_SERVER_HOSTNAME`.
+    Runs an end-2-end test procedure on weaver configured as EMS located on specified ``WEAVER_TEST_SERVER_HOSTNAME``.
     """
     __settings__ = None
     test_processes_info = {}        # type: Dict[WorkflowProcesses, ProcessInfo]
@@ -166,6 +166,11 @@ class WorkflowTestRunnerBase(ResourcesUtil, TestCase):
     logger_field_indent = 2         # type: int
     log_full_trace = True           # type: bool
 
+    file_server = None              # type: FileServer
+    """
+    File server made available to tests for emulating a remote HTTP location.
+    """
+
     WEAVER_URL = None               # type: Optional[str]
     WEAVER_RESTAPI_URL = None       # type: Optional[str]
 
@@ -215,15 +220,15 @@ def setUpClass(cls):
         cls.WEAVER_TEST_JOB_GET_STATUS_INTERVAL = int(cls.get_option("WEAVER_TEST_JOB_GET_STATUS_INTERVAL", 5))
 
         # server settings
-        cls.WEAVER_TEST_CONFIGURATION = cls.get_option("WEAVER_TEST_CONFIGURATION", WeaverConfiguration.EMS)
-        cls.WEAVER_TEST_SERVER_HOSTNAME = cls.get_option("WEAVER_TEST_SERVER_HOSTNAME", "")
+        cls.WEAVER_TEST_CONFIGURATION = conf = cls.get_option("WEAVER_TEST_CONFIGURATION", WeaverConfiguration.EMS)
+        cls.WEAVER_TEST_SERVER_HOSTNAME = host = cls.get_option("WEAVER_TEST_SERVER_HOSTNAME", "")
         cls.WEAVER_TEST_SERVER_BASE_PATH = cls.get_option("WEAVER_TEST_SERVER_BASE_PATH", "/weaver")
         cls.WEAVER_TEST_SERVER_API_PATH = cls.get_option("WEAVER_TEST_SERVER_API_PATH", "/")
         cls.WEAVER_TEST_CONFIG_INI_PATH = cls.get_option("WEAVER_TEST_CONFIG_INI_PATH")    # none uses default path
-        if cls.WEAVER_TEST_SERVER_HOSTNAME in [None, ""]:
+        if host in [None, ""]:
             # running with a local-only Web Test application
             config = setup_config_with_mongodb(settings={
-                "weaver.configuration": cls.WEAVER_TEST_CONFIGURATION,
+                "weaver.configuration": conf,
                 # NOTE:
                 #   Because everything is running locally in this case, all processes should automatically map between
                 #   the two following dir/URL as equivalents locations, accordingly to what they require for execution.
@@ -239,10 +244,10 @@ def setUpClass(cls):
             os.makedirs(cls.__settings__["weaver.wps_output_dir"], exist_ok=True)
         else:
             # running on a remote service (remote server or can be "localhost", but in parallel application)
-            if cls.WEAVER_TEST_SERVER_HOSTNAME.startswith("http"):
-                url = cls.WEAVER_TEST_SERVER_HOSTNAME
+            if host.startswith("http"):
+                url = host
             else:
-                url = f"http://{cls.WEAVER_TEST_SERVER_HOSTNAME}"
+                url = f"http://{host}"
             cls.app = WebTestApp(url)
             cls.WEAVER_URL = get_weaver_url(cls.settings.fget(cls))
         cls.WEAVER_RESTAPI_URL = get_wps_restapi_base_url(cls.settings.fget(cls))
@@ -252,6 +257,9 @@ def setUpClass(cls):
         cls.setup_test_processes()
         cls.setup_test_processes_after()
 
+        cls.file_server = FileServer()
+        cls.file_server.start()
+
     @property
     def settings(self):
         # type: (...) -> SettingsType
@@ -276,6 +284,7 @@ def settings(self):
     @classmethod
     def tearDownClass(cls):
         cls.clean_test_processes()
+        cls.file_server.teardown()
         testing.tearDown()
         cls.log("%sEnd of '%s': %s\n%s",
                 cls.logger_separator_cases, cls.current_case_name(), now(), cls.logger_separator_cases)
@@ -952,7 +961,7 @@ def test_workflow_mixed_wps1_builtin_rest_docker_select_requirements(self):
 
         .. note::
             Because ``jsonarray2netcdf`` is running in subprocess instantiated by :mod:`cwltool`, file-server
-            location cannot be mocked by the test suite. Employ local test paths as if they where already fetched.
+            location cannot be mocked by the test suite. Employ local test paths as if they were already fetched.
         """
 
         with contextlib.ExitStack() as stack:
@@ -1007,10 +1016,8 @@ def test_workflow_mixed_rest_builtin_wps1_docker_select_requirements(self):
         with contextlib.ExitStack() as stack:
             tmp_host = "https://mocked-file-server.com"  # must match in 'Execute_WorkflowSelectCopyNestedOutDir.json'
             # jsonarray2netcdf cannot use local paths anymore for nested NetCDF, provide them through tmp file server
-            file_server = stack.enter_context(SimpleHTTPTestServer())
-            file_server.start()
-            nc_dir = file_server.document_root
-            nc_url = file_server.uri
+            nc_dir = self.file_server.document_root
+            nc_url = self.file_server.uri
             nc_refs = []
             for i in range(3):
                 nc_name = f"test-file-{i}.nc"
@@ -1059,7 +1066,9 @@ def test_workflow_mixed_rest_builtin_wps1_docker_scatter_requirements(self):
 
         .. note::
             Because ``jsonarray2netcdf`` is running in subprocess instantiated by :mod:`cwltool`, file-server
-            location cannot be mocked by the test suite. Employ local test paths as if they where already fetched.
+            location cannot be mocked by the test suite. With security checks, they cannot be locally defined either,
+            since the :term:`CWL` temporary directory is not known in advance for respective ``CommandLineTool`` steps.
+            Employ a *real* file-server for test paths to emulate a remote location to be fetched.
 
         .. seealso::
             Inverse :term:`WPS` / :term:`OGC API - Processes` process references from
@@ -1072,9 +1081,10 @@ def test_workflow_mixed_rest_builtin_wps1_docker_scatter_requirements(self):
             nc_refs = []
             for i in range(3):
                 nc_name = f"test-file-{i}.nc"
-                nc_path = os.path.join(tmp_dir, nc_name)
-                nc_refs.append(f"file://{nc_path}")
-                with open(os.path.join(tmp_dir, nc_name), mode="w", encoding="utf-8") as tmp_file:
+                nc_path = os.path.join(self.file_server.document_root, nc_name)
+                nc_href = os.path.join(self.file_server.uri, nc_name)
+                nc_refs.append(nc_href)
+                with open(nc_path, mode="w", encoding="utf-8") as tmp_file:
                     tmp_file.write(f"DUMMY NETCDF DATA #{i}")
             with open(os.path.join(tmp_dir, "netcdf-array.json"), mode="w", encoding="utf-8") as tmp_file:
                 json.dump(nc_refs, tmp_file)  # must match execution body
@@ -1099,7 +1109,9 @@ def test_workflow_mixed_wps1_builtin_rest_docker_scatter_requirements(self):
 
         .. note::
             Because ``jsonarray2netcdf`` is running in subprocess instantiated by :mod:`cwltool`, file-server
-            location cannot be mocked by the test suite. Employ local test paths as if they where already fetched.
+            location cannot be mocked by the test suite. With security checks, they cannot be locally defined either,
+            since the :term:`CWL` temporary directory is not known in advance for respective ``CommandLineTool`` steps.
+            Employ a *real* file-server for test paths to emulate a remote location to be fetched.
 
         .. seealso::
             Inverse :term:`WPS` / :term:`OGC API - Processes` process references from
@@ -1112,9 +1124,10 @@ def test_workflow_mixed_wps1_builtin_rest_docker_scatter_requirements(self):
             nc_refs = []
             for i in range(3):
                 nc_name = f"test-file-{i}.nc"
-                nc_path = os.path.join(tmp_dir, nc_name)
-                nc_refs.append(f"file://{nc_path}")
-                with open(os.path.join(tmp_dir, nc_name), mode="w", encoding="utf-8") as tmp_file:
+                nc_path = os.path.join(self.file_server.document_root, nc_name)
+                nc_href = os.path.join(self.file_server.uri, nc_name)
+                nc_refs.append(nc_href)
+                with open(nc_path, mode="w", encoding="utf-8") as tmp_file:
                     tmp_file.write(f"DUMMY NETCDF DATA #{i}")
             with open(os.path.join(tmp_dir, "netcdf-array.json"), mode="w", encoding="utf-8") as tmp_file:
                 json.dump(nc_refs, tmp_file)  # must match execution body
diff --git a/tests/utils.py b/tests/utils.py
index 31d3edcc9..f9eb4c7b8 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -877,18 +877,18 @@ class FileServer(SimpleHTTPTestServer):
     Contrary to :func:`mocked_file_server` where requests are captured and redirected to the corresponding files,
     this server receives *real* requests (via a socket) and returns matched files. This is particularly important
     during tests that call subprocesses independently of the main Python test process
-    (e.g.: :term:`CWL` ``CommandLineTool` for a ``Builtin`` :term:`Process`), because mocks applied in the main
-    process are not reflected in the other ones.
+    (e.g.: :term:`CWL` ``CommandLineTool` for a ``Builtin`` :term:`Process`), because mocks applied in the main test
+    process are not reflected in the other subprocesses.
 
     .. warning::
         This server takes more time to start than usual mocks. Use it sparingly, and consider maintaining a single
-         instance over multiple tests of a complete test suite rather than recreating a server for each test.
+        instance over multiple tests of a complete test suite rather than recreating a server for each test.
     """
     def __init__(self):  # pylint: disable=W0231
         self._port = self.get_port()
         self._uri = f"http://0.0.0.0:{self._port}"
 
-        # purposely call 'HTTPTestServer' instead of 'SimpleHTTPTestServer' to enforce the URI
+        # purposely call 'HTTPTestServer' instead of 'SimpleHTTPTestServer' to enforce the URI hostname
         # otherwise, 'socket.gethostname()' is used (machine name), and the obtained URI fails our schema validation
         HTTPTestServer.__init__(self, hostname="0.0.0.0", port=self._port, uri=self._uri)  # pylint: disable=W0233
         self.cwd = self.document_root
diff --git a/weaver/processes/builtin/file2string_array.py b/weaver/processes/builtin/file2string_array.py
index 858384843..e5977462c 100644
--- a/weaver/processes/builtin/file2string_array.py
+++ b/weaver/processes/builtin/file2string_array.py
@@ -16,7 +16,7 @@
 # place weaver specific imports after sys path fixing to ensure they are found from external call
 # pylint: disable=C0413,wrong-import-order
 from weaver import WEAVER_ROOT_DIR  # isort:skip # noqa: E402
-from weaver.processes.builtin.utils import validate_file_reference  # isort:skip # noqa: E402
+from weaver.processes.builtin.utils import validate_reference  # isort:skip # noqa: E402
 
 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
 PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0]
@@ -38,7 +38,7 @@
 def process(input_file, output_dir):
     # type: (str, str) -> None
     LOGGER.info("Got arguments: input_file=%s output_dir=%s", input_file, output_dir)
-    validate_file_reference(input_file)
+    validate_reference(input_file, is_file=True)
     output_data = {"output": [input_file]}
     with open(os.path.join(output_dir, OUTPUT_CWL_JSON), mode="w", encoding="utf-8") as file:
         return json.dump(output_data, file)
diff --git a/weaver/processes/builtin/file_index_selector.py b/weaver/processes/builtin/file_index_selector.py
index 37b245426..6396a0814 100644
--- a/weaver/processes/builtin/file_index_selector.py
+++ b/weaver/processes/builtin/file_index_selector.py
@@ -20,7 +20,7 @@
 # place weaver specific imports after sys path fixing to ensure they are found from external call
 # pylint: disable=C0413,wrong-import-order
 from weaver import WEAVER_ROOT_DIR  # isort:skip # noqa: E402
-from weaver.processes.builtin.utils import validate_file_reference  # isort:skip # noqa: E402
+from weaver.processes.builtin.utils import validate_reference  # isort:skip # noqa: E402
 from weaver.utils import OutputMethod, fetch_file  # isort:skip # noqa: E402
 
 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
@@ -46,7 +46,7 @@ def select(files, index, output_dir):
         if not os.path.isdir(output_dir):
             raise ValueError(f"Output dir [{output_dir}] does not exist.")
         file_path = files[index]
-        validate_file_reference(file_path)
+        validate_reference(file_path, is_file=True)
         fetch_file(file_path, output_dir, out_method=OutputMethod.COPY)
     except Exception as exc:
         # log only debug for tracking, re-raise and actual error wil be logged by top process monitor
diff --git a/weaver/processes/builtin/jsonarray2netcdf.py b/weaver/processes/builtin/jsonarray2netcdf.py
index ce9edc2f8..1d97e951b 100644
--- a/weaver/processes/builtin/jsonarray2netcdf.py
+++ b/weaver/processes/builtin/jsonarray2netcdf.py
@@ -19,7 +19,7 @@
 # pylint: disable=C0413,wrong-import-order
 from weaver import WEAVER_ROOT_DIR  # isort:skip # noqa: E402
 from weaver.formats import repr_json  # isort:skip # noqa: E402
-from weaver.processes.builtin.utils import is_netcdf_url, validate_file_reference  # isort:skip # noqa: E402
+from weaver.processes.builtin.utils import is_netcdf_url, validate_reference  # isort:skip # noqa: E402
 from weaver.utils import fetch_file, get_secure_path  # isort:skip # noqa: E402
 
 PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
@@ -48,7 +48,7 @@ def j2n(json_reference, output_dir):
             raise ValueError(f"Output directory [{output_dir}] does not exist.")
         with TemporaryDirectory(prefix=f"wps_process_{PACKAGE_NAME}_") as tmp_dir:
             LOGGER.info("Verify URL reference: [%s]", json_reference)
-            validate_file_reference(json_reference)
+            validate_reference(json_reference, is_file=True)
             LOGGER.info("Fetching JSON file: [%s]", json_reference)
             json_path = fetch_file(json_reference, tmp_dir, timeout=10, retry=3)
             json_path = get_secure_path(json_path)
@@ -61,7 +61,7 @@ def j2n(json_reference, output_dir):
             LOGGER.info("Parsing JSON file references from file contents:\n%s", repr_json(json_content))
             for file_url in json_content:
                 LOGGER.info("Validate NetCDF reference from JSON file: [%s]", file_url)
-                validate_file_reference(file_url)
+                validate_reference(file_url, is_file=True)
                 if not is_netcdf_url(file_url):
                     raise ValueError(f"Invalid file format for [{file_url}], expected a NetCDF file URL.")
                 LOGGER.info("Fetching NetCDF reference from JSON file: [%s]", file_url)
diff --git a/weaver/processes/builtin/metalink2netcdf.py b/weaver/processes/builtin/metalink2netcdf.py
index f31c49a4f..3173a7bbe 100644
--- a/weaver/processes/builtin/metalink2netcdf.py
+++ b/weaver/processes/builtin/metalink2netcdf.py
@@ -17,7 +17,7 @@
 # place weaver specific imports after sys path fixing to ensure they are found from external call
 # pylint: disable=C0413,wrong-import-order
 from weaver import WEAVER_ROOT_DIR, xml_util  # isort:skip # noqa: E402
-from weaver.processes.builtin.utils import validate_file_reference  # isort:skip # noqa: E402
+from weaver.processes.builtin.utils import validate_reference  # isort:skip # noqa: E402
 from weaver.utils import fetch_file  # isort:skip # noqa: E402
 from weaver.processes.builtin.utils import is_netcdf_url  # isort:skip # noqa: E402
 
@@ -47,7 +47,7 @@ def m2n(metalink_reference, index, output_dir):
         if not os.path.isdir(output_dir):
             raise ValueError(f"Output dir [{output_dir}] does not exist.")
         LOGGER.info("Validating Metalink file: [%s]", metalink_reference)
-        validate_file_reference(metalink_reference)
+        validate_reference(metalink_reference, is_file=True)
         with TemporaryDirectory(prefix=f"wps_process_{PACKAGE_NAME}_") as tmp_dir:
             LOGGER.info("Fetching Metalink file: [%s]", metalink_reference)
             metalink_path = fetch_file(metalink_reference, tmp_dir, timeout=10, retry=3)
@@ -67,7 +67,7 @@ def m2n(metalink_reference, index, output_dir):
             if not is_netcdf_url(nc_file_url):
                 raise ValueError(f"Resolved file URL [{nc_file_url}] is not a valid NetCDF reference.")
             LOGGER.info("Validating NetCDF reference: [%s]", nc_file_url)
-            validate_file_reference(nc_file_url)
+            validate_reference(nc_file_url, is_file=True)
             LOGGER.info("Fetching NetCDF reference [%s] from Metalink file [%s]", nc_file_url, metalink_reference)
             fetch_file(nc_file_url, output_dir)
     except Exception as exc:
diff --git a/weaver/processes/builtin/utils.py b/weaver/processes/builtin/utils.py
index 7ec9a863f..fba144a5e 100644
--- a/weaver/processes/builtin/utils.py
+++ b/weaver/processes/builtin/utils.py
@@ -1,4 +1,5 @@
 import os
+import tempfile
 from typing import Any
 from urllib.parse import urlparse
 
@@ -11,32 +12,40 @@ def is_netcdf_url(url):
     Validates that the reference is a remote NetCDF file reference.
     """
     try:
-        validate_file_reference(url)
+        validate_reference(url, is_file=True)
     except (TypeError, ValueError):
         return False
     return os.path.splitext(url)[-1] == get_extension(ContentType.APP_NETCDF)
 
 
-def validate_file_reference(url):
-    # type: (str) -> None
+def validate_reference(url, is_file):
+    # type: (str, bool) -> None
     """
-    Ensures that the provided reference points to a valide remote file or a temporary :term:`CWL` intermediate file.
+    Ensures that the provided reference points to a valid remote file or a temporary intermediate file.
 
     In order to avoid bypassing security validation of server file access between jobs, remote locations must be
     enforced. However, :term:`CWL` temporary files must be allowed through for intermediate locations passed around
-    between :term:`Workflow` steps.
+    between :term:`Workflow` steps or employed as temporary writing locations for file extraction purposes.
     """
     if not isinstance(url, str):
         raise TypeError(f"Not a valid URL: [{url!s}]")
-    if url.endswith("/"):
-        raise ValueError(f"Not a valid file URL reference [{url}]. Directory not supported.")
-    cwl_files = [
+    if (is_file and url.endswith("/")) or (not is_file and not url.endswith("/")):
+        dir_msg = "not supported" if is_file else "required"
+        raise ValueError(f"Not a valid file URL reference [{url}]. Directory path {dir_msg}.")
+    # When in a CWL step, tempdir will return the `/tmp/cwltool_tmp_...' path (since enforced by the tool).
+    # When executed in other situations, it will map to the environment variable or platform-specific tmp path.
+    # Although CWL will set TMPDIR for the current step, the source file could be coming from a previous step.
+    # Therefore, the random part of the path after 'cwltool_tmp_'/'cwltool_out_' could differ from the current ones.
+    tmp_dir = tempfile.gettempdir()
+    tmp_paths = [
+        f"file://{tmp_dir}/",
+        f"{tmp_dir}/",
         "file:///tmp/cwltool_out_",
         "file:///tmp/cwltool_tmp_",
         "/tmp/cwltool_out_",
         "/tmp/cwltool_tmp_",
     ]
-    if any(url.startswith(path) for path in cwl_files):
+    if any(url.startswith(path) for path in tmp_paths):
         return
     if urlparse(url).scheme not in ["http", "https", "s3"]:
         raise ValueError(f"Not a valid file URL reference [{url}]. Scheme not supported.")

From 31b82afe0f83dca52e1bbba30a10cee30c6929f7 Mon Sep 17 00:00:00 2001
From: Francis Charette Migneault <francis.charette.migneault@gmail.com>
Date: Fri, 3 Nov 2023 18:14:43 -0400
Subject: [PATCH 4/5] ignore purposely set temp paths

---
 weaver/processes/builtin/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/weaver/processes/builtin/utils.py b/weaver/processes/builtin/utils.py
index fba144a5e..1db2a1e6e 100644
--- a/weaver/processes/builtin/utils.py
+++ b/weaver/processes/builtin/utils.py
@@ -42,8 +42,8 @@ def validate_reference(url, is_file):
         f"{tmp_dir}/",
         "file:///tmp/cwltool_out_",
         "file:///tmp/cwltool_tmp_",
-        "/tmp/cwltool_out_",
-        "/tmp/cwltool_tmp_",
+        "/tmp/cwltool_out_",  # nosec: B108
+        "/tmp/cwltool_tmp_",  # nosec: B108
     ]
     if any(url.startswith(path) for path in tmp_paths):
         return

From b4f0fa1e46eaa1394ce7e2bb3b16134aec68df8b Mon Sep 17 00:00:00 2001
From: Francis Charette Migneault <francis.charette.migneault@gmail.com>
Date: Fri, 3 Nov 2023 18:41:41 -0400
Subject: [PATCH 5/5] fix lint and reduce builtin process duplicated logic

---
 weaver/processes/builtin/file2string_array.py  |  7 ++-----
 .../processes/builtin/file_index_selector.py   |  7 ++-----
 weaver/processes/builtin/jsonarray2netcdf.py   | 11 ++++++-----
 weaver/processes/builtin/metalink2netcdf.py    |  8 +++-----
 weaver/processes/builtin/utils.py              | 18 +++++++++++++++++-
 5 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/weaver/processes/builtin/file2string_array.py b/weaver/processes/builtin/file2string_array.py
index e5977462c..6c0b90eff 100644
--- a/weaver/processes/builtin/file2string_array.py
+++ b/weaver/processes/builtin/file2string_array.py
@@ -15,12 +15,9 @@
 
 # place weaver specific imports after sys path fixing to ensure they are found from external call
 # pylint: disable=C0413,wrong-import-order
-from weaver import WEAVER_ROOT_DIR  # isort:skip # noqa: E402
-from weaver.processes.builtin.utils import validate_reference  # isort:skip # noqa: E402
+from weaver.processes.builtin.utils import get_package_details, validate_reference  # isort:skip # noqa: E402
 
-PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
-PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0]
-PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".")
+PACKAGE_NAME, PACKAGE_BASE, PACKAGE_MODULE = get_package_details(__file__)
 
 # setup logger since it is not run from the main 'weaver' app
 LOGGER = logging.getLogger(PACKAGE_MODULE)
diff --git a/weaver/processes/builtin/file_index_selector.py b/weaver/processes/builtin/file_index_selector.py
index 6396a0814..947f3bdcf 100644
--- a/weaver/processes/builtin/file_index_selector.py
+++ b/weaver/processes/builtin/file_index_selector.py
@@ -19,13 +19,10 @@
 
 # place weaver specific imports after sys path fixing to ensure they are found from external call
 # pylint: disable=C0413,wrong-import-order
-from weaver import WEAVER_ROOT_DIR  # isort:skip # noqa: E402
-from weaver.processes.builtin.utils import validate_reference  # isort:skip # noqa: E402
+from weaver.processes.builtin.utils import get_package_details, validate_reference  # isort:skip # noqa: E402
 from weaver.utils import OutputMethod, fetch_file  # isort:skip # noqa: E402
 
-PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
-PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0]
-PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".")
+PACKAGE_NAME, PACKAGE_BASE, PACKAGE_MODULE = get_package_details(__file__)
 
 # setup logger since it is not run from the main 'weaver' app
 LOGGER = logging.getLogger(PACKAGE_MODULE)
diff --git a/weaver/processes/builtin/jsonarray2netcdf.py b/weaver/processes/builtin/jsonarray2netcdf.py
index 1d97e951b..012dbb84f 100644
--- a/weaver/processes/builtin/jsonarray2netcdf.py
+++ b/weaver/processes/builtin/jsonarray2netcdf.py
@@ -17,14 +17,15 @@
 
 # place weaver specific imports after sys path fixing to ensure they are found from external call
 # pylint: disable=C0413,wrong-import-order
-from weaver import WEAVER_ROOT_DIR  # isort:skip # noqa: E402
 from weaver.formats import repr_json  # isort:skip # noqa: E402
-from weaver.processes.builtin.utils import is_netcdf_url, validate_reference  # isort:skip # noqa: E402
+from weaver.processes.builtin.utils import (  # isort:skip # noqa: E402
+    get_package_details,
+    is_netcdf_url,
+    validate_reference
+)
 from weaver.utils import fetch_file, get_secure_path  # isort:skip # noqa: E402
 
-PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
-PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0]
-PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".")
+PACKAGE_NAME, PACKAGE_BASE, PACKAGE_MODULE = get_package_details(__file__)
 
 # setup logger since it is not run from the main 'weaver' app
 LOGGER = logging.getLogger(PACKAGE_MODULE)
diff --git a/weaver/processes/builtin/metalink2netcdf.py b/weaver/processes/builtin/metalink2netcdf.py
index 3173a7bbe..44819bdf4 100644
--- a/weaver/processes/builtin/metalink2netcdf.py
+++ b/weaver/processes/builtin/metalink2netcdf.py
@@ -16,14 +16,12 @@
 
 # place weaver specific imports after sys path fixing to ensure they are found from external call
 # pylint: disable=C0413,wrong-import-order
-from weaver import WEAVER_ROOT_DIR, xml_util  # isort:skip # noqa: E402
-from weaver.processes.builtin.utils import validate_reference  # isort:skip # noqa: E402
+from weaver import xml_util  # isort:skip # noqa: E402
+from weaver.processes.builtin.utils import get_package_details, validate_reference  # isort:skip # noqa: E402
 from weaver.utils import fetch_file  # isort:skip # noqa: E402
 from weaver.processes.builtin.utils import is_netcdf_url  # isort:skip # noqa: E402
 
-PACKAGE_NAME = os.path.split(os.path.splitext(__file__)[0])[-1]
-PACKAGE_BASE = __file__.rsplit(WEAVER_ROOT_DIR.rstrip("/") + "/", 1)[-1].rsplit(PACKAGE_NAME)[0]
-PACKAGE_MODULE = f"{PACKAGE_BASE}{PACKAGE_NAME}".replace("/", ".")
+PACKAGE_NAME, PACKAGE_BASE, PACKAGE_MODULE = get_package_details(__file__)
 
 # setup logger since it is not run from the main 'weaver' app
 LOGGER = logging.getLogger(PACKAGE_MODULE)
diff --git a/weaver/processes/builtin/utils.py b/weaver/processes/builtin/utils.py
index 1db2a1e6e..c7c52ebf4 100644
--- a/weaver/processes/builtin/utils.py
+++ b/weaver/processes/builtin/utils.py
@@ -1,10 +1,14 @@
 import os
 import tempfile
-from typing import Any
+from typing import TYPE_CHECKING
 from urllib.parse import urlparse
 
+from weaver import WEAVER_ROOT_DIR
 from weaver.formats import ContentType, get_extension
 
+if TYPE_CHECKING:
+    from typing import Any, Tuple
+
 
 def is_netcdf_url(url):
     # type: (Any) -> bool
@@ -49,3 +53,15 @@ def validate_reference(url, is_file):
         return
     if urlparse(url).scheme not in ["http", "https", "s3"]:
         raise ValueError(f"Not a valid file URL reference [{url}]. Scheme not supported.")
+
+
+def get_package_details(file):
+    # type: (os.PathLike[str]) -> Tuple[str, str, str]
+    """
+    Obtains the ``builtin`` process details from its file reference.
+    """
+    name = os.path.split(os.path.splitext(file)[0])[-1]
+    root = WEAVER_ROOT_DIR.rstrip("/")  # avoid double //
+    path = str(file).rsplit(f"{root}/", 1)[-1].rsplit(name)[0]
+    mod = f"{path}{name}".replace("/", ".")
+    return name, path, mod