From dea0b8e6e0492ad5c67a840d73034ea3921e7990 Mon Sep 17 00:00:00 2001
From: "Michael R. Crusoe" <michael.crusoe@gmail.com>
Date: Mon, 18 Dec 2023 12:58:39 +0100
Subject: [PATCH] store label & doc fields as prospective provenance

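TODO:

- Fix storing the `intent` list as prospective provenance; it is currently
  commented out because it fails with "TypeError: unhashable type: 'list'".
- Add/amend tests.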
---
 build-cwltool-docker.sh               |  2 +-
 cwltool/cwlprov/provenance_profile.py | 74 +++++++++++++++------------
 cwltool/singularity.py                |  2 +-
 tests/test_provenance.py              |  4 +-
 4 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/build-cwltool-docker.sh b/build-cwltool-docker.sh
index a70fdf4df..3f20df771 100755
--- a/build-cwltool-docker.sh
+++ b/build-cwltool-docker.sh
@@ -8,4 +8,4 @@ ${engine} run -t -v /var/run/docker.sock:/var/run/docker.sock \
 	-v /tmp:/tmp \
 	-v "$PWD":/tmp/cwltool \
 	quay.io/commonwl/cwltool_module /bin/sh -c \
-	"apk add gcc bash git && pip install -r/tmp/cwltool/test-requirements.txt ; pytest -k 'not (test_bioconda or test_double_overwrite or test_env_filtering or test_biocontainers or test_disable_file_overwrite_without_ext or test_disable_file_creation_in_outdir_with_ext or test_write_write_conflict or test_directory_literal_with_real_inputs_inside or test_revsort_workflow or test_stdin_with_id_preset or test_no_compute_chcksum or test_packed_workflow_execution[tests/wf/count-lines1-wf.cwl-tests/wf/wc-job.json-False] or test_sequential_workflow or test_single_process_subwf_subwf_inline_step)' --ignore-glob '*test_udocker.py' -n 0 -v -rs --pyargs cwltool"
+	"apk add gcc bash git && pip install -r/tmp/cwltool/test-requirements.txt ; pytest -k 'not (test_bioconda or test_double_overwrite or test_env_filtering or test_biocontainers or test_disable_file_overwrite_without_ext or test_disable_file_creation_in_outdir_with_ext or test_write_write_conflict or test_directory_literal_with_real_inputs_inside or test_revsort_workflow or test_revsort_label_annotations or test_stdin_with_id_preset or test_no_compute_chcksum or test_packed_workflow_execution[tests/wf/count-lines1-wf.cwl-tests/wf/wc-job.json-False] or test_sequential_workflow or test_single_process_subwf_subwf_inline_step)' --ignore-glob '*test_udocker.py' -n 0 -v -rs --pyargs cwltool"
diff --git a/cwltool/cwlprov/provenance_profile.py b/cwltool/cwlprov/provenance_profile.py
index c8ceee232..1ed42ec28 100644
--- a/cwltool/cwlprov/provenance_profile.py
+++ b/cwltool/cwlprov/provenance_profile.py
@@ -51,9 +51,14 @@
 )
 from .writablebagfile import create_job, write_bag_file  # change this later
 
+# from schema_salad.utils import convert_to_dict
+
+
 if TYPE_CHECKING:
     from .ro import ResearchObject
 
+ProvType = Dict[Union[str, Identifier], Any]
+
 
 def copy_job_order(job: Union[Process, JobsType], job_order_object: CWLObjectType) -> CWLObjectType:
     """Create copy of job object for provenance."""
@@ -177,14 +182,14 @@ def host_provenance(document: ProvDocument) -> None:
         # by a user account, as cwltool is a command line tool
         account = self.document.agent(ACCOUNT_UUID)
         if self.orcid or self.full_name:
-            person: Dict[Union[str, Identifier], Any] = {
+            person: ProvType = {
                 PROV_TYPE: PROV["Person"],
                 "prov:type": SCHEMA["Person"],
             }
             if self.full_name:
                 person["prov:label"] = self.full_name
                 person["foaf:name"] = self.full_name
-                person["schema:name"] = self.full_name
+                person[SCHEMA["name"]] = self.full_name
             else:
                 # TODO: Look up name from ORCID API?
                 pass
@@ -235,13 +240,13 @@ def evaluate(
         """Evaluate the nature of job."""
         if not hasattr(process, "steps"):
             # record provenance of independent commandline tool executions
-            self.prospective_prov(job)
+            self.prospective_prov(job, process)
             customised_job = copy_job_order(job, job_order_object)
             self.used_artefacts(customised_job, self.workflow_run_uri)
             create_job(research_obj, customised_job)
         elif hasattr(job, "workflow"):
             # record provenance of workflow executions
-            self.prospective_prov(job)
+            self.prospective_prov(job, process)
             customised_job = copy_job_order(job, job_order_object)
             self.used_artefacts(customised_job, self.workflow_run_uri)
             # if CWLPROV['prov'].uri in job_order_object: # maybe move this to another place
@@ -306,8 +311,7 @@ def _add_nested_annotations(
     ) -> ProvEntity:
         """Propagate input data annotations to provenance."""
         # Change https:// into http:// first
-        schema2_uri = "https://schema.org/"
-        if schema2_uri in annotation_key:
+        if (schema2_uri := "https://schema.org/") in annotation_key:
             annotation_key = SCHEMA[annotation_key.replace(schema2_uri, "")].uri
 
         if not isinstance(annotation_value, (MutableSequence, MutableMapping)):
@@ -377,9 +381,9 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st
         self.document.specializationOf(file_entity, entity)
 
         # Identify all schema annotations
-        schema_annotations = dict(
-            [(v, value[v]) for v in value.keys() if v.startswith("https://schema.org")]
-        )
+        schema_annotations = {
+            v: value[v] for v in value.keys() if v.startswith("https://schema.org")
+        }
 
         # Transfer SCHEMA annotations to provenance
         for s in schema_annotations:
@@ -509,9 +513,9 @@ def declare_directory(self, value: CWLObjectType) -> ProvEntity:
         coll_b.add_attributes(coll_b_attribs)
 
         # Identify all schema annotations
-        schema_annotations = dict(
-            [(v, value[v]) for v in value.keys() if v.startswith("https://schema.org")]
-        )
+        schema_annotations = {
+            v: value[v] for v in value.keys() if v.startswith("https://schema.org")
+        }
 
         # Transfer SCHEMA annotations to provenance
         for s in schema_annotations:
@@ -571,7 +575,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
             self.research_object.add_uri(entity.identifier.uri)
             return entity
 
-        if isinstance(value, (str, str)):
+        if isinstance(value, str):
             (entity, _) = self.declare_string(value)
             return entity
 
@@ -734,35 +738,39 @@ def generate_output_prov(
                     entity, process_run_id, timestamp, None, {"prov:role": role}
                 )
 
-    def prospective_prov(self, job: JobsType) -> None:
+    def prospective_prov(self, job: JobsType, process: Process) -> None:
         """Create prospective prov recording as wfdesc prov:Plan."""
+        prov_items: ProvType = {
+            PROV_TYPE: WFDESC["Workflow"] if isinstance(job, WorkflowJob) else WFDESC["Process"],
+            "prov:type": PROV["Plan"],
+            "prov:label": "Prospective provenance",
+        }
+        if "doc" in process.tool:
+            prov_items[SCHEMA["description"]] = process.tool["doc"]
+        if "label" in process.tool:
+            prov_items[SCHEMA["name"]] = process.tool["label"]
+        # # TypeError: unhashable type: 'list'
+        # if "intent" in process.tool:
+        #     prov_items[SCHEMA["featureList"]] = convert_to_dict(process.tool["intent"])
+        self.document.entity("wf:main", prov_items)
         if not isinstance(job, WorkflowJob):
-            # direct command line tool execution
-            self.document.entity(
-                "wf:main",
-                {
-                    PROV_TYPE: WFDESC["Process"],
-                    "prov:type": PROV["Plan"],
-                    "prov:label": "Prospective provenance",
-                },
-            )
             return
 
-        self.document.entity(
-            "wf:main",
-            {
-                PROV_TYPE: WFDESC["Workflow"],
-                "prov:type": PROV["Plan"],
-                "prov:label": "Prospective provenance",
-            },
-        )
-
         for step in job.steps:
             stepnametemp = "wf:main/" + str(step.name)[5:]
             stepname = urllib.parse.quote(stepnametemp, safe=":/,#")
+            provstep_items: ProvType = {
+                PROV_TYPE: WFDESC["Process"],
+                "prov:type": PROV["Plan"],
+            }
+            # WorkflowStep level annotations
+            if "doc" in step.tool:
+                provstep_items[SCHEMA["description"]] = step.tool["doc"]
+            if "label" in step.tool:
+                provstep_items[SCHEMA["name"]] = step.tool["label"]
             provstep = self.document.entity(
                 stepname,
-                {PROV_TYPE: WFDESC["Process"], "prov:type": PROV["Plan"]},
+                provstep_items,
             )
             self.document.entity(
                 "wf:main",
diff --git a/cwltool/singularity.py b/cwltool/singularity.py
index 2f590a140..1277092e3 100644
--- a/cwltool/singularity.py
+++ b/cwltool/singularity.py
@@ -369,7 +369,7 @@ def add_writable_file_volume(
             if self.inplace_update:
                 try:
                     os.link(os.path.realpath(volume.resolved), host_outdir_tgt)
-                except os.error:
+                except OSError:
                     shutil.copy(volume.resolved, host_outdir_tgt)
             else:
                 shutil.copy(volume.resolved, host_outdir_tgt)
diff --git a/tests/test_provenance.py b/tests/test_provenance.py
index 5dbe27d7c..d2cb7e0db 100644
--- a/tests/test_provenance.py
+++ b/tests/test_provenance.py
@@ -34,9 +34,11 @@
 
 
 def cwltool(tmp_path: Path, *args: Any) -> Path:
+    out_folder = tmp_path / "out"
+    out_folder.mkdir()
     prov_folder = tmp_path / "provenance"
     prov_folder.mkdir()
-    new_args = ["--provenance", str(prov_folder)]
+    new_args = ["--provenance", str(prov_folder), "--outdir", str(out_folder)]
     new_args.extend(args)
     # Run within a temporary directory to not pollute git checkout
     tmp_dir = tmp_path / "cwltool-run"