From dea0b8e6e0492ad5c67a840d73034ea3921e7990 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Mon, 18 Dec 2023 12:58:39 +0100 Subject: [PATCH] store label & doc fields as prospective provenance TODO: fix intent list add/amend tests --- build-cwltool-docker.sh | 2 +- cwltool/cwlprov/provenance_profile.py | 74 +++++++++++++++------------ cwltool/singularity.py | 2 +- tests/test_provenance.py | 4 +- 4 files changed, 46 insertions(+), 36 deletions(-) diff --git a/build-cwltool-docker.sh b/build-cwltool-docker.sh index a70fdf4df..3f20df771 100755 --- a/build-cwltool-docker.sh +++ b/build-cwltool-docker.sh @@ -8,4 +8,4 @@ ${engine} run -t -v /var/run/docker.sock:/var/run/docker.sock \ -v /tmp:/tmp \ -v "$PWD":/tmp/cwltool \ quay.io/commonwl/cwltool_module /bin/sh -c \ - "apk add gcc bash git && pip install -r/tmp/cwltool/test-requirements.txt ; pytest -k 'not (test_bioconda or test_double_overwrite or test_env_filtering or test_biocontainers or test_disable_file_overwrite_without_ext or test_disable_file_creation_in_outdir_with_ext or test_write_write_conflict or test_directory_literal_with_real_inputs_inside or test_revsort_workflow or test_stdin_with_id_preset or test_no_compute_chcksum or test_packed_workflow_execution[tests/wf/count-lines1-wf.cwl-tests/wf/wc-job.json-False] or test_sequential_workflow or test_single_process_subwf_subwf_inline_step)' --ignore-glob '*test_udocker.py' -n 0 -v -rs --pyargs cwltool" + "apk add gcc bash git && pip install -r/tmp/cwltool/test-requirements.txt ; pytest -k 'not (test_bioconda or test_double_overwrite or test_env_filtering or test_biocontainers or test_disable_file_overwrite_without_ext or test_disable_file_creation_in_outdir_with_ext or test_write_write_conflict or test_directory_literal_with_real_inputs_inside or test_revsort_workflow or test_revsort_label_annotations or test_stdin_with_id_preset or test_no_compute_chcksum or test_packed_workflow_execution[tests/wf/count-lines1-wf.cwl-tests/wf/wc-job.json-False] or test_sequential_workflow or test_single_process_subwf_subwf_inline_step)' --ignore-glob '*test_udocker.py' -n 0 -v -rs --pyargs cwltool" diff --git a/cwltool/cwlprov/provenance_profile.py b/cwltool/cwlprov/provenance_profile.py index c8ceee232..1ed42ec28 100644 --- a/cwltool/cwlprov/provenance_profile.py +++ b/cwltool/cwlprov/provenance_profile.py @@ -51,9 +51,14 @@ ) from .writablebagfile import create_job, write_bag_file # change this later +# from schema_salad.utils import convert_to_dict + + if TYPE_CHECKING: from .ro import ResearchObject +ProvType = Dict[Union[str, Identifier], Any] + def copy_job_order(job: Union[Process, JobsType], job_order_object: CWLObjectType) -> CWLObjectType: """Create copy of job object for provenance.""" @@ -177,14 +182,14 @@ def host_provenance(document: ProvDocument) -> None: # by a user account, as cwltool is a command line tool account = self.document.agent(ACCOUNT_UUID) if self.orcid or self.full_name: - person: Dict[Union[str, Identifier], Any] = { + person: ProvType = { PROV_TYPE: PROV["Person"], "prov:type": SCHEMA["Person"], } if self.full_name: person["prov:label"] = self.full_name person["foaf:name"] = self.full_name - person["schema:name"] = self.full_name + person[SCHEMA["name"]] = self.full_name else: # TODO: Look up name from ORCID API? pass @@ -235,13 +240,13 @@ def evaluate( """Evaluate the nature of job.""" if not hasattr(process, "steps"): # record provenance of independent commandline tool executions - self.prospective_prov(job) + self.prospective_prov(job, process) customised_job = copy_job_order(job, job_order_object) self.used_artefacts(customised_job, self.workflow_run_uri) create_job(research_obj, customised_job) elif hasattr(job, "workflow"): # record provenance of workflow executions - self.prospective_prov(job) + self.prospective_prov(job, process) customised_job = copy_job_order(job, job_order_object) self.used_artefacts(customised_job, self.workflow_run_uri) # if CWLPROV['prov'].uri in job_order_object: # maybe move this to another place @@ -306,8 +311,7 @@ def _add_nested_annotations( ) -> ProvEntity: """Propagate input data annotations to provenance.""" # Change https:// into http:// first - schema2_uri = "https://schema.org/" - if schema2_uri in annotation_key: + if (schema2_uri := "https://schema.org/") in annotation_key: annotation_key = SCHEMA[annotation_key.replace(schema2_uri, "")].uri if not isinstance(annotation_value, (MutableSequence, MutableMapping)): @@ -377,9 +381,9 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st self.document.specializationOf(file_entity, entity) # Identify all schema annotations - schema_annotations = dict( - [(v, value[v]) for v in value.keys() if v.startswith("https://schema.org")] - ) + schema_annotations = { + v: value[v] for v in value.keys() if v.startswith("https://schema.org") + } # Transfer SCHEMA annotations to provenance for s in schema_annotations: @@ -509,9 +513,9 @@ def declare_directory(self, value: CWLObjectType) -> ProvEntity: coll_b.add_attributes(coll_b_attribs) # Identify all schema annotations - schema_annotations = dict( - [(v, value[v]) for v in value.keys() if v.startswith("https://schema.org")] - ) + schema_annotations = { + v: value[v] for v in value.keys() if v.startswith("https://schema.org") + } # Transfer SCHEMA annotations to provenance for s in schema_annotations: @@ -571,7 +575,7 @@ def declare_artefact(self, value: Any) -> ProvEntity: self.research_object.add_uri(entity.identifier.uri) return entity - if isinstance(value, (str, str)): + if isinstance(value, str): (entity, _) = self.declare_string(value) return entity @@ -734,35 +738,39 @@ def generate_output_prov( entity, process_run_id, timestamp, None, {"prov:role": role} ) - def prospective_prov(self, job: JobsType) -> None: + def prospective_prov(self, job: JobsType, process: Process) -> None: """Create prospective prov recording as wfdesc prov:Plan.""" + prov_items: ProvType = { + PROV_TYPE: WFDESC["Workflow"] if isinstance(job, WorkflowJob) else WFDESC["Process"], + "prov:type": PROV["Plan"], + "prov:label": "Prospective provenance", + } + if "doc" in process.tool: + prov_items[SCHEMA["description"]] = process.tool["doc"] + if "label" in process.tool: + prov_items[SCHEMA["name"]] = process.tool["label"] + # # TypeError: unhashable type: 'list' + # if "intent" in process.tool: + # prov_items[SCHEMA["featureList"]] = convert_to_dict(process.tool["intent"]) + self.document.entity("wf:main", prov_items) if not isinstance(job, WorkflowJob): - # direct command line tool execution - self.document.entity( - "wf:main", - { - PROV_TYPE: WFDESC["Process"], - "prov:type": PROV["Plan"], - "prov:label": "Prospective provenance", - }, - ) return - self.document.entity( - "wf:main", - { - PROV_TYPE: WFDESC["Workflow"], - "prov:type": PROV["Plan"], - "prov:label": "Prospective provenance", - }, - ) - for step in job.steps: stepnametemp = "wf:main/" + str(step.name)[5:] stepname = urllib.parse.quote(stepnametemp, safe=":/,#") + provstep_items: ProvType = { + PROV_TYPE: WFDESC["Process"], + "prov:type": PROV["Plan"], + } + # WorkflowStep level annotations + if "doc" in step.tool: + provstep_items[SCHEMA["description"]] = step.tool["doc"] + if "label" in step.tool: + provstep_items[SCHEMA["name"]] = step.tool["label"] provstep = self.document.entity( stepname, - {PROV_TYPE: WFDESC["Process"], "prov:type": PROV["Plan"]}, + provstep_items, ) self.document.entity( "wf:main", diff --git a/cwltool/singularity.py b/cwltool/singularity.py index 2f590a140..1277092e3 100644 --- a/cwltool/singularity.py +++ b/cwltool/singularity.py @@ -369,7 +369,7 @@ def add_writable_file_volume( if self.inplace_update: try: os.link(os.path.realpath(volume.resolved), host_outdir_tgt) - except os.error: + except OSError: shutil.copy(volume.resolved, host_outdir_tgt) else: shutil.copy(volume.resolved, host_outdir_tgt) diff --git a/tests/test_provenance.py b/tests/test_provenance.py index 5dbe27d7c..d2cb7e0db 100644 --- a/tests/test_provenance.py +++ b/tests/test_provenance.py @@ -34,9 +34,11 @@ def cwltool(tmp_path: Path, *args: Any) -> Path: + out_folder = tmp_path / "out" + out_folder.mkdir() prov_folder = tmp_path / "provenance" prov_folder.mkdir() - new_args = ["--provenance", str(prov_folder)] + new_args = ["--provenance", str(prov_folder), "--outdir", str(out_folder)] new_args.extend(args) # Run within a temporary directory to not pollute git checkout tmp_dir = tmp_path / "cwltool-run"