diff --git a/cwltool/context.py b/cwltool/context.py index 237a90968..bb281fd88 100644 --- a/cwltool/context.py +++ b/cwltool/context.py @@ -183,6 +183,8 @@ def __init__(self, kwargs: Optional[dict[str, Any]] = None) -> None: self.orcid: str = "" self.cwl_full_name: str = "" self.process_run_id: Optional[str] = None + self.prov_host: bool = False + self.prov_user: bool = False self.prov_obj: Optional[ProvenanceProfile] = None self.mpi_config: MpiConfig = MpiConfig() self.default_stdout: Optional[Union[IO[bytes], TextIO]] = None diff --git a/cwltool/cwlprov/provenance_profile.py b/cwltool/cwlprov/provenance_profile.py index 5a0f12f94..e8538e51b 100644 --- a/cwltool/cwlprov/provenance_profile.py +++ b/cwltool/cwlprov/provenance_profile.py @@ -27,7 +27,6 @@ ORE, PROVENANCE, RO, - SCHEMA, SHA1, SHA256, TEXT_PLAIN, @@ -144,25 +143,10 @@ def generate_prov_doc(self) -> tuple[str, ProvDocument]: # .. but we always know cwltool was launched (directly or indirectly) # by a user account, as cwltool is a command line tool account = self.document.agent(ACCOUNT_UUID) - if self.orcid or self.full_name: - person: dict[Union[str, Identifier], Any] = { - PROV_TYPE: PROV["Person"], - "prov:type": SCHEMA["Person"], - } - if self.full_name: - person["prov:label"] = self.full_name - person["foaf:name"] = self.full_name - person["schema:name"] = self.full_name - else: - # TODO: Look up name from ORCID API? - pass - agent = self.document.agent(self.orcid or uuid.uuid4().urn, person) - self.document.actedOnBehalfOf(account, agent) - else: - if self.host_provenance: - self.research_object.host_provenance(self.document) - if self.user_provenance: - self.research_object.user_provenance(self.document) + if self.host_provenance: + self.research_object.host_provenance(self.document) + if self.user_provenance or self.orcid or self.full_name: + self.research_object.user_provenance(self.document) # The execution of cwltool wfengine = self.document.agent( self.engine_uuid, diff --git a/cwltool/cwlprov/ro.py b/cwltool/cwlprov/ro.py index 39a06a473..f58919a6b 100644 --- a/cwltool/cwlprov/ro.py +++ b/cwltool/cwlprov/ro.py @@ -37,6 +37,7 @@ METADATA, ORCID, PROVENANCE, + SCHEMA, SHA1, SHA256, SHA512, @@ -184,12 +185,14 @@ def user_provenance(self, document: ProvDocument) -> None: user = document.agent( self.orcid or USER_UUID, - { - provM.PROV_TYPE: provM.PROV["Person"], - provM.PROV_LABEL: self.full_name, - FOAF["name"]: self.full_name, - FOAF["account"]: account, - }, + [ + (provM.PROV_TYPE, SCHEMA["Person"]), + (provM.PROV_TYPE, provM.PROV["Person"]), + (provM.PROV_LABEL, self.full_name), + (FOAF["name"], self.full_name), + (FOAF["account"], account), + (SCHEMA["name"], self.full_name), + ], ) # cwltool may be started on the shell (directly by user), # by shell script (indirectly by user) diff --git a/cwltool/executors.py b/cwltool/executors.py index eef5e857b..33198d854 100644 --- a/cwltool/executors.py +++ b/cwltool/executors.py @@ -195,8 +195,11 @@ def run_jobs( if not isinstance(process, Workflow) and runtime_context.research_obj is not None: process.provenance_object = runtime_context.research_obj.initialize_provenance( full_name=runtime_context.cwl_full_name, - host_provenance=False, - user_provenance=False, + # following are only set from main when directly command line tool + # when nested in a workflow, they should be disabled since they would + # already have been provided/initialized by the parent workflow prov-obj + host_provenance=runtime_context.prov_host, + user_provenance=runtime_context.prov_user, orcid=runtime_context.orcid, # single tool execution, so RO UUID = wf UUID = tool UUID run_uuid=runtime_context.research_obj.ro_uuid, diff --git a/cwltool/main.py b/cwltool/main.py index 17ccb11ce..2649d0a77 100755 --- a/cwltool/main.py +++ b/cwltool/main.py @@ -1060,6 +1060,11 @@ def main( loadingContext = setup_loadingContext(loadingContext, runtimeContext, args) + if loadingContext.research_obj: + # early forward parameters required for a single command line tool + runtimeContext.prov_host = loadingContext.host_provenance + runtimeContext.prov_user = loadingContext.user_provenance + uri, tool_file_uri = resolve_tool_uri( args.workflow, resolver=loadingContext.resolver, diff --git a/tests/test_provenance.py b/tests/test_provenance.py index 94ed62603..e89660b43 100644 --- a/tests/test_provenance.py +++ b/tests/test_provenance.py @@ -32,22 +32,23 @@ SCHEMA = Namespace("http://schema.org/") CWLPROV = Namespace("https://w3id.org/cwl/prov#") OA = Namespace("http://www.w3.org/ns/oa#") +FOAF = Namespace("http://xmlns.com/foaf/0.1/") TEST_ORCID = "https://orcid.org/0000-0003-4862-3349" -def cwltool(tmp_path: Path, *args: Any) -> Path: +def cwltool(tmp_path: Path, *args: Any, with_orcid: bool = False) -> Path: prov_folder = tmp_path / "provenance" prov_folder.mkdir() new_args = [ "--enable-user-provenance", "--enable-host-provenance", - "--orcid", - TEST_ORCID, "--provenance", str(prov_folder), ] + if with_orcid: + new_args.extend(["--orcid", TEST_ORCID]) new_args.extend(args) # Run within a temporary directory to not pollute git checkout tmp_dir = tmp_path / "cwltool-run" @@ -59,61 +60,81 @@ def cwltool(tmp_path: Path, *args: Any) -> Path: @needs_docker -def test_hello_workflow(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_hello_workflow(tmp_path: Path, with_orcid: bool) -> None: check_provenance( cwltool( tmp_path, get_data("tests/wf/hello-workflow.cwl"), "--usermessage", "Hello workflow", - ) + with_orcid=with_orcid, + ), + with_orcid=with_orcid, ) @needs_docker -def test_hello_single_tool(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_hello_single_tool(tmp_path: Path, with_orcid: bool) -> None: check_provenance( cwltool( tmp_path, get_data("tests/wf/hello_single_tool.cwl"), "--message", "Hello tool", + with_orcid=with_orcid, ), single_tool=True, + with_orcid=with_orcid, ) @needs_docker -def test_revsort_workflow(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_revsort_workflow(tmp_path: Path, with_orcid: bool) -> None: folder = cwltool( tmp_path, get_data("tests/wf/revsort.cwl"), get_data("tests/wf/revsort-job.json"), + with_orcid=with_orcid, ) check_output_object(folder) - check_provenance(folder) + check_provenance(folder, with_orcid=with_orcid) @needs_docker -def test_revsort_workflow_shortcut(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_revsort_workflow_shortcut(tmp_path: Path, with_orcid: bool) -> None: """Confirm that using 'cwl:tool' shortcut still snapshots the CWL files.""" folder = cwltool( tmp_path, get_data("tests/wf/revsort-job-shortcut.json"), + with_orcid=with_orcid, ) check_output_object(folder) - check_provenance(folder) + check_provenance(folder, with_orcid=with_orcid) assert not (folder / "snapshot" / "revsort-job-shortcut.json").exists() assert len(list((folder / "snapshot").iterdir())) == 4 @needs_docker -def test_nested_workflow(tmp_path: Path) -> None: - check_provenance(cwltool(tmp_path, get_data("tests/wf/nested.cwl")), nested=True) +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_nested_workflow(tmp_path: Path, with_orcid: bool) -> None: + check_provenance( + cwltool( + tmp_path, + get_data("tests/wf/nested.cwl"), + with_orcid=with_orcid, + ), + nested=True, + with_orcid=with_orcid, + ) @needs_docker -def test_secondary_files_implicit(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_secondary_files_implicit(tmp_path: Path, with_orcid: bool) -> None: file1 = tmp_path / "foo1.txt" file1idx = tmp_path / "foo1.txt.idx" @@ -123,13 +144,20 @@ def test_secondary_files_implicit(tmp_path: Path) -> None: f.write("bar") # secondary will be picked up by .idx - folder = cwltool(tmp_path, get_data("tests/wf/sec-wf.cwl"), "--file1", str(file1)) - check_provenance(folder, secondary_files=True) + folder = cwltool( + tmp_path, + get_data("tests/wf/sec-wf.cwl"), + "--file1", + str(file1), + with_orcid=with_orcid, + ) + check_provenance(folder, secondary_files=True, with_orcid=with_orcid) check_secondary_files(folder) @needs_docker -def test_secondary_files_explicit(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_secondary_files_explicit(tmp_path: Path, with_orcid: bool) -> None: # Deliberately do NOT have common basename or extension file1dir = tmp_path / "foo" file1dir.mkdir() @@ -164,22 +192,33 @@ def test_secondary_files_explicit(tmp_path: Path) -> None: j = json.dumps(job, ensure_ascii=True) fp.write(j.encode("ascii")) - folder = cwltool(tmp_path, get_data("tests/wf/sec-wf.cwl"), str(jobJson)) - check_provenance(folder, secondary_files=True) + folder = cwltool( + tmp_path, + get_data("tests/wf/sec-wf.cwl"), + str(jobJson), + with_orcid=with_orcid, + ) + check_provenance(folder, secondary_files=True, with_orcid=with_orcid) check_secondary_files(folder) @needs_docker -def test_secondary_files_output(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_secondary_files_output(tmp_path: Path, with_orcid: bool) -> None: # secondary will be picked up by .idx - folder = cwltool(tmp_path, get_data("tests/wf/sec-wf-out.cwl")) - check_provenance(folder, secondary_files=True) + folder = cwltool( + tmp_path, + get_data("tests/wf/sec-wf-out.cwl"), + with_orcid=with_orcid, + ) + check_provenance(folder, secondary_files=True, with_orcid=with_orcid) # Skipped, not the same secondary files as above # self.check_secondary_files() @needs_docker -def test_directory_workflow(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_directory_workflow(tmp_path: Path, with_orcid: bool) -> None: dir2 = tmp_path / "dir2" dir2.mkdir() sha1 = { @@ -195,8 +234,14 @@ def test_directory_workflow(tmp_path: Path) -> None: with open(dir2 / x, "w", encoding="ascii") as f: f.write(x) - folder = cwltool(tmp_path, get_data("tests/wf/directory.cwl"), "--dir", str(dir2)) - check_provenance(folder, directory=True) + folder = cwltool( + tmp_path, + get_data("tests/wf/directory.cwl"), + "--dir", + str(dir2), + with_orcid=with_orcid, + ) + check_provenance(folder, directory=True, with_orcid=with_orcid) # Output should include ls stdout of filenames a b c on each line file_list = ( @@ -219,10 +264,12 @@ def test_directory_workflow(tmp_path: Path) -> None: @needs_docker -def test_no_data_files(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_no_data_files(tmp_path: Path, with_orcid: bool) -> None: folder = cwltool( tmp_path, get_data("tests/wf/conditional_step_no_inputs.cwl"), + with_orcid=with_orcid, ) check_bagit(folder) @@ -273,6 +320,7 @@ def check_provenance( single_tool: bool = False, directory: bool = False, secondary_files: bool = False, + with_orcid: bool = False, ) -> None: check_folders(base_path) check_bagit(base_path) @@ -283,6 +331,7 @@ def check_provenance( single_tool=single_tool, directory=directory, secondary_files=secondary_files, + with_orcid=with_orcid, ) @@ -473,6 +522,7 @@ def check_prov( single_tool: bool = False, directory: bool = False, secondary_files: bool = False, + with_orcid: bool = False, ) -> None: prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt" assert prov_file.is_file(), f"Can't find {prov_file}" @@ -512,10 +562,20 @@ def check_prov( ) in g, "Engine not declared as SoftwareAgent" # run should be associated to the user + accounts = set(g.subjects(RDF.type, FOAF.OnlineAccount)) + assert len(accounts) == 1 + account = accounts.pop() people = set(g.subjects(RDF.type, SCHEMA.Person)) assert len(people) == 1, "Can't find associated person in workflow run" person = people.pop() - assert person == URIRef(TEST_ORCID) + if with_orcid: + assert person == URIRef(TEST_ORCID) + else: + account_names = set(g.objects(account, FOAF.accountName)) + assert len(account_names) == 1 + account_name = account_names.pop() + machine_user = provenance._whoami()[0] + assert account_name.value == machine_user # find the random UUID assigned to cwltool tool_agents = set(g.subjects(RDF.type, PROV.SoftwareAgent)) @@ -528,9 +588,8 @@ def check_prov( agents.remove(engine) # the main tool remain_agents = agents - tool_agents assert len(remain_agents) == 1 - cwltool_agent = remain_agents.pop() assert ( - cwltool_agent, + account, PROV.actedOnBehalfOf, person, ) in g, "Association of cwltool agent acting for user is missing"