Skip to content

Commit

Permalink
test with/without orcid prov + fix missing user prov when running Com…
Browse files Browse the repository at this point in the history
…mandLineTool directly
  • Loading branch information
fmigneault committed Dec 12, 2024
1 parent ea2a0b9 commit 78ef52d
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 56 deletions.
2 changes: 2 additions & 0 deletions cwltool/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ def __init__(self, kwargs: Optional[dict[str, Any]] = None) -> None:
self.orcid: str = ""
self.cwl_full_name: str = ""
self.process_run_id: Optional[str] = None
self.prov_host: bool = False
self.prov_user: bool = False
self.prov_obj: Optional[ProvenanceProfile] = None
self.mpi_config: MpiConfig = MpiConfig()
self.default_stdout: Optional[Union[IO[bytes], TextIO]] = None
Expand Down
24 changes: 4 additions & 20 deletions cwltool/cwlprov/provenance_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
ORE,
PROVENANCE,
RO,
SCHEMA,
SHA1,
SHA256,
TEXT_PLAIN,
Expand Down Expand Up @@ -144,25 +143,10 @@ def generate_prov_doc(self) -> tuple[str, ProvDocument]:
# .. but we always know cwltool was launched (directly or indirectly)
# by a user account, as cwltool is a command line tool
account = self.document.agent(ACCOUNT_UUID)
if self.orcid or self.full_name:
person: dict[Union[str, Identifier], Any] = {
PROV_TYPE: PROV["Person"],
"prov:type": SCHEMA["Person"],
}
if self.full_name:
person["prov:label"] = self.full_name
person["foaf:name"] = self.full_name
person["schema:name"] = self.full_name
else:
# TODO: Look up name from ORCID API?
pass
agent = self.document.agent(self.orcid or uuid.uuid4().urn, person)
self.document.actedOnBehalfOf(account, agent)
else:
if self.host_provenance:
self.research_object.host_provenance(self.document)
if self.user_provenance:
self.research_object.user_provenance(self.document)
if self.host_provenance:
self.research_object.host_provenance(self.document)

Check warning on line 147 in cwltool/cwlprov/provenance_profile.py

View check run for this annotation

Codecov / codecov/patch

cwltool/cwlprov/provenance_profile.py#L147

Added line #L147 was not covered by tests
if self.user_provenance or self.orcid or self.full_name:
self.research_object.user_provenance(self.document)

Check warning on line 149 in cwltool/cwlprov/provenance_profile.py

View check run for this annotation

Codecov / codecov/patch

cwltool/cwlprov/provenance_profile.py#L149

Added line #L149 was not covered by tests
# The execution of cwltool
wfengine = self.document.agent(
self.engine_uuid,
Expand Down
15 changes: 9 additions & 6 deletions cwltool/cwlprov/ro.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
METADATA,
ORCID,
PROVENANCE,
SCHEMA,
SHA1,
SHA256,
SHA512,
Expand Down Expand Up @@ -184,12 +185,14 @@ def user_provenance(self, document: ProvDocument) -> None:

user = document.agent(
self.orcid or USER_UUID,
{
provM.PROV_TYPE: provM.PROV["Person"],
provM.PROV_LABEL: self.full_name,
FOAF["name"]: self.full_name,
FOAF["account"]: account,
},
[
(provM.PROV_TYPE, SCHEMA["Person"]),
(provM.PROV_TYPE, provM.PROV["Person"]),
(provM.PROV_LABEL, self.full_name),
(FOAF["name"], self.full_name),
(FOAF["account"], account),
(SCHEMA["name"], self.full_name),
],
)
# cwltool may be started on the shell (directly by user),
# by shell script (indirectly by user)
Expand Down
7 changes: 5 additions & 2 deletions cwltool/executors.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,11 @@ def run_jobs(
if not isinstance(process, Workflow) and runtime_context.research_obj is not None:
process.provenance_object = runtime_context.research_obj.initialize_provenance(
full_name=runtime_context.cwl_full_name,
host_provenance=False,
user_provenance=False,
# The following are only set from main when directly running a command line tool.
# When nested in a workflow, they should be disabled, since they would
# already have been provided/initialized by the parent workflow prov-obj.
host_provenance=runtime_context.prov_host,
user_provenance=runtime_context.prov_user,
orcid=runtime_context.orcid,
# single tool execution, so RO UUID = wf UUID = tool UUID
run_uuid=runtime_context.research_obj.ro_uuid,
Expand Down
5 changes: 5 additions & 0 deletions cwltool/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1060,6 +1060,11 @@ def main(

loadingContext = setup_loadingContext(loadingContext, runtimeContext, args)

if loadingContext.research_obj:
# forward early the provenance parameters required when running a single command line tool
runtimeContext.prov_host = loadingContext.host_provenance
runtimeContext.prov_user = loadingContext.user_provenance

uri, tool_file_uri = resolve_tool_uri(
args.workflow,
resolver=loadingContext.resolver,
Expand Down
115 changes: 87 additions & 28 deletions tests/test_provenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,22 +32,23 @@
SCHEMA = Namespace("http://schema.org/")
CWLPROV = Namespace("https://w3id.org/cwl/prov#")
OA = Namespace("http://www.w3.org/ns/oa#")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")


TEST_ORCID = "https://orcid.org/0000-0003-4862-3349"


def cwltool(tmp_path: Path, *args: Any) -> Path:
def cwltool(tmp_path: Path, *args: Any, with_orcid: bool = False) -> Path:
prov_folder = tmp_path / "provenance"
prov_folder.mkdir()
new_args = [
"--enable-user-provenance",
"--enable-host-provenance",
"--orcid",
TEST_ORCID,
"--provenance",
str(prov_folder),
]
if with_orcid:
new_args.extend(["--orcid", TEST_ORCID])
new_args.extend(args)
# Run within a temporary directory to not pollute git checkout
tmp_dir = tmp_path / "cwltool-run"
Expand All @@ -59,61 +60,81 @@ def cwltool(tmp_path: Path, *args: Any) -> Path:


@needs_docker
def test_hello_workflow(tmp_path: Path) -> None:
@pytest.mark.parametrize("with_orcid", [True, False])
def test_hello_workflow(tmp_path: Path, with_orcid: bool) -> None:
check_provenance(
cwltool(
tmp_path,
get_data("tests/wf/hello-workflow.cwl"),
"--usermessage",
"Hello workflow",
)
with_orcid=with_orcid,
),
with_orcid=with_orcid,
)


@needs_docker
def test_hello_single_tool(tmp_path: Path) -> None:
@pytest.mark.parametrize("with_orcid", [True, False])
def test_hello_single_tool(tmp_path: Path, with_orcid: bool) -> None:
check_provenance(
cwltool(
tmp_path,
get_data("tests/wf/hello_single_tool.cwl"),
"--message",
"Hello tool",
with_orcid=with_orcid,
),
single_tool=True,
with_orcid=with_orcid,
)


@needs_docker
def test_revsort_workflow(tmp_path: Path) -> None:
@pytest.mark.parametrize("with_orcid", [True, False])
def test_revsort_workflow(tmp_path: Path, with_orcid: bool) -> None:
folder = cwltool(
tmp_path,
get_data("tests/wf/revsort.cwl"),
get_data("tests/wf/revsort-job.json"),
with_orcid=with_orcid,
)
check_output_object(folder)
check_provenance(folder)
check_provenance(folder, with_orcid=with_orcid)


@needs_docker
def test_revsort_workflow_shortcut(tmp_path: Path) -> None:
@pytest.mark.parametrize("with_orcid", [True, False])
def test_revsort_workflow_shortcut(tmp_path: Path, with_orcid: bool) -> None:
"""Confirm that using 'cwl:tool' shortcut still snapshots the CWL files."""
folder = cwltool(
tmp_path,
get_data("tests/wf/revsort-job-shortcut.json"),
with_orcid=with_orcid,
)
check_output_object(folder)
check_provenance(folder)
check_provenance(folder, with_orcid=with_orcid)
assert not (folder / "snapshot" / "revsort-job-shortcut.json").exists()
assert len(list((folder / "snapshot").iterdir())) == 4


@needs_docker
def test_nested_workflow(tmp_path: Path) -> None:
check_provenance(cwltool(tmp_path, get_data("tests/wf/nested.cwl")), nested=True)
@pytest.mark.parametrize("with_orcid", [True, False])
def test_nested_workflow(tmp_path: Path, with_orcid: bool) -> None:
check_provenance(
cwltool(
tmp_path,
get_data("tests/wf/nested.cwl"),
with_orcid=with_orcid,
),
nested=True,
with_orcid=with_orcid,
)


@needs_docker
def test_secondary_files_implicit(tmp_path: Path) -> None:
@pytest.mark.parametrize("with_orcid", [True, False])
def test_secondary_files_implicit(tmp_path: Path, with_orcid: bool) -> None:
file1 = tmp_path / "foo1.txt"
file1idx = tmp_path / "foo1.txt.idx"

Expand All @@ -123,13 +144,20 @@ def test_secondary_files_implicit(tmp_path: Path) -> None:
f.write("bar")

# secondary will be picked up by .idx
folder = cwltool(tmp_path, get_data("tests/wf/sec-wf.cwl"), "--file1", str(file1))
check_provenance(folder, secondary_files=True)
folder = cwltool(
tmp_path,
get_data("tests/wf/sec-wf.cwl"),
"--file1",
str(file1),
with_orcid=with_orcid,
)
check_provenance(folder, secondary_files=True, with_orcid=with_orcid)
check_secondary_files(folder)


@needs_docker
def test_secondary_files_explicit(tmp_path: Path) -> None:
@pytest.mark.parametrize("with_orcid", [True, False])
def test_secondary_files_explicit(tmp_path: Path, with_orcid: bool) -> None:
# Deliberately do NOT have common basename or extension
file1dir = tmp_path / "foo"
file1dir.mkdir()
Expand Down Expand Up @@ -164,22 +192,33 @@ def test_secondary_files_explicit(tmp_path: Path) -> None:
j = json.dumps(job, ensure_ascii=True)
fp.write(j.encode("ascii"))

folder = cwltool(tmp_path, get_data("tests/wf/sec-wf.cwl"), str(jobJson))
check_provenance(folder, secondary_files=True)
folder = cwltool(
tmp_path,
get_data("tests/wf/sec-wf.cwl"),
str(jobJson),
with_orcid=with_orcid,
)
check_provenance(folder, secondary_files=True, with_orcid=with_orcid)
check_secondary_files(folder)


@needs_docker
def test_secondary_files_output(tmp_path: Path) -> None:
@pytest.mark.parametrize("with_orcid", [True, False])
def test_secondary_files_output(tmp_path: Path, with_orcid: bool) -> None:
# secondary will be picked up by .idx
folder = cwltool(tmp_path, get_data("tests/wf/sec-wf-out.cwl"))
check_provenance(folder, secondary_files=True)
folder = cwltool(
tmp_path,
get_data("tests/wf/sec-wf-out.cwl"),
with_orcid=with_orcid,
)
check_provenance(folder, secondary_files=True, with_orcid=with_orcid)
# Skipped, not the same secondary files as above
# self.check_secondary_files()


@needs_docker
def test_directory_workflow(tmp_path: Path) -> None:
@pytest.mark.parametrize("with_orcid", [True, False])
def test_directory_workflow(tmp_path: Path, with_orcid: bool) -> None:
dir2 = tmp_path / "dir2"
dir2.mkdir()
sha1 = {
Expand All @@ -195,8 +234,14 @@ def test_directory_workflow(tmp_path: Path) -> None:
with open(dir2 / x, "w", encoding="ascii") as f:
f.write(x)

folder = cwltool(tmp_path, get_data("tests/wf/directory.cwl"), "--dir", str(dir2))
check_provenance(folder, directory=True)
folder = cwltool(
tmp_path,
get_data("tests/wf/directory.cwl"),
"--dir",
str(dir2),
with_orcid=with_orcid,
)
check_provenance(folder, directory=True, with_orcid=with_orcid)

# Output should include ls stdout of filenames a b c on each line
file_list = (
Expand All @@ -219,10 +264,12 @@ def test_directory_workflow(tmp_path: Path) -> None:


@needs_docker
def test_no_data_files(tmp_path: Path) -> None:
@pytest.mark.parametrize("with_orcid", [True, False])
def test_no_data_files(tmp_path: Path, with_orcid: bool) -> None:
folder = cwltool(
tmp_path,
get_data("tests/wf/conditional_step_no_inputs.cwl"),
with_orcid=with_orcid,
)
check_bagit(folder)

Expand Down Expand Up @@ -273,6 +320,7 @@ def check_provenance(
single_tool: bool = False,
directory: bool = False,
secondary_files: bool = False,
with_orcid: bool = False,
) -> None:
check_folders(base_path)
check_bagit(base_path)
Expand All @@ -283,6 +331,7 @@ def check_provenance(
single_tool=single_tool,
directory=directory,
secondary_files=secondary_files,
with_orcid=with_orcid,
)


Expand Down Expand Up @@ -473,6 +522,7 @@ def check_prov(
single_tool: bool = False,
directory: bool = False,
secondary_files: bool = False,
with_orcid: bool = False,
) -> None:
prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt"
assert prov_file.is_file(), f"Can't find {prov_file}"
Expand Down Expand Up @@ -512,10 +562,20 @@ def check_prov(
) in g, "Engine not declared as SoftwareAgent"

# the run should be associated with the user
accounts = set(g.subjects(RDF.type, FOAF.OnlineAccount))
assert len(accounts) == 1
account = accounts.pop()
people = set(g.subjects(RDF.type, SCHEMA.Person))
assert len(people) == 1, "Can't find associated person in workflow run"
person = people.pop()
assert person == URIRef(TEST_ORCID)
if with_orcid:
assert person == URIRef(TEST_ORCID)
else:
account_names = set(g.objects(account, FOAF.accountName))
assert len(account_names) == 1
account_name = account_names.pop()
machine_user = provenance._whoami()[0]
assert account_name.value == machine_user

# find the random UUID assigned to cwltool
tool_agents = set(g.subjects(RDF.type, PROV.SoftwareAgent))
Expand All @@ -528,9 +588,8 @@ def check_prov(
agents.remove(engine) # the main tool
remain_agents = agents - tool_agents
assert len(remain_agents) == 1
cwltool_agent = remain_agents.pop()
assert (
cwltool_agent,
account,
PROV.actedOnBehalfOf,
person,
) in g, "Association of cwltool agent acting for user is missing"
Expand Down

0 comments on commit 78ef52d

Please sign in to comment.