Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Caching of file-set hashes by local path and mtimes #700

Merged
merged 36 commits into from
Mar 17, 2024
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
45117ef
added code to handle "locally-persistent-ids"
tclose Feb 24, 2024
2b7ca50
implemented persistent hash cache to avoid rehashing files
tclose Feb 24, 2024
04b95ff
touched up persistent_hash_cache test
tclose Feb 24, 2024
0c865f4
replaced Cache({}) with Cache() to match new proper class
tclose Feb 24, 2024
3b3fdb7
upped resolution of mtime to nanoseconds
tclose Feb 24, 2024
81a5108
added sleep to various tests to ensure file mtimes are different
tclose Feb 24, 2024
0c4b179
added more sleeps to ensure mtimes of input files are different in tests
tclose Feb 24, 2024
615d590
debugged setting hash cache via env var and added clean up of directory
tclose Feb 24, 2024
55b660e
mock mtime writing instead of adding sleeps to ensure mtimes are diff…
tclose Feb 24, 2024
5d51736
undid overzealous black
tclose Feb 24, 2024
0421f85
added missing import
tclose Feb 24, 2024
a864b32
Adds platformdirs dependency and use it to store the hash cache within
tclose Feb 24, 2024
05ca695
added unittests to hit exceptions in persistentcache init
tclose Feb 24, 2024
52ef03f
added mkdir to location converter
tclose Feb 24, 2024
0216236
debugged mkdir of persistent cache
tclose Feb 24, 2024
bad261b
bug fixes in persistentcache location init
tclose Feb 24, 2024
2fbee2b
Revert "mock mtime writing instead of adding sleeps to ensure mtimes …
tclose Feb 24, 2024
91948f0
skip lock files in directory clean up
tclose Feb 24, 2024
e058408
fixed clean-up bug
tclose Feb 24, 2024
f1ded7a
added mock import
tclose Feb 24, 2024
bb11067
added another sleep to trigger atime change
tclose Feb 24, 2024
a031ea5
implementing @effigies suggestions
tclose Feb 29, 2024
f2f70a6
added comments and doc strings to explain the use of the persistent c…
tclose Feb 29, 2024
191aa9c
touched up comments
tclose Feb 29, 2024
3076fea
another comment touch up
tclose Feb 29, 2024
a094fbc
touch up comments again
tclose Feb 29, 2024
d27201f
Merge branch 'master' into local-cache-ids
tclose Mar 8, 2024
291f29f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 8, 2024
0a10f6c
added in @djarecka's test for moving file cache locations
tclose Mar 8, 2024
311e3dd
updated cache initialisation
tclose Mar 8, 2024
4827365
switched to use blake2b isntead of blake2s
tclose Mar 8, 2024
b6799b6
[skip ci] deleted already commented-out code
tclose Mar 8, 2024
2bb86fe
additional doc strings for hash cache objects
tclose Mar 8, 2024
1f601e1
added test to see that persistent cache is used in the running of tasks
tclose Mar 16, 2024
7e60c41
moved persistent hash cache within "hash_cache" subdirectory of the p…
tclose Mar 17, 2024
921979c
fixed import issue
tclose Mar 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pydra/engine/specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def _compute_hashes(self) -> ty.Tuple[bytes, ty.Dict[str, bytes]]:
if "container_path" in field.metadata:
continue
inp_dict[field.name] = getattr(self, field.name)
hash_cache = Cache({})
hash_cache = Cache()
field_hashes = {
k: hash_function(v, cache=hash_cache) for k, v in inp_dict.items()
}
Expand Down
2 changes: 2 additions & 0 deletions pydra/engine/submitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .workers import Worker, WORKERS
from .core import is_workflow
from .helpers import get_open_loop, load_and_run_async
from ..utils.hash import PersistentCache

import logging

Expand Down Expand Up @@ -54,6 +55,7 @@ def __call__(self, runnable, cache_locations=None, rerun=False, environment=None
self.loop.run_until_complete(
self.submit_from_call(runnable, rerun, environment)
)
PersistentCache().clean_up()
return runnable.result()

async def submit_from_call(self, runnable, rerun, environment):
Expand Down
103 changes: 103 additions & 0 deletions pydra/engine/tests/test_node_task.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
import os
import shutil
import attr
import typing as ty
import numpy as np
import time
from unittest import mock
from pathlib import Path
import pytest
import time
from fileformats.generic import File
import pydra.mark

from .utils import (
fun_addtwo,
Expand Down Expand Up @@ -306,6 +313,7 @@ def test_task_init_7(tmp_path):
output_dir1 = nn1.output_dir

# changing the content of the file
time.sleep(2) # need the mtime to be different
file2 = tmp_path / "file2.txt"
with open(file2, "w") as f:
f.write("from pydra")
Expand Down Expand Up @@ -1560,3 +1568,98 @@ def test_task_state_cachelocations_updated(plugin, tmp_path):
# both workflows should be run
assert all([dir.exists() for dir in nn.output_dir])
assert all([dir.exists() for dir in nn2.output_dir])


def test_task_files_cachelocations(plugin_dask_opt, tmp_path):
"""
Two identical tasks with provided cache_dir that use file as an input;
the second task has cache_locations and should not recompute the results
"""
cache_dir = tmp_path / "test_task_nostate"
cache_dir.mkdir()
cache_dir2 = tmp_path / "test_task_nostate2"
cache_dir2.mkdir()
input_dir = tmp_path / "input"
input_dir.mkdir()

input1 = input_dir / "input1.txt"
input1.write_text("test")
input2 = input_dir / "input2.txt"
input2.write_text("test")

nn = fun_file(name="NA", filename=input1, cache_dir=cache_dir)
with Submitter(plugin=plugin_dask_opt) as sub:
sub(nn)

nn2 = fun_file(
name="NA", filename=input2, cache_dir=cache_dir2, cache_locations=cache_dir
)
with Submitter(plugin=plugin_dask_opt) as sub:
sub(nn2)

# checking the results
results2 = nn2.result()
assert results2.output.out == "test"

# checking if the second task didn't run the interface again
assert nn.output_dir.exists()
assert not nn2.output_dir.exists()


class OverriddenContentsFile(File):
"""A class for testing purposes, to that enables you to override the contents
of the file to allow you to check whether the persistent cache is used."""

def __init__(
self,
fspaths: ty.Iterator[Path],
contents: ty.Optional[bytes] = None,
metadata: ty.Dict[str, ty.Any] = None,
):
super().__init__(fspaths, metadata=metadata)
self._contents = contents

def byte_chunks(self, **kwargs) -> ty.Generator[ty.Tuple[str, bytes], None, None]:
if self._contents is not None:
yield (str(self.fspath), iter([self._contents]))
else:
yield from super().byte_chunks(**kwargs)

@property
def contents(self):
if self._contents is not None:
return self._contents
return super().contents


def test_task_files_persistentcache(tmp_path):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tclose - where are you setting the persistent cach path? i.e. PYDRA_HASH_CACHE

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At @ghisvail's suggestion, the hash cache is stored in a system-dependent user cache directory using platformdirs.user_cache_dir by default (e.g. /Users/<username>/Library/Caches/pydra/<version-number> on MacOS).

I have just tweaked the code so that it is now put in a hashes subdirectory of that user cache dir (accessible in the pydra.utils.user_cache_dir variable) just in case any other cache data needs to be stored at some point in the future.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, got it! Sorry I missed that. I've just realized that I made a typo when I was setting PYDRA_HASH_CACHE and that's why it was not saving the hashes there, and was confused where this is being saved..

"""
Two identical tasks with provided cache_dir that use file as an input;
the second task has cache_locations and should not recompute the results
"""
test_file_path = tmp_path / "test_file.txt"
test_file_path.write_bytes(b"foo")
cache_dir = tmp_path / "cache-dir"
cache_dir.mkdir()
test_file = OverriddenContentsFile(test_file_path)

@pydra.mark.task
def read_contents(x: OverriddenContentsFile) -> bytes:
return x.contents

assert (
read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out
== b"foo"
)
test_file._contents = b"bar"
# should return result from the first run using the persistent cache
assert (
read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out
== b"foo"
)
time.sleep(2) # Windows has a 2-second resolution for mtime
test_file_path.touch() # update the mtime to invalidate the persistent cache value
assert (
read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out
== b"bar"
) # returns the overridden value
6 changes: 6 additions & 0 deletions pydra/engine/tests/test_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import attrs
from copy import deepcopy
import time

from ..specs import (
BaseSpec,
Expand Down Expand Up @@ -163,6 +164,7 @@ def test_input_file_hash_2(tmp_path):
assert hash1 == hash2

# checking if different content (the same name) affects the hash
time.sleep(2) # ensure mtime is different
file_diffcontent = tmp_path / "in_file_1.txt"
with open(file_diffcontent, "w") as f:
f.write("hi")
Expand Down Expand Up @@ -193,6 +195,7 @@ def test_input_file_hash_2a(tmp_path):
assert hash1 == hash2

# checking if different content (the same name) affects the hash
time.sleep(2) # ensure mtime is different
file_diffcontent = tmp_path / "in_file_1.txt"
with open(file_diffcontent, "w") as f:
f.write("hi")
Expand Down Expand Up @@ -234,6 +237,7 @@ def test_input_file_hash_3(tmp_path):
# assert id(files_hash1["in_file"][filename]) == id(files_hash2["in_file"][filename])

# recreating the file
time.sleep(2) # ensure mtime is different
with open(file, "w") as f:
f.write("hello")

Expand Down Expand Up @@ -288,6 +292,7 @@ def test_input_file_hash_4(tmp_path):
assert hash1 == hash2

# checking if different content (the same name) affects the hash
time.sleep(2) # need the mtime to be different
file_diffcontent = tmp_path / "in_file_1.txt"
with open(file_diffcontent, "w") as f:
f.write("hi")
Expand Down Expand Up @@ -324,6 +329,7 @@ def test_input_file_hash_5(tmp_path):
assert hash1 == hash2

# checking if different content (the same name) affects the hash
time.sleep(2) # ensure mtime is different
file_diffcontent = tmp_path / "in_file_1.txt"
with open(file_diffcontent, "w") as f:
f.write("hi")
Expand Down
Loading
Loading