Skip to content

Commit

Permalink
Merge pull request #71 from con/gh-68b
Browse files: browse the repository at this point in the history
Cache directory fingerprint as a XORed hash of file fingerprints
  • Loading branch information
yarikoptic authored Feb 22, 2022
2 parents 826d3cb + a5f1e86 commit 3c4b426
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 4 deletions.
27 changes: 23 additions & 4 deletions src/fscacher/cache.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from collections import deque, namedtuple
from functools import wraps
from hashlib import md5
from inspect import Parameter, signature
import logging
import os
import os.path as op
import shutil
import sys
import time
import appdirs
import joblib
Expand Down Expand Up @@ -204,12 +206,19 @@ def to_tuple(self):
class DirFingerprint:
def __init__(self):
    """Create an empty directory fingerprint with no files added yet."""
    # Largest ``mtime_ns`` among the files passed to add_file() so far;
    # stays None until the first file is added.
    self.last_modified = None
    # XOR of the per-file MD5 digests accumulated by add_file();
    # stays None until the first file is added.
    self.hash = None

def add_file(self, path, fprint: FileFingerprint):
    """Fold one file's fingerprint into the directory fingerprint.

    The file contributes the MD5 digest of the ASCII ``repr`` of
    ``(str(path), fprint.to_tuple())``.  Digests are combined with XOR,
    which is commutative and associative, so the accumulated hash does
    not depend on the order in which files are added.

    ``last_modified`` is kept at the largest ``fprint.mtime_ns`` seen.

    NOTE: because XOR is its own inverse, adding the exact same
    ``(path, fprint)`` pair twice cancels its contribution.
    """
    fprint_hash = md5(
        ascii((str(path), fprint.to_tuple())).encode("us-ascii")
    ).digest()
    if self.hash is None:
        # First file: it seeds both the hash and the modification time.
        self.hash = fprint_hash
        self.last_modified = fprint.mtime_ns
    else:
        self.hash = xor_bytes(self.hash, fprint_hash)
        if self.last_modified < fprint.mtime_ns:
            self.last_modified = fprint.mtime_ns

def modified_in_window(self, min_dtime):
if self.last_modified is None:
Expand All @@ -218,4 +227,14 @@ def modified_in_window(self, min_dtime):
return abs(time.time() - self.last_modified * 1e-9) < min_dtime

def to_tuple(self):
    """Return the fingerprint as a one-element tuple.

    The element is the hexadecimal form of the accumulated hash, or
    ``None`` when no file has been added yet.
    """
    if self.hash is None:
        return (None,)
    return (self.hash.hex(),)


def xor_bytes(b1: bytes, b2: bytes) -> bytes:
    """Return the bytewise XOR of two byte strings.

    The result is as long as the longer input.  When the lengths differ,
    the shorter input is treated as if it were padded with trailing zero
    bytes, so the extra bytes of the longer input pass through unchanged.

    A fixed ("little") byte order is used for the integer round trip
    instead of the host's native order: with the native order, inputs of
    unequal length would be aligned at opposite ends on little- and
    big-endian machines and produce different results.  For inputs of
    equal length the byte order does not affect the result.
    """
    length = max(len(b1), len(b2))
    i1 = int.from_bytes(b1, "little")
    i2 = int.from_bytes(b2, "little")
    return (i1 ^ i2).to_bytes(length, "little")
27 changes: 27 additions & 0 deletions src/fscacher/tests/test_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import time
import pytest
from .. import PersistentCache
from ..cache import DirFingerprint, FileFingerprint

platform_system = platform.system().lower()
on_windows = platform_system == "windows"
Expand Down Expand Up @@ -398,3 +399,29 @@ def memoread(filepath, arg, kwarg=None):
assert len(calls) == ncalls + 1
assert memoread(arg=1, filepath=path) == "content"
assert len(calls) == ncalls + 1


def test_dir_fingerprint_order_irrelevant(tmp_path):
    """A DirFingerprint must be the same whatever order files are added in."""
    now = time.time()
    # (file name, content, age in seconds): every file gets its own
    # back-dated mtime so the fingerprints differ in more than the path.
    specs = [
        ("apple.txt", "Apple\n", 1),
        ("banana.txt", "This is test text.\n", 2),
        (
            "coconut.txt",
            "Lorem ipsum dolor sit amet, consectetur adipisicing elit\n",
            3,
        ),
    ]
    paths = []
    for name, text, age in specs:
        target = tmp_path / name
        target.write_text(text)
        os.utime(target, (now - age, now - age))
        paths.append(target)
    apple, banana, coconut = paths

    def fingerprint_of(ordering):
        # Build a fresh directory fingerprint from the files in this order.
        acc = DirFingerprint()
        for entry in ordering:
            acc.add_file(entry, FileFingerprint.from_stat(os.stat(entry)))
        return acc.to_tuple()

    orderings = (
        [apple, banana, coconut],
        [coconut, banana, apple],
        [banana, apple, coconut],
    )
    reference, *others = [fingerprint_of(ordering) for ordering in orderings]
    for other in others:
        assert reference == other
16 changes: 16 additions & 0 deletions src/fscacher/tests/test_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pytest
from ..cache import xor_bytes


@pytest.mark.parametrize(
    ("b1", "b2", "r"),
    [
        (b"\x12", b"\x34", b"\x26"),
        (b"\0\x12", b"\0\x34", b"\0\x26"),
        (b"\x12\0", b"\x34\0", b"\x26\0"),
        (b"\x12\xAB", b"\x34", b"\x26\xAB"),
        (b"\x12\xAB", b"\x34\xCD", b"\x26\x66"),
    ],
)
def test_xor_bytes(b1: bytes, b2: bytes, r: bytes) -> None:
    """xor_bytes(b1, b2) yields r, including for inputs of unequal length."""
    result = xor_bytes(b1, b2)
    assert result == r

0 comments on commit 3c4b426

Please sign in to comment.