From 91b87d7a3560254a289dc636fd0ed9dbdbc7eab8 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 18 Feb 2022 11:31:40 -0500 Subject: [PATCH 1/3] Cache directory fingerprint as a XORed hash of file fingerprints --- src/fscacher/cache.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/fscacher/cache.py b/src/fscacher/cache.py index 6bcf801..99e855c 100644 --- a/src/fscacher/cache.py +++ b/src/fscacher/cache.py @@ -1,7 +1,9 @@ from collections import deque, namedtuple from functools import wraps +from hashlib import md5 from inspect import Parameter, signature import logging +from operator import xor import os import os.path as op import shutil @@ -195,12 +197,19 @@ def to_tuple(self): class DirFingerprint: def __init__(self): self.last_modified = None - self.tree_fprints = {} + self.hash_ords = None def add_file(self, path, fprint: FileFingerprint): - self.tree_fprints[path] = fprint - if self.last_modified is None or self.last_modified < fprint.mtime_ns: + fprint_hash = list( + md5(ascii((str(path), fprint.to_tuple())).encode("us-ascii")).digest() + ) + if self.hash_ords is None: + self.hash_ords = fprint_hash self.last_modified = fprint.mtime_ns + else: + self.hash_ords = list(map(xor, self.hash_ords, fprint_hash)) + if self.last_modified < fprint.mtime_ns: + self.last_modified = fprint.mtime_ns def modified_in_window(self, min_dtime): if self.last_modified is None: @@ -209,4 +218,7 @@ def modified_in_window(self, min_dtime): return abs(time.time() - self.last_modified * 1e-9) < min_dtime def to_tuple(self): - return sum(sorted(self.tree_fprints.items()), ()) + if self.hash_ords is None: + return (None,) + else: + return (bytes(self.hash_ords).hex(),) From 7c1ca600e62846b9bf54fb76c6b2501fc67f750a Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 21 Feb 2022 10:05:36 -0500 Subject: [PATCH 2/3] Try to make bytes-XORing faster --- src/fscacher/cache.py | 17 ++++++++++++----- src/fscacher/tests/test_util.py | 16 ++++++++++++++++ 2 files changed, 28 insertions(+), 5 deletions(-) create mode 100644 src/fscacher/tests/test_util.py diff --git a/src/fscacher/cache.py b/src/fscacher/cache.py index 99e855c..b5e9185 100644 --- a/src/fscacher/cache.py +++ b/src/fscacher/cache.py @@ -3,10 +3,10 @@ from hashlib import md5 from inspect import Parameter, signature import logging -from operator import xor import os import os.path as op import shutil +import sys import time import appdirs import joblib @@ -200,14 +200,14 @@ def __init__(self): self.hash_ords = None def add_file(self, path, fprint: FileFingerprint): - fprint_hash = list( - md5(ascii((str(path), fprint.to_tuple())).encode("us-ascii")).digest() - ) + fprint_hash = md5( + ascii((str(path), fprint.to_tuple())).encode("us-ascii") + ).digest() if self.hash_ords is None: self.hash_ords = fprint_hash self.last_modified = fprint.mtime_ns else: - self.hash_ords = list(map(xor, self.hash_ords, fprint_hash)) + self.hash_ords = xor_bytes(self.hash_ords, fprint_hash) if self.last_modified < fprint.mtime_ns: self.last_modified = fprint.mtime_ns @@ -222,3 +222,10 @@ def to_tuple(self): return (None,) else: return (bytes(self.hash_ords).hex(),) + + +def xor_bytes(b1: bytes, b2: bytes) -> bytes: + length = max(len(b1), len(b2)) + i1 = int.from_bytes(b1, sys.byteorder) + i2 = int.from_bytes(b2, sys.byteorder) + return (i1 ^ i2).to_bytes(length, sys.byteorder) diff --git a/src/fscacher/tests/test_util.py b/src/fscacher/tests/test_util.py new file mode 100644 index 0000000..55c3693 --- /dev/null +++ b/src/fscacher/tests/test_util.py @@ -0,0 +1,16 @@ +import pytest +from ..cache import xor_bytes + + +@pytest.mark.parametrize( + "b1,b2,r", + [ + (b"\x12", b"\x34", b"\x26"), + (b"\0\x12", b"\0\x34", b"\0\x26"), + (b"\x12\0", b"\x34\0", b"\x26\0"), + (b"\x12\xAB", b"\x34", b"\x26\xAB"), + (b"\x12\xAB", b"\x34\xCD", b"\x26\x66"), + ], +) +def test_xor_bytes(b1: bytes, b2: bytes, r: bytes) -> None: + assert xor_bytes(b1, b2) == r From a5f1e866b0a633753e896139d4e0609a10d89fb6 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 21 Feb 2022 12:01:03 -0500 Subject: [PATCH 3/3] Test that file order is irrelevant for DirFingerprints --- src/fscacher/cache.py | 12 ++++++------ src/fscacher/tests/test_cache.py | 27 +++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/fscacher/cache.py b/src/fscacher/cache.py index b5e9185..9face1e 100644 --- a/src/fscacher/cache.py +++ b/src/fscacher/cache.py @@ -197,17 +197,17 @@ def to_tuple(self): class DirFingerprint: def __init__(self): self.last_modified = None - self.hash_ords = None + self.hash = None def add_file(self, path, fprint: FileFingerprint): fprint_hash = md5( ascii((str(path), fprint.to_tuple())).encode("us-ascii") ).digest() - if self.hash_ords is None: - self.hash_ords = fprint_hash + if self.hash is None: + self.hash = fprint_hash self.last_modified = fprint.mtime_ns else: - self.hash_ords = xor_bytes(self.hash_ords, fprint_hash) + self.hash = xor_bytes(self.hash, fprint_hash) if self.last_modified < fprint.mtime_ns: self.last_modified = fprint.mtime_ns @@ -218,10 +218,10 @@ def modified_in_window(self, min_dtime): return abs(time.time() - self.last_modified * 1e-9) < min_dtime def to_tuple(self): - if self.hash_ords is None: + if self.hash is None: return (None,) else: - return (bytes(self.hash_ords).hex(),) + return (self.hash.hex(),) def xor_bytes(b1: bytes, b2: bytes) -> bytes: diff --git a/src/fscacher/tests/test_cache.py b/src/fscacher/tests/test_cache.py index a7e1fb5..3f96a37 100644 --- a/src/fscacher/tests/test_cache.py +++ b/src/fscacher/tests/test_cache.py @@ -8,6 +8,7 @@ import time import pytest from .. import PersistentCache +from ..cache import DirFingerprint, FileFingerprint platform_system = platform.system().lower() on_windows = platform_system == "windows" @@ -415,3 +416,29 @@ def memoread(filepath, arg, kwarg=None): assert len(calls) == ncalls + 1 assert memoread(arg=1, filepath=path) == "content" assert len(calls) == ncalls + 1 + + +def test_dir_fingerprint_order_irrelevant(tmp_path): + start = time.time() + file1 = tmp_path / "apple.txt" + file1.write_text("Apple\n") + os.utime(file1, (start - 1, start - 1)) + file2 = tmp_path / "banana.txt" + file2.write_text("This is test text.\n") + os.utime(file2, (start - 2, start - 2)) + file3 = tmp_path / "coconut.txt" + file3.write_text("Lorem ipsum dolor sit amet, consectetur adipisicing elit\n") + os.utime(file3, (start - 3, start - 3)) + df_tuples = [] + for file_list in [ + [file1, file2, file3], + [file3, file2, file1], + [file2, file1, file3], + ]: + dprint = DirFingerprint() + for f in file_list: + fprint = FileFingerprint.from_stat(os.stat(f)) + dprint.add_file(f, fprint) + df_tuples.append(dprint.to_tuple()) + for i in range(1, len(df_tuples)): + assert df_tuples[0] == df_tuples[i]