find_variants and without_revision ALF functions
k1o0 committed Aug 14, 2024
1 parent c7961b4 commit 7236111
Showing 7 changed files with 211 additions and 22 deletions.
14 changes: 13 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,17 @@
# Changelog
## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.8.1]
## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.9.0]
This version adds a couple of new ALF functions.

### Added

- one.alf.io.find_variants allows one to find similar datasets on disk, such as revisions
- one.alf.files.without_revision returns a file path without the revision folder

### Modified

- one.alf.files.add_uuid_string will now replace a UUID in a filename if one is already present.

## [2.8.1]

### Modified

2 changes: 1 addition & 1 deletion one/__init__.py
@@ -1,2 +1,2 @@
"""The Open Neurophysiology Environment (ONE) API."""
__version__ = '2.8.1'
__version__ = '2.9.0'
55 changes: 42 additions & 13 deletions one/alf/files.py
@@ -407,9 +407,13 @@ def add_uuid_string(file_path, uuid):
if isinstance(file_path, str):
file_path = Path(file_path)
name_parts = file_path.stem.split('.')
if uuid == name_parts[-1]:
_logger.warning(f'UUID already found in file name: {file_path.name}: IGNORE')
return file_path
if spec.is_uuid(name_parts[-1]):
*name_parts, old_uuid = name_parts
if old_uuid == uuid:
_logger.warning(f'UUID already found in file name: {file_path.name}: IGNORE')
return file_path
else:
_logger.debug('Replacing %s with %s in %s', old_uuid, uuid, file_path)
return file_path.parent.joinpath(f"{'.'.join(name_parts)}.{uuid}{file_path.suffix}")
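
A minimal usage sketch of the new replacement behaviour (illustrative paths and UUIDs; assumes ONE >= 2.9.0 is installed):

import uuid
from pathlib import Path

from one.alf.files import add_uuid_string

uuid_a, uuid_b = str(uuid.uuid4()), str(uuid.uuid4())

# No UUID in the filename yet: the UUID is inserted before the extension, as before
with_a = add_uuid_string('/data/obj.attr.npy', uuid_a)
assert with_a == Path(f'/data/obj.attr.{uuid_a}.npy')

# A different UUID already in the filename: as of 2.9.0 it is replaced
# (previously a second UUID string was simply appended)
with_b = add_uuid_string(with_a, uuid_b)
assert with_b == Path(f'/data/obj.attr.{uuid_b}.npy')

# The same UUID already present: a warning is logged and an equivalent path is returned
assert add_uuid_string(with_b, uuid_b) == with_b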


@@ -448,13 +452,13 @@ def remove_uuid_string(file_path):
return file_path


def padded_sequence(filepath):
def padded_sequence(file_path):
"""
Ensures a file path contains a zero-padded experiment sequence folder.
Parameters
----------
filepath : str, pathlib.Path, pathlib.PurePath
file_path : str, pathlib.Path, pathlib.PurePath
A session or file path to convert.
Returns
@@ -465,20 +469,45 @@ def padded_sequence(filepath):
Examples
--------
>>> filepath = '/iblrigdata/subject/2023-01-01/1/_ibl_experiment.description.yaml'
>>> padded_sequence(filepath)
>>> file_path = '/iblrigdata/subject/2023-01-01/1/_ibl_experiment.description.yaml'
>>> padded_sequence(file_path)
pathlib.Path('/iblrigdata/subject/2023-01-01/001/_ibl_experiment.description.yaml')
Supports folders and will not affect already padded paths
>>> session_path = pathlib.PurePosixPath('subject/2023-01-01/001')
>>> padded_sequence(filepath)
>>> padded_sequence(file_path)
pathlib.PurePosixPath('subject/2023-01-01/001')
"""
if isinstance(filepath, str):
filepath = Path(filepath)
if (session_path := get_session_path(filepath)) is None:
if isinstance(file_path, str):
file_path = Path(file_path)
if (session_path := get_session_path(file_path)) is None:
raise ValueError('path must include a valid ALF session path, e.g. subject/YYYY-MM-DD/N')
idx = len(filepath.parts) - len(session_path.parts)
idx = len(file_path.parts) - len(session_path.parts)
sequence = str(int(session_path.parts[-1])).zfill(3) # zero-pad if necessary
return filepath.parents[idx].joinpath(sequence, filepath.relative_to(session_path))
return file_path.parents[idx].joinpath(sequence, file_path.relative_to(session_path))


def without_revision(file_path):
    """
    Return file path without a revision folder.

    Parameters
    ----------
    file_path : str, pathlib.Path
        A valid ALF dataset path.

    Returns
    -------
    pathlib.Path
        The input file path without a revision folder.

    Examples
    --------
    >>> without_revision('/lab/Subjects/subject/2023-01-01/001/collection/#revision#/obj.attr.ext')
    Path('/lab/Subjects/subject/2023-01-01/001/collection/obj.attr.ext')
    """
    if isinstance(file_path, str):
        file_path = Path(file_path)
    *_, collection, revision = folder_parts(file_path.parent)
    return get_session_path(file_path).joinpath(*filter(None, (collection, file_path.name)))
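
A minimal usage sketch of without_revision (illustrative dataset path; assumes ONE >= 2.9.0 is installed):

from pathlib import Path

from one.alf.files import without_revision

path = '/lab/Subjects/subject/2023-01-01/001/alf/#2024-08-14#/obj.attr.npy'
stripped = without_revision(path)
assert stripped == Path('/lab/Subjects/subject/2023-01-01/001/alf/obj.attr.npy')

# Paths without a revision folder come back unchanged (as a Path);
# paths that are not valid ALF datasets raise ValueError
assert without_revision(stripped) == stripped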
90 changes: 87 additions & 3 deletions one/alf/io.py
@@ -14,6 +14,8 @@
from fnmatch import fnmatch
from pathlib import Path
from typing import Union
from functools import partial
from itertools import chain
import warnings

import numpy as np
@@ -342,7 +344,7 @@ def _ls(alfpath, object=None, **kwargs) -> (list, tuple):
An ALF object name to filter by
wildcards : bool
If true uses unix shell style pattern matching, otherwise uses regular expressions
**kwargs
kwargs
Other ALF parts to filter, including namespace, attribute, etc.
Returns
@@ -446,7 +448,7 @@ def exists(alfpath, object, attributes=None, **kwargs) -> bool:
Wanted attributes
wildcards : bool
If true uses unix shell style pattern matching, otherwise uses regular expressions
**kwargs
kwargs
Other ALF parts to filter by
Returns
@@ -496,7 +498,7 @@ def load_object(alfpath, object=None, short_keys=False, **kwargs):
and timescale.
wildcards : bool
If true uses unix shell style pattern matching, otherwise uses regular expressions.
**kwargs
kwargs
Other ALF parts to filter by.
Returns
@@ -832,3 +834,85 @@ def _match(part, pattern, split=None):
break

return alf_files, [tuple(attr.values()) for attr in attributes]


def find_variants(file_list, namespace=True, timescale=True, extra=True, extension=True):
    """
    Find variant datasets.

    Finds any datasets on disk that are considered a variant of the input datasets. At minimum, a
    dataset is uniquely defined by session path, collection, object and attribute. Therefore,
    datasets with the same name and collection in a different revision folder are considered a
    variant. If any of the keyword arguments are set to False, those parts are ignored when
    comparing datasets.

    Parameters
    ----------
    file_list : list of str, list of pathlib.Path
        A list of ALF paths to find variants of.
    namespace : bool
        If true, treat datasets with a different namespace as unique.
    timescale : bool
        If true, treat datasets with a different timescale as unique.
    extra : bool
        If true, treat datasets with different extra parts as unique.
    extension : bool
        If true, treat datasets with a different extension as unique.

    Returns
    -------
    Dict[pathlib.Path, list of pathlib.Path]
        A map of input file paths to a list of variant dataset paths.

    Raises
    ------
    ValueError
        One or more input file paths are not valid ALF datasets.

    Examples
    --------
    Find all datasets with an identical name and collection in a different revision folder

    >>> find_variants(['/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'])
    {Path('/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'): [
        Path('/sub/2020-10-01/001/alf/obj.attr.npy')
    ]}

    Find all datasets with a different namespace or revision

    >>> find_variants(['/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'], namespace=False)
    {Path('/sub/2020-10-01/001/#2020-01-01#/obj.attr.npy'): [
        Path('/sub/2020-10-01/001/#2020-01-01#/_ns_obj.attr.npy'),
        Path('/sub/2020-10-01/001/obj.attr.npy'),
    ]}
    """
    # Parse into individual ALF parts
    to_parts_dict = partial(files.full_path_parts, as_dict=True)
    uParts = map(to_parts_dict, file_list)
    # Initialize map of unique files to their duplicates
    duplicates = {}
    # Determine which parts to filter
    variables = locals()
    filters = {'namespace', 'timescale', 'extra', 'extension'}
    to_compare = ('lab', 'subject', 'date', 'number', 'collection', 'object', 'attribute',
                  *(arg for arg in filters if variables[arg]))

    def parts_match(parts, file):
        """Compare a file's unique parts to a given file."""
        other = to_parts_dict(file)
        return all(parts[k] == other[k] for k in to_compare)

    # iterate over unique files and their parts
    for f, parts in zip(map(Path, file_list), uParts):
        # first glob for files matching object.attribute (including revisions)
        pattern = f'*{parts["object"]}.{parts["attribute"]}*'
        # this works because revision will always be last folder;
        # i.e. revisions can't contain collections
        globbed = map(files.without_revision(f).parent.glob, (pattern, '#*#/' + pattern))
        globbed = chain.from_iterable(globbed)  # unite revision and non-revision globs
        # refine duplicates based on other parts (this also ensures we don't catch similar objects)
        globbed = filter(partial(parts_match, parts), globbed)
        # key = f.relative_to(one.alf.files.get_session_path(f)).as_posix()
        duplicates[f] = [x for x in globbed if x != f]  # map file to list of its duplicates
    return duplicates
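
A minimal usage sketch of find_variants against a throwaway session tree (illustrative paths; assumes ONE >= 2.9.0 is installed):

import tempfile
from pathlib import Path

from one.alf import io as alfio

# Build a minimal session containing one dataset and a revised copy of it
root = Path(tempfile.mkdtemp())
session = root / 'subject' / '2024-01-01' / '001'
original = session / 'alf' / 'obj.attr.npy'
revised = session / 'alf' / '#2024-08-14#' / 'obj.attr.npy'
for f in (original, revised):
    f.parent.mkdir(parents=True, exist_ok=True)
    f.touch()

# Each input file maps to the variant datasets found on disk (here, the revised copy)
variants = alfio.find_variants([original])
assert variants[original] == [revised]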
8 changes: 4 additions & 4 deletions one/registration.py
@@ -410,13 +410,13 @@ def prepare_files(self, file_list, versions=None):
Returns
-------
list of dicts
A dict containing a list of files for each session
A dict containing a list of files for each session.
list of dicts
A dict containg a list of versions for each session
A dict containing a list of versions for each session.
list
A list of files converted to paths
A list of files converted to paths.
bool
A boolean indicating if input was a single file
A boolean indicating if input was a single file.
"""

F = defaultdict(list) # empty map whose keys will be session paths
18 changes: 18 additions & 0 deletions one/tests/alf/test_alf_files.py
@@ -160,6 +160,12 @@ def test_add_uuid(self):
self.assertEqual(tup[1], files.add_uuid_string(tup[0], _uuid))
self.assertEqual(tup[1], files.add_uuid_string(tup[0], str(_uuid)))

_uuid2 = uuid.uuid4()
with self.assertLogs(files.__name__, level=10) as cm:
expected = Path(f'/titi/tutu.part1.part1.{_uuid2}.json')
self.assertEqual(expected, files.add_uuid_string(file_with_uuid, _uuid2))
self.assertRegex(cm.output[0], 'Replacing [a-f0-9-]+ with [a-f0-9-]+')

with self.assertRaises(ValueError):
files.add_uuid_string('/foo/bar.npy', 'fake')

@@ -225,6 +231,18 @@ def test_get_alf_path(self):
path = '/trials.intervals_bpod.npy'
self.assertEqual(files.get_alf_path(path), 'trials.intervals_bpod.npy')

    def test_without_revision(self):
        """Test for one.alf.files.without_revision function."""
        path = '/mnt/s0/Data/Subjects/ZM_1368/2019-04-19/001/alf/#2020-01-01#/obj.attr.ext'
        out = files.without_revision(path)
        expected = Path(path.replace('/#2020-01-01#', ''))
        self.assertIsInstance(out, Path)
        self.assertEqual(expected, out, 'failed to remove revision folder')
        self.assertEqual(expected, files.without_revision(out))  # should do nothing to path
        with self.assertRaises(ValueError) as cm:
            files.without_revision('foo/bar/baz.npy')
        self.assertRegex(str(cm.exception), 'Invalid ALF')


if __name__ == "__main__":
unittest.main(exit=False, verbosity=2)
46 changes: 46 additions & 0 deletions one/tests/alf/test_alf_io.py
@@ -711,5 +711,51 @@ def test_iter_datasets(self):
self.assertEqual([Path(*dset.parts[-2:])], ses_files)


class TestFindVariants(unittest.TestCase):

    def setUp(self):
        tmp = tempfile.TemporaryDirectory()
        self.tmp = Path(tmp.name)
        self.addCleanup(tmp.cleanup)

        # Create tree
        self.session_path = self.tmp / 'subject' / '2020-01-01' / '001'
        self.dsets = [
            self.session_path.joinpath('_x_foo.bar.npy'),
            self.session_path.joinpath('#2021-01-01#', 'foo.bar.npy'),
            self.session_path.joinpath(f'bar.baz.{uuid.uuid4()}.npy'),
            self.session_path.joinpath(f'bar.baz_y.{uuid.uuid4()}.npy'),
            self.session_path.joinpath('#2021-01-01#', f'bar.baz.{uuid.uuid4()}.npy'),
            self.session_path.joinpath('task_00', 'x.y.z'),
            self.session_path.joinpath('x.y.z'),
        ]
        for f in self.dsets:
            f.parent.mkdir(exist_ok=True, parents=True)
            f.touch()

    def test_unique(self):
        """Test for one.alf.io.find_variants function."""
        dupes = alfio.find_variants(self.dsets)
        self.assertCountEqual(self.dsets, dupes.keys(), 'expected keys to match input files')
        self.assertFalse(any(map(any, dupes.values())), 'expected no duplicates')

        # With extra=False should treat files with extra parts as a variant
        dupes = alfio.find_variants(self.dsets, extra=False)
        # 'bar.baz.abc.npy' is a variant of '#revision#/bar.baz.def.npy' and vice versa
        self.assertEqual(dupes[self.dsets[2]], [self.dsets[4]])
        self.assertEqual(dupes[self.dsets[4]], [self.dsets[2]])
        # Expect all other datasets to be considered unique
        others = [v for k, v in dupes.items() if k not in (self.dsets[2], self.dsets[4])]
        self.assertFalse(any(map(any, others)))

        # Treat other file parts as variants
        files = [self.dsets[0], self.dsets[2], self.dsets[-1]]
        dupes = alfio.find_variants(files, namespace=False, timescale=False, extra=False)
        expected_files = (self.dsets[1:2], self.dsets[3:5], [])  # expected variants for each file
        for key, expected in zip(files, expected_files):
            with self.subTest(key=key):
                self.assertCountEqual(dupes[self.session_path.joinpath(key)], expected)


if __name__ == '__main__':
unittest.main(exit=False, verbosity=2)
