find_variants and without_revision ALF functions
k1o0 committed Aug 14, 2024
1 parent c7961b4 commit 7236111
Showing 7 changed files with 211 additions and 22 deletions.
14 changes: 13 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,17 @@
# Changelog
## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.8.1]
## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.9.0]
This version adds a couple of new ALF functions.

### Added

- one.alf.io.find_variants allows one to find similar datasets on disk, such as revisions
- one.alf.files.without_revision returns a file path without the revision folder

### Modified

- one.alf.files.add_uuid_string will now replace a UUID in a filename if one is already present.

## [2.8.1]

### Modified

2 changes: 1 addition & 1 deletion one/__init__.py
@@ -1,2 +1,2 @@
"""The Open Neurophysiology Environment (ONE) API."""
__version__ = '2.8.1'
__version__ = '2.9.0'
55 changes: 42 additions & 13 deletions one/alf/files.py
@@ -407,9 +407,13 @@ def add_uuid_string(file_path, uuid):
if isinstance(file_path, str):
file_path = Path(file_path)
name_parts = file_path.stem.split('.')
if uuid == name_parts[-1]:
_logger.warning(f'UUID already found in file name: {file_path.name}: IGNORE')
return file_path
if spec.is_uuid(name_parts[-1]):
*name_parts, old_uuid = name_parts
if old_uuid == uuid:
_logger.warning(f'UUID already found in file name: {file_path.name}: IGNORE')
return file_path
else:
_logger.debug('Replacing %s with %s in %s', old_uuid, uuid, file_path)
return file_path.parent.joinpath(f"{'.'.join(name_parts)}.{uuid}{file_path.suffix}")
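
A minimal usage sketch of the new replacement behaviour (illustrative paths and UUIDs; assumes ONE >= 2.9.0 is installed):

import uuid
from pathlib import Path

from one.alf.files import add_uuid_string

uuid_a, uuid_b = str(uuid.uuid4()), str(uuid.uuid4())

# No UUID in the filename yet: the UUID is inserted before the extension, as before
with_a = add_uuid_string('/data/obj.attr.npy', uuid_a)
assert with_a == Path(f'/data/obj.attr.{uuid_a}.npy')

# A different UUID already in the filename: as of 2.9.0 it is replaced
# (previously a second UUID string was simply appended)
with_b = add_uuid_string(with_a, uuid_b)
assert with_b == Path(f'/data/obj.attr.{uuid_b}.npy')

# The same UUID already present: a warning is logged and an equivalent path is returned
assert add_uuid_string(with_b, uuid_b) == with_b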


@@ -448,13 +452,13 @@ def remove_uuid_string(file_path):
return file_path


def padded_sequence(filepath):
def padded_sequence(file_path):
"""
Ensures a file path contains a zero-padded experiment sequence folder.
Parameters
----------
filepath : str, pathlib.Path, pathlib.PurePath
file_path : str, pathlib.Path, pathlib.PurePath
A session or file path to convert.
Returns
@@ -465,20 +469,45 @@ def padded_sequence(filepath):
Examples
--------
>>> filepath = '/iblrigdata/subject/2023-01-01/1/_ibl_experiment.description.yaml'
>>> padded_sequence(filepath)
>>> file_path = '/iblrigdata/subject/2023-01-01/1/_ibl_experiment.description.yaml'
>>> padded_sequence(file_path)
pathlib.Path('/iblrigdata/subject/2023-01-01/001/_ibl_experiment.description.yaml')
Supports folders and will not affect already padded paths
>>> session_path = pathlib.PurePosixPath('subject/2023-01-01/001')
>>> padded_sequence(filepath)
>>> padded_sequence(file_path)
pathlib.PurePosixPath('subject/2023-01-01/001')
"""
if isinstance(filepath, str):
filepath = Path(filepath)
if (session_path := get_session_path(filepath)) is None:
if isinstance(file_path, str):
file_path = Path(file_path)
if (session_path := get_session_path(file_path)) is None:
raise ValueError('path must include a valid ALF session path, e.g. subject/YYYY-MM-DD/N')
idx = len(filepath.parts) - len(session_path.parts)
idx = len(file_path.parts) - len(session_path.parts)
sequence = str(int(session_path.parts[-1])).zfill(3) # zero-pad if necessary
return filepath.parents[idx].joinpath(sequence, filepath.relative_to(session_path))
return file_path.parents[idx].joinpath(sequence, file_path.relative_to(session_path))


def without_revision(file_path):
    """
    Return file path without a revision folder.

    Parameters
    ----------
    file_path : str, pathlib.Path
        A valid ALF dataset path.

    Returns
    -------
    pathlib.Path
        The input file path without a revision folder.

    Examples
    --------
    >>> without_revision('/lab/Subjects/subject/2023-01-01/001/collection/#revision#/obj.attr.ext')
    Path('/lab/Subjects/subject/2023-01-01/001/collection/obj.attr.ext')
    """
    if isinstance(file_path, str):
        file_path = Path(file_path)
    *_, collection, revision = folder_parts(file_path.parent)
    return get_session_path(file_path).joinpath(*filter(None, (collection, file_path.name)))
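
A minimal usage sketch of without_revision (illustrative dataset path; assumes ONE >= 2.9.0 is installed):

from pathlib import Path

from one.alf.files import without_revision

path = '/lab/Subjects/subject/2023-01-01/001/alf/#2024-08-14#/obj.attr.npy'
stripped = without_revision(path)
assert stripped == Path('/lab/Subjects/subject/2023-01-01/001/alf/obj.attr.npy')

# Paths without a revision folder come back unchanged (as a Path);
# paths that are not valid ALF datasets raise ValueError
assert without_revision(stripped) == stripped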
90 changes: 87 additions & 3 deletions one/alf/io.py
@@ -14,6 +14,8 @@
from fnmatch import fnmatch
from pathlib import Path
from typing import Union
from functools import partial
from itertools import chain
import warnings

import numpy as np
@@ -342,7 +344,7 @@ def _ls(alfpath, object=None, **kwargs) -> (list, tuple):
An ALF object name to filter by
wildcards : bool
If true uses unix shell style pattern matching, otherwise uses regular expressions
**kwargs
kwargs
Other ALF parts to filter, including namespace, attribute, etc.
Returns
@@ -446,7 +448,7 @@ def exists(alfpath, object, attributes=None, **kwargs) -> bool:
Wanted attributes
wildcards : bool
If true uses unix shell style pattern matching, otherwise uses regular expressions
**kwargs
kwargs
Other ALF parts to filter by
Returns
@@ -496,7 +498,7 @@ def load_object(alfpath, object=None, short_keys=False, **kwargs):
and timescale.
wildcards : bool
If true uses unix shell style pattern matching, otherwise uses regular expressions.
**kwargs
kwargs
Other ALF parts to filter by.
Returns
@@ -832,3 +834,85 @@ def _match(part, pattern, split=None):
break

return alf_files, [tuple(attr.values()) for attr in attributes]


def find_variants(file_list, namespace=True, timescale=True, extra=True, extension=True):
    """
    Find variant datasets.

    Finds any datasets on disk that are considered a variant of the input datasets. At minimum, a
    dataset is uniquely defined by session path, collection, object and attribute. Therefore,
    datasets with the same name and collection in a different revision folder are considered a
    variant. If any of the keyword arguments are set to False, those parts are ignored when
    comparing datasets.

    Parameters
    ----------
    file_list : list of str, list of pathlib.Path
        A list of ALF paths to find variants of.
    namespace : bool
        If true, treat datasets with a different namespace as unique.
    timescale : bool
        If true, treat datasets with a different timescale as unique.
    extra : bool
        If true, treat datasets with different extra parts as unique.
    extension : bool
        If true, treat datasets with a different extension as unique.

    Returns
    -------
    Dict[pathlib.Path, list of pathlib.Path]
        A map of input file paths to a list of variant dataset paths.

    Raises
    ------
    ValueError
        One or more input file paths are not valid ALF datasets.

    Examples
    --------
    Find all datasets with an identical name and collection in a different revision folder

    >>> find_variants(['/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'])
    {Path('/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'): [
        Path('/sub/2020-10-01/001/alf/obj.attr.npy')
    ]}

    Find all datasets with a different namespace or revision

    >>> find_variants(['/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'], namespace=False)
    {Path('/sub/2020-10-01/001/#2020-01-01#/obj.attr.npy'): [
        Path('/sub/2020-10-01/001/#2020-01-01#/_ns_obj.attr.npy'),
        Path('/sub/2020-10-01/001/obj.attr.npy'),
    ]}
    """
    # Parse into individual ALF parts
    to_parts_dict = partial(files.full_path_parts, as_dict=True)
    uParts = map(to_parts_dict, file_list)
    # Initialize map of unique files to their duplicates
    duplicates = {}
    # Determine which parts to filter
    variables = locals()
    filters = {'namespace', 'timescale', 'extra', 'extension'}
    to_compare = ('lab', 'subject', 'date', 'number', 'collection', 'object', 'attribute',
                  *(arg for arg in filters if variables[arg]))

    def parts_match(parts, file):
        """Compare a file's unique parts to a given file."""
        other = to_parts_dict(file)
        return all(parts[k] == other[k] for k in to_compare)

    # iterate over unique files and their parts
    for f, parts in zip(map(Path, file_list), uParts):
        # first glob for files matching object.attribute (including revisions)
        pattern = f'*{parts["object"]}.{parts["attribute"]}*'
        # this works because revision will always be last folder;
        # i.e. revisions can't contain collections
        globbed = map(files.without_revision(f).parent.glob, (pattern, '#*#/' + pattern))
        globbed = chain.from_iterable(globbed)  # unite revision and non-revision globs
        # refine duplicates based on other parts (this also ensures we don't catch similar objects)
        globbed = filter(partial(parts_match, parts), globbed)
        # key = f.relative_to(one.alf.files.get_session_path(f)).as_posix()
        duplicates[f] = [x for x in globbed if x != f]  # map file to list of its duplicates
    return duplicates
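
A minimal usage sketch of find_variants against a throwaway session tree (illustrative paths; assumes ONE >= 2.9.0 is installed):

import tempfile
from pathlib import Path

from one.alf import io as alfio

# Build a minimal session containing one dataset and a revised copy of it
root = Path(tempfile.mkdtemp())
session = root / 'subject' / '2024-01-01' / '001'
original = session / 'alf' / 'obj.attr.npy'
revised = session / 'alf' / '#2024-08-14#' / 'obj.attr.npy'
for f in (original, revised):
    f.parent.mkdir(parents=True, exist_ok=True)
    f.touch()

# Each input file maps to the variant datasets found on disk (here, the revised copy)
variants = alfio.find_variants([original])
assert variants[original] == [revised]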
8 changes: 4 additions & 4 deletions one/registration.py
@@ -410,13 +410,13 @@ def prepare_files(self, file_list, versions=None):
Returns
-------
list of dicts
A dict containing a list of files for each session
A dict containing a list of files for each session.
list of dicts
A dict containg a list of versions for each session
A dict containing a list of versions for each session.
list
A list of files converted to paths
A list of files converted to paths.
bool
A boolean indicating if input was a single file
A boolean indicating if input was a single file.
"""

F = defaultdict(list) # empty map whose keys will be session paths
18 changes: 18 additions & 0 deletions one/tests/alf/test_alf_files.py
@@ -160,6 +160,12 @@ def test_add_uuid(self):
self.assertEqual(tup[1], files.add_uuid_string(tup[0], _uuid))
self.assertEqual(tup[1], files.add_uuid_string(tup[0], str(_uuid)))

_uuid2 = uuid.uuid4()
with self.assertLogs(files.__name__, level=10) as cm:
expected = Path(f'/titi/tutu.part1.part1.{_uuid2}.json')
self.assertEqual(expected, files.add_uuid_string(file_with_uuid, _uuid2))
self.assertRegex(cm.output[0], 'Replacing [a-f0-9-]+ with [a-f0-9-]+')

with self.assertRaises(ValueError):
files.add_uuid_string('/foo/bar.npy', 'fake')

@@ -225,6 +231,18 @@ def test_get_alf_path(self):
path = '/trials.intervals_bpod.npy'
self.assertEqual(files.get_alf_path(path), 'trials.intervals_bpod.npy')

    def test_without_revision(self):
        """Test for one.alf.files.without_revision function."""
        path = '/mnt/s0/Data/Subjects/ZM_1368/2019-04-19/001/alf/#2020-01-01#/obj.attr.ext'
        out = files.without_revision(path)
        expected = Path(path.replace('/#2020-01-01#', ''))
        self.assertIsInstance(out, Path)
        self.assertEqual(expected, out, 'failed to remove revision folder')
        self.assertEqual(expected, files.without_revision(out))  # should do nothing to path
        with self.assertRaises(ValueError) as cm:
            files.without_revision('foo/bar/baz.npy')
        self.assertRegex(str(cm.exception), 'Invalid ALF')


if __name__ == "__main__":
unittest.main(exit=False, verbosity=2)
46 changes: 46 additions & 0 deletions one/tests/alf/test_alf_io.py
@@ -711,5 +711,51 @@ def test_iter_datasets(self):
self.assertEqual([Path(*dset.parts[-2:])], ses_files)


class TestFindVariants(unittest.TestCase):

    def setUp(self):
        tmp = tempfile.TemporaryDirectory()
        self.tmp = Path(tmp.name)
        self.addCleanup(tmp.cleanup)

        # Create tree
        self.session_path = self.tmp / 'subject' / '2020-01-01' / '001'
        self.dsets = [
            self.session_path.joinpath('_x_foo.bar.npy'),
            self.session_path.joinpath('#2021-01-01#', 'foo.bar.npy'),
            self.session_path.joinpath(f'bar.baz.{uuid.uuid4()}.npy'),
            self.session_path.joinpath(f'bar.baz_y.{uuid.uuid4()}.npy'),
            self.session_path.joinpath('#2021-01-01#', f'bar.baz.{uuid.uuid4()}.npy'),
            self.session_path.joinpath('task_00', 'x.y.z'),
            self.session_path.joinpath('x.y.z'),
        ]
        for f in self.dsets:
            f.parent.mkdir(exist_ok=True, parents=True)
            f.touch()

    def test_unique(self):
        """Test for one.alf.io.find_variants function."""
        dupes = alfio.find_variants(self.dsets)
        self.assertCountEqual(self.dsets, dupes.keys(), 'expected keys to match input files')
        self.assertFalse(any(map(any, dupes.values())), 'expected no duplicates')

        # With extra=False should treat files with extra parts as a variant
        dupes = alfio.find_variants(self.dsets, extra=False)
        # 'bar.baz.abc.npy' is a variant of '#revision#/bar.baz.def.npy' and vice versa
        self.assertEqual(dupes[self.dsets[2]], [self.dsets[4]])
        self.assertEqual(dupes[self.dsets[4]], [self.dsets[2]])
        # Expect all other datasets to be considered unique
        others = [v for k, v in dupes.items() if k not in (self.dsets[2], self.dsets[4])]
        self.assertFalse(any(map(any, others)))

        # Treat other file parts as variants
        files = [self.dsets[0], self.dsets[2], self.dsets[-1]]
        dupes = alfio.find_variants(files, namespace=False, timescale=False, extra=False)
        expected_files = (self.dsets[1:2], self.dsets[3:5], [])  # expected variants for each file
        for key, expected in zip(files, expected_files):
            with self.subTest(key=key):
                self.assertCountEqual(dupes[self.session_path.joinpath(key)], expected)


if __name__ == '__main__':
unittest.main(exit=False, verbosity=2)
