From 357b5f079db4d60dec2333d536da244c51b0f0b2 Mon Sep 17 00:00:00 2001
From: Miles Wells
Date: Wed, 25 Sep 2024 15:07:05 +0300
Subject: [PATCH] Resolves #131 and #130

---
 CHANGELOG.md          |  16 ++++-
 one/__init__.py       |   2 +-
 one/alf/exceptions.py |   7 ++-
 one/tests/test_one.py |  54 +++++++++++-----
 one/util.py           | 143 +++++++++++++++++++++++++++++++-----------
 5 files changed, 166 insertions(+), 56 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 228b9cbc..09a095ad 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,17 @@
 # Changelog
-## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.9.1]
+## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.10.0]
+This version improves behaviour of loading revisions and loading datasets from list_datasets output.
+
+### Modified
+
+- sub-collections no longer captured when filtering with filename that starts with wildcard in wildcard mode
+- bugfix of spurious error raised when loading dataset with a revision provided
+
+### Added
+
+- one.alf.exceptions.ALFWarning category allows users to filter warnings relating to mixed revisions
+
+## [2.9.1]
 
 ### Modified
 
@@ -7,7 +19,7 @@
 - HOTFIX: Ensure http data server URL does not end in slash
 - HOTFIX: Handle public aggregate dataset relative paths
 - HOTFIX: No longer warns in silent mode when no param conflicts present
-- Explicit kwargs in load_* methods to avoid user confusion (e.g. no 'namespace' kwarg for `load_dataset`)
+- explicit kwargs in load_* methods to avoid user confusion (e.g. no 'namespace' kwarg for `load_dataset`)
 
 ## [2.9.0]
 This version adds a couple of new ALF functions.
diff --git a/one/__init__.py b/one/__init__.py
index fb65cac4..07a5d384 100644
--- a/one/__init__.py
+++ b/one/__init__.py
@@ -1,2 +1,2 @@
 """The Open Neurophysiology Environment (ONE) API."""
-__version__ = '2.9.1'
+__version__ = '2.10.0'
diff --git a/one/alf/exceptions.py b/one/alf/exceptions.py
index c5f72fca..d963767b 100644
--- a/one/alf/exceptions.py
+++ b/one/alf/exceptions.py
@@ -1,4 +1,4 @@
-"""ALyx File related errors.
+"""ALyx File related errors and warnings.
 
 A set of Alyx and ALF related error classes which provide a more verbose description of the
 raised issues.
@@ -82,3 +82,8 @@ class ALFMultipleRevisionsFound(ALFError):
     explanation = ('The matching object/file(s) belong to more than one revision. '
                    'Multiple datasets in different revision folders were found with no default '
                    'specified.')
+
+
+class ALFWarning(Warning):
+    """Cautions when loading ALF datasets."""
+    pass
diff --git a/one/tests/test_one.py b/one/tests/test_one.py
index e978a2ea..267f19ed 100644
--- a/one/tests/test_one.py
+++ b/one/tests/test_one.py
@@ -293,15 +293,22 @@ def test_filter(self):
         datasets['rel_path'] = revisions
 
         # Should return last revision before date for each collection/dataset
+        # These comprise mixed revisions which should trigger ALF warning
         revision = '2020-09-06'
-        verifiable = filter_datasets(datasets, None, None, revision, assert_unique=False)
+        expected_warn = 'Multiple revisions: "2020-08-31", "2020-01-01"'
+        with self.assertWarnsRegex(alferr.ALFWarning, expected_warn):
+            verifiable = filter_datasets(datasets, None, None, revision, assert_unique=False)
         self.assertEqual(2, len(verifiable))
         self.assertTrue(all(x.split('#')[1] < revision for x in verifiable['rel_path']))
 
-        # Should return single dataset with last revision when default specified
-        with self.assertRaises(alferr.ALFMultipleRevisionsFound):
-            filter_datasets(datasets, '*spikes.times*', 'alf/probe00', None,
-                            assert_unique=True, wildcards=True, revision_last_before=True)
+        # With no default_revision column there should be a warning about returning the latest
+        # revision when no revision is provided.
+        with self.assertWarnsRegex(alferr.ALFWarning, 'No default revision for dataset'):
+            verifiable = filter_datasets(
+                datasets, '*spikes.times*', 'alf/probe00', None,
+                assert_unique=True, wildcards=True, revision_last_before=True)
+        self.assertEqual(1, len(verifiable))
+        self.assertTrue(verifiable['rel_path'].str.contains('#2021-xx-xx#').all())
 
         # Should return matching revision
         verifiable = filter_datasets(datasets, None, None, r'2020-08-\d{2}',
@@ -342,8 +349,8 @@ def test_filter(self):
                                      assert_unique=True, wildcards=True, revision_last_before=True)
         self.assertEqual(verifiable.rel_path.to_list(),
                          ['alf/probe00/#2020-01-01#/spikes.times.npy'])
-        # When revision_last_before is false, expect multiple objects error
-        with self.assertRaises(alferr.ALFMultipleObjectsFound):
+        # When revision_last_before is false, expect multiple revisions error
+        with self.assertRaises(alferr.ALFMultipleRevisionsFound):
             filter_datasets(datasets, '*spikes.times*', 'alf/probe00', None,
                             assert_unique=True, wildcards=True, revision_last_before=False)
 
@@ -355,12 +362,29 @@ def test_filter_wildcards(self):
                                      assert_unique=False, wildcards=True)
        self.assertTrue(len(verifiable) == 2)
         # As dict with list (should act as logical OR)
+        kwargs = dict(assert_unique=False, revision_last_before=False, wildcards=True)
         dataset = dict(attribute=['timestamp?', 'raw'])
-        verifiable = filter_datasets(datasets, dataset, None, None,
-                                     assert_unique=False, revision_last_before=False,
-                                     wildcards=True)
+        verifiable = filter_datasets(datasets, dataset, None, None, **kwargs)
         self.assertEqual(4, len(verifiable))
 
+        # Test handling greedy captures of collection parts when there are wildcards at the start
+        # of the filename pattern.
+
+        # Add some identical files that exist in collections and sub-collections
+        # (i.e. raw_ephys_data, raw_ephys_data/probe00, raw_ephys_data/probe01)
+        all_datasets = self.one._cache.datasets
+        meta_datasets = all_datasets[all_datasets.rel_path.str.contains('meta')].copy()
+        datasets = pd.concat([datasets, meta_datasets])
+
+        # Matching *meta should not capture raw_ephys_data/probe00, etc.
+        verifiable = filter_datasets(datasets, '*.meta', 'raw_ephys_data', None, **kwargs)
+        expected = ['raw_ephys_data/_spikeglx_ephysData_g0_t0.nidq.meta']
+        self.assertCountEqual(expected, verifiable.rel_path)
+        verifiable = filter_datasets(datasets, '*.meta', 'raw_ephys_data/probe??', None, **kwargs)
+        self.assertEqual(2, len(verifiable))
+        verifiable = filter_datasets(datasets, '*.meta', 'raw_ephys_data*', None, **kwargs)
+        self.assertEqual(3, len(verifiable))
+
     def test_list_datasets(self):
         """Test One.list_datasets"""
         # test filename
@@ -1974,9 +1998,10 @@ def test_revision_last_before(self):
                                                  revision='2020-09-01', assert_unique=False)
         self.assertTrue(len(verifiable) == 2)
 
-        # Test assert unique
+        # Remove one of the datasets' revisions to test assert unique on mixed revisions
+        df_mixed = df.drop((df['revision'] == '2020-01-08').idxmax())
         with self.assertRaises(alferr.ALFMultipleRevisionsFound):
-            filter_revision_last_before(df, revision='2020-09-01', assert_unique=True)
+            filter_revision_last_before(df_mixed, revision='2020-09-01', assert_consistent=True)
 
         # Test with default revisions
         df['default_revision'] = False
@@ -1991,8 +2016,9 @@ def test_revision_last_before(self):
 
         # Add unique default revisions
         df.iloc[[0, 4], -1] = True
-        verifiable = filter_revision_last_before(df.copy(), assert_unique=True)
-        self.assertTrue(len(verifiable) == 2)
+        with self.assertLogs(logging.getLogger('one.util'), 10):  # should log mixed revisions
+            verifiable = filter_revision_last_before(df.copy(), assert_unique=True)
+        self.assertEqual(2, len(verifiable))
         self.assertCountEqual(verifiable['rel_path'], df['rel_path'].iloc[[0, 4]])
 
         # Add multiple default revisions
diff --git a/one/util.py b/one/util.py
index e7602def..da8359c5 100644
--- a/one/util.py
+++ b/one/util.py
@@ -1,10 +1,12 @@
 """Decorators and small standalone functions for api module."""
+import re
 import logging
 import urllib.parse
-from functools import wraps
+import fnmatch
+import warnings
+from functools import wraps, partial
 from typing import Sequence, Union, Iterable, Optional, List
 from collections.abc import Mapping
-import fnmatch
 from datetime import datetime
 
 import pandas as pd
@@ -307,7 +309,9 @@ def filter_datasets(
     revision_last_before : bool
         When true and no exact match exists, the (lexicographically) previous revision is used
         instead. When false the revision string is matched like collection and filename,
-        with regular expressions permitted.
+        with regular expressions permitted. NB: When true and `revision` is None the default
+        revision is returned which may not be the last revision. If no default is defined, the
+        last revision is returned.
     qc : str, int, one.alf.spec.QC
         Returns datasets at or below this QC level. Integer values should correspond to the QC
         enumeration NOT the qc category column codes in the pandas table.
@@ -350,6 +354,26 @@ def filter_datasets(
 
     >>> datasets = filter_datasets(all_datasets, qc='PASS', ignore_qc_not_set=True)
 
+    Raises
+    ------
+    one.alf.exceptions.ALFMultipleCollectionsFound
+        The matching list of datasets have more than one unique collection and `assert_unique` is
+        True.
+    one.alf.exceptions.ALFMultipleRevisionsFound
+        When `revision_last_before` is false, the matching list of datasets have more than one
+        unique revision. When `revision_last_before` is true, a 'default_revision' column exists,
+        and no revision is passed, this error means that one or more matching datasets have
+        multiple revisions specified as the default. This is typically an error in the cache table
+        itself as all datasets should have one and only one default revision specified.
+    one.alf.exceptions.ALFMultipleObjectsFound
+        The matching list of datasets have more than one unique filename and both `assert_unique`
+        and `revision_last_before` are true.
+    one.alf.exceptions.ALFError
+        When both `assert_unique` and `revision_last_before` is true, and a 'default_revision'
+        column exists but `revision` is None; one or more matching datasets have no default
+        revision specified. This is typically an error in the cache table itself as all datasets
+        should have one and only one default revision specified.
+
     Notes
     -----
     - It is not possible to match datasets that are in a given collection OR NOT in ANY collection.
@@ -365,9 +389,15 @@ def filter_datasets(
         spec_str += _file_spec(**filename)
         regex_args.update(**filename)
     else:
-        # Convert to regex is necessary and assert end of string
-        filename = [fnmatch.translate(x) if wildcards else x + '$' for x in ensure_list(filename)]
-        spec_str += '|'.join(filename)
+        # Convert to regex if necessary and assert end of string
+        flagless_token = re.escape(r'(?s:')  # fnmatch.translate may wrap input in flagless group
+        # If there is a wildcard at the start of the filename we must exclude capture of slashes to
+        # avoid capture of collection part, e.g. * -> .* -> [^/]+ (one or more non-slash chars)
+        exclude_slash = partial(re.sub, fr'^({flagless_token})?\.[*?]', r'\g<1>[^/]+')
+        spec_str += '|'.join(
+            exclude_slash(fnmatch.translate(x)) if wildcards else x + '$'
+            for x in ensure_list(filename)
+        )
 
     # If matching revision name, add to regex string
     if not revision_last_before:
@@ -393,33 +423,33 @@ def filter_datasets(
         qc_match &= all_datasets['qc'].ne('NOT_SET')
 
     # Filter datasets on path and QC
-    match = all_datasets[path_match & qc_match]
+    match = all_datasets[path_match & qc_match].copy()
     if len(match) == 0 or not (revision_last_before or assert_unique):
         return match
 
-    revisions = [rel_path_parts(x)[1] or '' for x in match.rel_path.values]
+    # Extract revision to separate column
+    if 'revision' not in match.columns:
+        match['revision'] = match.rel_path.map(lambda x: rel_path_parts(x)[1] or '')
    if assert_unique:
         collections = set(rel_path_parts(x)[0] or '' for x in match.rel_path.values)
         if len(collections) > 1:
             _list = '"' + '", "'.join(collections) + '"'
             raise alferr.ALFMultipleCollectionsFound(_list)
         if not revision_last_before:
-            if filename and len(match) > 1:
+            if len(set(match['revision'])) > 1:
+                _list = '"' + '", "'.join(set(match['revision'])) + '"'
+                raise alferr.ALFMultipleRevisionsFound(_list)
+            if len(match) > 1:
                 _list = '"' + '", "'.join(match['rel_path']) + '"'
                 raise alferr.ALFMultipleObjectsFound(_list)
-            if len(set(revisions)) > 1:
-                _list = '"' + '", "'.join(set(revisions)) + '"'
-                raise alferr.ALFMultipleRevisionsFound(_list)
             else:
                 return match
-        elif filename and len(set(revisions)) != len(revisions):
-            _list = '"' + '", "'.join(match['rel_path']) + '"'
-            raise alferr.ALFMultipleObjectsFound(_list)
 
     return filter_revision_last_before(match, revision, assert_unique=assert_unique)
 
 
-def filter_revision_last_before(datasets, revision=None, assert_unique=True):
+def filter_revision_last_before(
+        datasets, revision=None, assert_unique=True, assert_consistent=False):
     """
     Filter datasets by revision, returning previous revision in ordered list if revision
     doesn't exactly match.
@@ -433,43 +463,80 @@ def filter_revision_last_before(datasets, revision=None, assert_unique=True):
     assert_unique : bool
         When true an alferr.ALFMultipleRevisionsFound exception is raised when multiple default
         revisions are found; an alferr.ALFError when no default revision is found.
+    assert_consistent : bool
+        Will raise alferr.ALFMultipleRevisionsFound if matching revision is different between
+        datasets.
 
     Returns
     -------
     pd.DataFrame
         A datasets DataFrame with 0 or 1 row per unique dataset.
+
+    Raises
+    ------
+    one.alf.exceptions.ALFMultipleRevisionsFound
+        When the 'default_revision' column exists and no revision is passed, this error means that
+        one or more matching datasets have multiple revisions specified as the default. This is
+        typically an error in the cache table itself as all datasets should have one and only one
+        default revision specified.
+        When `assert_consistent` is True, this error may mean that the matching datasets have
+        mixed revisions.
+    one.alf.exceptions.ALFMultipleObjectsFound
+        The matching list of datasets have more than one unique filename and both `assert_unique`
+        and `revision_last_before` are true.
+    one.alf.exceptions.ALFError
+        When both `assert_unique` and `revision_last_before` is true, and a 'default_revision'
+        column exists but `revision` is None; one or more matching datasets have no default
+        revision specified. This is typically an error in the cache table itself as all datasets
+        should have one and only one default revision specified.
+
+    Notes
+    -----
+    - When `revision` is not None, the default revision value is not used. If an older revision is
+      the default one (uncommon), passing in a revision may lead to a newer revision being returned
+      than if revision is None.
+    - A view is returned if a revision column is present, otherwise a copy is returned.
     """
     def _last_before(df):
         """Takes a DataFrame with only one dataset and multiple revisions, returns matching row"""
-        if revision is None and 'default_revision' in df.columns:
-            if assert_unique and sum(df.default_revision) > 1:
-                revisions = df['revision'][df.default_revision.values]
-                rev_list = '"' + '", "'.join(revisions) + '"'
-                raise alferr.ALFMultipleRevisionsFound(rev_list)
-            if sum(df.default_revision) == 1:
-                return df[df.default_revision]
-            if len(df) == 1:  # This may be the case when called from load_datasets
-                return df  # It's not the default be there's only one available revision
-            # default_revision column all False; default isn't copied to remote repository
+        if revision is None:
             dset_name = df['rel_path'].iloc[0]
-            if assert_unique:
-                raise alferr.ALFError(f'No default revision for dataset {dset_name}')
-            else:
-                logger.warning(f'No default revision for dataset {dset_name}; using most recent')
+            if 'default_revision' in df.columns:
+                if assert_unique and sum(df.default_revision) > 1:
+                    revisions = df['revision'][df.default_revision.values]
+                    rev_list = '"' + '", "'.join(revisions) + '"'
+                    raise alferr.ALFMultipleRevisionsFound(rev_list)
+                if sum(df.default_revision) == 1:
+                    return df[df.default_revision]
+                if len(df) == 1:  # This may be the case when called from load_datasets
+                    return df  # It's not the default but there's only one available revision
+                # default_revision column all False; default isn't copied to remote repository
+            if assert_unique:
+                raise alferr.ALFError(f'No default revision for dataset {dset_name}')
+            warnings.warn(
+                f'No default revision for dataset {dset_name}; using most recent',
+                alferr.ALFWarning)
         # Compare revisions lexicographically
-        if assert_unique and len(df['revision'].unique()) > 1:
-            rev_list = '"' + '", "'.join(df['revision'].unique()) + '"'
-            raise alferr.ALFMultipleRevisionsFound(rev_list)
-        # Square brackets forces 1 row DataFrame returned instead of Series
         idx = index_last_before(df['revision'].tolist(), revision)
-        # return df.iloc[slice(0, 0) if idx is None else [idx], :]
+        # Square brackets forces 1 row DataFrame returned instead of Series
        return df.iloc[slice(0, 0) if idx is None else [idx], :]
 
-    with pd.option_context('mode.chained_assignment', None):  # FIXME Explicitly copy?
-        datasets['revision'] = [rel_path_parts(x)[1] or '' for x in datasets.rel_path]
+    # Extract revision to separate column
+    if 'revision' not in datasets.columns:
+        with pd.option_context('mode.chained_assignment', None):  # FIXME Explicitly copy?
+            datasets['revision'] = datasets.rel_path.map(lambda x: rel_path_parts(x)[1] or '')
+    # Group by relative path (sans revision)
     groups = datasets.rel_path.str.replace('#.*#/', '', regex=True).values
     grouped = datasets.groupby(groups, group_keys=False)
-    return grouped.apply(_last_before)
+    filtered = grouped.apply(_last_before)
+    # Raise if matching revision is different between datasets
+    if len(filtered['revision'].unique()) > 1:
+        rev_list = '"' + '", "'.join(filtered['revision'].unique()) + '"'
+        if assert_consistent:
+            raise alferr.ALFMultipleRevisionsFound(rev_list)
+        else:
+            warnings.warn(f'Multiple revisions: {rev_list}', alferr.ALFWarning)
+    return filtered
 
 
 def index_last_before(revisions: List[str], revision: Optional[str]) -> Optional[int]: