From 357b5f079db4d60dec2333d536da244c51b0f0b2 Mon Sep 17 00:00:00 2001
From: Miles Wells
Date: Wed, 25 Sep 2024 15:07:05 +0300
Subject: [PATCH] Resolves #131 and #130

---
 CHANGELOG.md          |  16 ++++-
 one/__init__.py       |   2 +-
 one/alf/exceptions.py |   7 ++-
 one/tests/test_one.py |  54 +++++++++++-----
 one/util.py           | 143 +++++++++++++++++++++++++++++++-----------
 5 files changed, 166 insertions(+), 56 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 228b9cbc..09a095ad 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,17 @@
 # Changelog
-## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.9.1]
+## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.10.0]
+This version improves behaviour of loading revisions and loading datasets from list_datasets output.
+
+### Modified
+
+- sub-collections no longer captured when filtering with filename that starts with wildcard in wildcard mode
+- bugfix of spurious error raised when loading dataset with a revision provided
+
+### Added
+
+- one.alf.exceptions.ALFWarning category allows users to filter warnings relating to mixed revisions
+
+## [2.9.1]
 
 ### Modified
 
@@ -7,7 +19,7 @@
 - HOTFIX: Ensure http data server URL does not end in slash
 - HOTFIX: Handle public aggregate dataset relative paths
 - HOTFIX: No longer warns in silent mode when no param conflicts present
-- Explicit kwargs in load_* methods to avoid user confusion (e.g. no 'namespace' kwarg for `load_dataset`)
+- explicit kwargs in load_* methods to avoid user confusion (e.g. no 'namespace' kwarg for `load_dataset`)
 
 ## [2.9.0]
 This version adds a couple of new ALF functions.
diff --git a/one/__init__.py b/one/__init__.py
index fb65cac4..07a5d384 100644
--- a/one/__init__.py
+++ b/one/__init__.py
@@ -1,2 +1,2 @@
 """The Open Neurophysiology Environment (ONE) API."""
-__version__ = '2.9.1'
+__version__ = '2.10.0'
diff --git a/one/alf/exceptions.py b/one/alf/exceptions.py
index c5f72fca..d963767b 100644
--- a/one/alf/exceptions.py
+++ b/one/alf/exceptions.py
@@ -1,4 +1,4 @@
-"""ALyx File related errors.
+"""ALyx File related errors and warnings.
 
 A set of Alyx and ALF related error classes which provide a more verbose description of the
 raised issues.
@@ -82,3 +82,8 @@ class ALFMultipleRevisionsFound(ALFError):
     explanation = ('The matching object/file(s) belong to more than one revision. '
                    'Multiple datasets in different revision folders were found with no default '
                    'specified.')
+
+
+class ALFWarning(Warning):
+    """Cautions when loading ALF datasets."""
+    pass
diff --git a/one/tests/test_one.py b/one/tests/test_one.py
index e978a2ea..267f19ed 100644
--- a/one/tests/test_one.py
+++ b/one/tests/test_one.py
@@ -293,15 +293,22 @@ def test_filter(self):
         datasets['rel_path'] = revisions
 
         # Should return last revision before date for each collection/dataset
+        # These comprise mixed revisions which should trigger ALF warning
         revision = '2020-09-06'
-        verifiable = filter_datasets(datasets, None, None, revision, assert_unique=False)
+        expected_warn = 'Multiple revisions: "2020-08-31", "2020-01-01"'
+        with self.assertWarnsRegex(alferr.ALFWarning, expected_warn):
+            verifiable = filter_datasets(datasets, None, None, revision, assert_unique=False)
         self.assertEqual(2, len(verifiable))
         self.assertTrue(all(x.split('#')[1] < revision for x in verifiable['rel_path']))
 
-        # Should return single dataset with last revision when default specified
-        with self.assertRaises(alferr.ALFMultipleRevisionsFound):
-            filter_datasets(datasets, '*spikes.times*', 'alf/probe00', None,
-                            assert_unique=True, wildcards=True, revision_last_before=True)
+        # With no default_revision column there should be a warning about returning the latest
+        # revision when no revision is provided.
+        with self.assertWarnsRegex(alferr.ALFWarning, 'No default revision for dataset'):
+            verifiable = filter_datasets(
+                datasets, '*spikes.times*', 'alf/probe00', None,
+                assert_unique=True, wildcards=True, revision_last_before=True)
+        self.assertEqual(1, len(verifiable))
+        self.assertTrue(verifiable['rel_path'].str.contains('#2021-xx-xx#').all())
 
         # Should return matching revision
         verifiable = filter_datasets(datasets, None, None, r'2020-08-\d{2}',
@@ -342,8 +349,8 @@ def test_filter(self):
                                      assert_unique=True, wildcards=True, revision_last_before=True)
         self.assertEqual(verifiable.rel_path.to_list(),
                          ['alf/probe00/#2020-01-01#/spikes.times.npy'])
-        # When revision_last_before is false, expect multiple objects error
-        with self.assertRaises(alferr.ALFMultipleObjectsFound):
+        # When revision_last_before is false, expect multiple revisions error
+        with self.assertRaises(alferr.ALFMultipleRevisionsFound):
             filter_datasets(datasets, '*spikes.times*', 'alf/probe00', None,
                             assert_unique=True, wildcards=True, revision_last_before=False)
 
@@ -355,12 +362,29 @@ def test_filter_wildcards(self):
                                      assert_unique=False, wildcards=True)
        self.assertTrue(len(verifiable) == 2)
         # As dict with list (should act as logical OR)
+        kwargs = dict(assert_unique=False, revision_last_before=False, wildcards=True)
         dataset = dict(attribute=['timestamp?', 'raw'])
-        verifiable = filter_datasets(datasets, dataset, None, None,
-                                     assert_unique=False, revision_last_before=False,
-                                     wildcards=True)
+        verifiable = filter_datasets(datasets, dataset, None, None, **kwargs)
         self.assertEqual(4, len(verifiable))
 
+        # Test handling greedy captures of collection parts when there are wildcards at the start
+        # of the filename pattern.
+
+        # Add some identical files that exist in collections and sub-collections
+        # (i.e. raw_ephys_data, raw_ephys_data/probe00, raw_ephys_data/probe01)
+        all_datasets = self.one._cache.datasets
+        meta_datasets = all_datasets[all_datasets.rel_path.str.contains('meta')].copy()
+        datasets = pd.concat([datasets, meta_datasets])
+
+        # Matching *meta should not capture raw_ephys_data/probe00, etc.
+        verifiable = filter_datasets(datasets, '*.meta', 'raw_ephys_data', None, **kwargs)
+        expected = ['raw_ephys_data/_spikeglx_ephysData_g0_t0.nidq.meta']
+        self.assertCountEqual(expected, verifiable.rel_path)
+        verifiable = filter_datasets(datasets, '*.meta', 'raw_ephys_data/probe??', None, **kwargs)
+        self.assertEqual(2, len(verifiable))
+        verifiable = filter_datasets(datasets, '*.meta', 'raw_ephys_data*', None, **kwargs)
+        self.assertEqual(3, len(verifiable))
+
     def test_list_datasets(self):
         """Test One.list_datasets"""
         # test filename
@@ -1974,9 +1998,10 @@ def test_revision_last_before(self):
                                                  revision='2020-09-01', assert_unique=False)
         self.assertTrue(len(verifiable) == 2)
 
-        # Test assert unique
+        # Remove one of the datasets' revisions to test assert unique on mixed revisions
+        df_mixed = df.drop((df['revision'] == '2020-01-08').idxmax())
         with self.assertRaises(alferr.ALFMultipleRevisionsFound):
-            filter_revision_last_before(df, revision='2020-09-01', assert_unique=True)
+            filter_revision_last_before(df_mixed, revision='2020-09-01', assert_consistent=True)
 
         # Test with default revisions
         df['default_revision'] = False
@@ -1991,8 +2016,9 @@ def test_revision_last_before(self):
 
         # Add unique default revisions
         df.iloc[[0, 4], -1] = True
-        verifiable = filter_revision_last_before(df.copy(), assert_unique=True)
-        self.assertTrue(len(verifiable) == 2)
+        with self.assertLogs(logging.getLogger('one.util'), 10):  # should log mixed revisions
+            verifiable = filter_revision_last_before(df.copy(), assert_unique=True)
+        self.assertEqual(2, len(verifiable))
         self.assertCountEqual(verifiable['rel_path'], df['rel_path'].iloc[[0, 4]])
 
         # Add multiple default revisions
diff --git a/one/util.py b/one/util.py
index e7602def..da8359c5 100644
--- a/one/util.py
+++ b/one/util.py
@@ -1,10 +1,12 @@
 """Decorators and small standalone functions for api module."""
+import re
 import logging
 import urllib.parse
-from functools import wraps
+import fnmatch
+import warnings
+from functools import wraps, partial
 from typing import Sequence, Union, Iterable, Optional, List
 from collections.abc import Mapping
-import fnmatch
 from datetime import datetime
 
 import pandas as pd
@@ -307,7 +309,9 @@ def filter_datasets(
     revision_last_before : bool
         When true and no exact match exists, the (lexicographically) previous revision is used
         instead. When false the revision string is matched like collection and filename,
-        with regular expressions permitted.
+        with regular expressions permitted. NB: When true and `revision` is None the default
+        revision is returned which may not be the last revision. If no default is defined, the
+        last revision is returned.
     qc : str, int, one.alf.spec.QC
         Returns datasets at or below this QC level. Integer values should correspond to the QC
         enumeration NOT the qc category column codes in the pandas table.
@@ -350,6 +354,26 @@ def filter_datasets(
 
     >>> datasets = filter_datasets(all_datasets, qc='PASS', ignore_qc_not_set=True)
 
+    Raises
+    ------
+    one.alf.exceptions.ALFMultipleCollectionsFound
+        The matching list of datasets have more than one unique collection and `assert_unique` is
+        True.
+    one.alf.exceptions.ALFMultipleRevisionsFound
+        When `revision_last_before` is false, the matching list of datasets have more than one
+        unique revision. When `revision_last_before` is true, a 'default_revision' column exists,
+        and no revision is passed, this error means that one or more matching datasets have
+        multiple revisions specified as the default. This is typically an error in the cache table
+        itself as all datasets should have one and only one default revision specified.
+    one.alf.exceptions.ALFMultipleObjectsFound
+        The matching list of datasets have more than one unique filename and both `assert_unique`
+        and `revision_last_before` are true.
+    one.alf.exceptions.ALFError
+        When both `assert_unique` and `revision_last_before` is true, and a 'default_revision'
+        column exists but `revision` is None; one or more matching datasets have no default
+        revision specified. This is typically an error in the cache table itself as all datasets
+        should have one and only one default revision specified.
+
     Notes
     -----
     - It is not possible to match datasets that are in a given collection OR NOT in ANY collection.
@@ -365,9 +389,15 @@ def filter_datasets(
         spec_str += _file_spec(**filename)
         regex_args.update(**filename)
     else:
-        # Convert to regex is necessary and assert end of string
-        filename = [fnmatch.translate(x) if wildcards else x + '$' for x in ensure_list(filename)]
-        spec_str += '|'.join(filename)
+        # Convert to regex if necessary and assert end of string
+        flagless_token = re.escape(r'(?s:')  # fnmatch.translate may wrap input in flagless group
+        # If there is a wildcard at the start of the filename we must exclude capture of slashes to
+        # avoid capture of collection part, e.g. * -> .* -> [^/]+ (one or more non-slash chars)
+        exclude_slash = partial(re.sub, fr'^({flagless_token})?\.[*?]', r'\g<1>[^/]+')
+        spec_str += '|'.join(
+            exclude_slash(fnmatch.translate(x)) if wildcards else x + '$'
+            for x in ensure_list(filename)
+        )
 
     # If matching revision name, add to regex string
     if not revision_last_before:
@@ -393,33 +423,33 @@ def filter_datasets(
         qc_match &= all_datasets['qc'].ne('NOT_SET')
 
     # Filter datasets on path and QC
-    match = all_datasets[path_match & qc_match]
+    match = all_datasets[path_match & qc_match].copy()
     if len(match) == 0 or not (revision_last_before or assert_unique):
         return match
 
-    revisions = [rel_path_parts(x)[1] or '' for x in match.rel_path.values]
+    # Extract revision to separate column
+    if 'revision' not in match.columns:
+        match['revision'] = match.rel_path.map(lambda x: rel_path_parts(x)[1] or '')
    if assert_unique:
         collections = set(rel_path_parts(x)[0] or '' for x in match.rel_path.values)
         if len(collections) > 1:
             _list = '"' + '", "'.join(collections) + '"'
             raise alferr.ALFMultipleCollectionsFound(_list)
         if not revision_last_before:
-            if filename and len(match) > 1:
+            if len(set(match['revision'])) > 1:
+                _list = '"' + '", "'.join(set(match['revision'])) + '"'
+                raise alferr.ALFMultipleRevisionsFound(_list)
+            if len(match) > 1:
                 _list = '"' + '", "'.join(match['rel_path']) + '"'
                 raise alferr.ALFMultipleObjectsFound(_list)
-            if len(set(revisions)) > 1:
-                _list = '"' + '", "'.join(set(revisions)) + '"'
-                raise alferr.ALFMultipleRevisionsFound(_list)
             else:
                 return match
-        elif filename and len(set(revisions)) != len(revisions):
-            _list = '"' + '", "'.join(match['rel_path']) + '"'
-            raise alferr.ALFMultipleObjectsFound(_list)
 
     return filter_revision_last_before(match, revision, assert_unique=assert_unique)
 
 
-def filter_revision_last_before(datasets, revision=None, assert_unique=True):
+def filter_revision_last_before(
+        datasets, revision=None, assert_unique=True, assert_consistent=False):
     """
     Filter datasets by revision, returning previous revision in ordered list if revision
     doesn't exactly match.
@@ -433,43 +463,80 @@ def filter_revision_last_before(datasets, revision=None, assert_unique=True):
     assert_unique : bool
         When true an alferr.ALFMultipleRevisionsFound exception is raised when multiple default
         revisions are found; an alferr.ALFError when no default revision is found.
+    assert_consistent : bool
+        Will raise alferr.ALFMultipleRevisionsFound if matching revision is different between
+        datasets.
 
     Returns
     -------
     pd.DataFrame
         A datasets DataFrame with 0 or 1 row per unique dataset.
+
+    Raises
+    ------
+    one.alf.exceptions.ALFMultipleRevisionsFound
+        When the 'default_revision' column exists and no revision is passed, this error means that
+        one or more matching datasets have multiple revisions specified as the default. This is
+        typically an error in the cache table itself as all datasets should have one and only one
+        default revision specified.
+        When `assert_consistent` is True, this error may mean that the matching datasets have
+        mixed revisions.
+    one.alf.exceptions.ALFMultipleObjectsFound
+        The matching list of datasets have more than one unique filename and both `assert_unique`
+        and `revision_last_before` are true.
+    one.alf.exceptions.ALFError
+        When both `assert_unique` and `revision_last_before` is true, and a 'default_revision'
+        column exists but `revision` is None; one or more matching datasets have no default
+        revision specified. This is typically an error in the cache table itself as all datasets
+        should have one and only one default revision specified.
+
+    Notes
+    -----
+    - When `revision` is not None, the default revision value is not used. If an older revision is
+      the default one (uncommon), passing in a revision may lead to a newer revision being returned
+      than if revision is None.
+    - A view is returned if a revision column is present, otherwise a copy is returned.
     """
     def _last_before(df):
         """Takes a DataFrame with only one dataset and multiple revisions, returns matching row"""
-        if revision is None and 'default_revision' in df.columns:
-            if assert_unique and sum(df.default_revision) > 1:
-                revisions = df['revision'][df.default_revision.values]
-                rev_list = '"' + '", "'.join(revisions) + '"'
-                raise alferr.ALFMultipleRevisionsFound(rev_list)
-            if sum(df.default_revision) == 1:
-                return df[df.default_revision]
-            if len(df) == 1:  # This may be the case when called from load_datasets
-                return df  # It's not the default be there's only one available revision
-            # default_revision column all False; default isn't copied to remote repository
+        if revision is None:
             dset_name = df['rel_path'].iloc[0]
-            if assert_unique:
-                raise alferr.ALFError(f'No default revision for dataset {dset_name}')
-            else:
-                logger.warning(f'No default revision for dataset {dset_name}; using most recent')
+            if 'default_revision' in df.columns:
+                if assert_unique and sum(df.default_revision) > 1:
+                    revisions = df['revision'][df.default_revision.values]
+                    rev_list = '"' + '", "'.join(revisions) + '"'
+                    raise alferr.ALFMultipleRevisionsFound(rev_list)
+                if sum(df.default_revision) == 1:
+                    return df[df.default_revision]
+                if len(df) == 1:  # This may be the case when called from load_datasets
+                    return df  # It's not the default but there's only one available revision
+                # default_revision column all False; default isn't copied to remote repository
+            if assert_unique:
+                raise alferr.ALFError(f'No default revision for dataset {dset_name}')
+            warnings.warn(
+                f'No default revision for dataset {dset_name}; using most recent',
+                alferr.ALFWarning)
         # Compare revisions lexicographically
-        if assert_unique and len(df['revision'].unique()) > 1:
-            rev_list = '"' + '", "'.join(df['revision'].unique()) + '"'
-            raise alferr.ALFMultipleRevisionsFound(rev_list)
-        # Square brackets forces 1 row DataFrame returned instead of Series
         idx = index_last_before(df['revision'].tolist(), revision)
-        # return df.iloc[slice(0, 0) if idx is None else [idx], :]
+        # Square brackets forces 1 row DataFrame returned instead of Series
        return df.iloc[slice(0, 0) if idx is None else [idx], :]
 
-    with pd.option_context('mode.chained_assignment', None):  # FIXME Explicitly copy?
-        datasets['revision'] = [rel_path_parts(x)[1] or '' for x in datasets.rel_path]
+    # Extract revision to separate column
+    if 'revision' not in datasets.columns:
+        with pd.option_context('mode.chained_assignment', None):  # FIXME Explicitly copy?
+            datasets['revision'] = datasets.rel_path.map(lambda x: rel_path_parts(x)[1] or '')
+    # Group by relative path (sans revision)
     groups = datasets.rel_path.str.replace('#.*#/', '', regex=True).values
     grouped = datasets.groupby(groups, group_keys=False)
-    return grouped.apply(_last_before)
+    filtered = grouped.apply(_last_before)
+    # Raise if matching revision is different between datasets
+    if len(filtered['revision'].unique()) > 1:
+        rev_list = '"' + '", "'.join(filtered['revision'].unique()) + '"'
+        if assert_consistent:
+            raise alferr.ALFMultipleRevisionsFound(rev_list)
+        else:
+            warnings.warn(f'Multiple revisions: {rev_list}', alferr.ALFWarning)
+    return filtered
 
 
 def index_last_before(revisions: List[str], revision: Optional[str]) -> Optional[int]: