Resolves #131 and #130
k1o0 committed Sep 26, 2024
1 parent 26bea63 commit 357b5f0
Showing 5 changed files with 166 additions and 56 deletions.
16 changes: 14 additions & 2 deletions CHANGELOG.md
@@ -1,13 +1,25 @@
# Changelog
## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.9.1]
## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.10.0]
This version improves behaviour of loading revisions and loading datasets from list_datasets output.

### Modified

- sub-collections are no longer captured when filtering with a filename that starts with a wildcard in wildcard mode
- bugfix: a spurious error is no longer raised when loading a dataset with a revision provided

### Added

- one.alf.exceptions.ALFWarning category allows users to filter warnings relating to mixed revisions

## [2.9.1]

### Modified

- HOTFIX: When downloading cache only authenticate Alyx when necessary
- HOTFIX: Ensure http data server URL does not end in slash
- HOTFIX: Handle public aggregate dataset relative paths
- HOTFIX: No longer warns in silent mode when no param conflicts present
- Explicit kwargs in load_* methods to avoid user confusion (e.g. no 'namespace' kwarg for `load_dataset`)
- explicit kwargs in load_* methods to avoid user confusion (e.g. no 'namespace' kwarg for `load_dataset`)

## [2.9.0]
This version adds a couple of new ALF functions.
2 changes: 1 addition & 1 deletion one/__init__.py
@@ -1,2 +1,2 @@
"""The Open Neurophysiology Environment (ONE) API."""
__version__ = '2.9.1'
__version__ = '2.10.0'
7 changes: 6 additions & 1 deletion one/alf/exceptions.py
@@ -1,4 +1,4 @@
"""ALyx File related errors.
"""ALyx File related errors and warnings.
A set of Alyx and ALF related error classes which provide a more verbose description of the raised
issues.
@@ -82,3 +82,8 @@ class ALFMultipleRevisionsFound(ALFError):
explanation = ('The matching object/file(s) belong to more than one revision. '
'Multiple datasets in different revision folders were found with no default '
'specified.')


class ALFWarning(Warning):
"""Cautions when loading ALF datasets."""
pass
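
Since ALFWarning is a plain Warning subclass, users can control it with Python's built-in warnings machinery. Below is a minimal sketch (not part of this commit) of silencing, escalating, or capturing the new category; the load call in the last block is a placeholder:

import warnings
import one.alf.exceptions as alferr

# Silence mixed-revision cautions entirely
warnings.simplefilter('ignore', alferr.ALFWarning)

# ...or promote them to errors while debugging revision handling
warnings.simplefilter('error', alferr.ALFWarning)

# ...or capture them around a single call
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always', alferr.ALFWarning)
    # e.g. one.load_dataset(eid, 'spikes.times', collection='alf/probe00')
    pass
mixed_revision_warnings = [w for w in caught if issubclass(w.category, alferr.ALFWarning)]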
54 changes: 40 additions & 14 deletions one/tests/test_one.py
@@ -293,15 +293,22 @@ def test_filter(self):
datasets['rel_path'] = revisions

# Should return last revision before date for each collection/dataset
# These comprise mixed revisions which should trigger ALF warning
revision = '2020-09-06'
verifiable = filter_datasets(datasets, None, None, revision, assert_unique=False)
expected_warn = 'Multiple revisions: "2020-08-31", "2020-01-01"'
with self.assertWarnsRegex(alferr.ALFWarning, expected_warn):
verifiable = filter_datasets(datasets, None, None, revision, assert_unique=False)
self.assertEqual(2, len(verifiable))
self.assertTrue(all(x.split('#')[1] < revision for x in verifiable['rel_path']))

# Should return single dataset with last revision when default specified
with self.assertRaises(alferr.ALFMultipleRevisionsFound):
filter_datasets(datasets, '*spikes.times*', 'alf/probe00', None,
assert_unique=True, wildcards=True, revision_last_before=True)
# with no default_revision column there should be a warning about returning the latest revision
# when no revision is provided.
with self.assertWarnsRegex(alferr.ALFWarning, 'No default revision for dataset'):
verifiable = filter_datasets(
datasets, '*spikes.times*', 'alf/probe00', None,
assert_unique=True, wildcards=True, revision_last_before=True)
self.assertEqual(1, len(verifiable))
self.assertTrue(verifiable['rel_path'].str.contains('#2021-xx-xx#').all())

# Should return matching revision
verifiable = filter_datasets(datasets, None, None, r'2020-08-\d{2}',
@@ -342,8 +349,8 @@ def test_filter(self):
assert_unique=True, wildcards=True, revision_last_before=True)
self.assertEqual(verifiable.rel_path.to_list(),
['alf/probe00/#2020-01-01#/spikes.times.npy'])
# When revision_last_before is false, expect multiple objects error
with self.assertRaises(alferr.ALFMultipleObjectsFound):
# When revision_last_before is false, expect multiple revisions error
with self.assertRaises(alferr.ALFMultipleRevisionsFound):
filter_datasets(datasets, '*spikes.times*', 'alf/probe00', None,
assert_unique=True, wildcards=True, revision_last_before=False)

@@ -355,12 +362,29 @@ def test_filter_wildcards(self):
assert_unique=False, wildcards=True)
self.assertTrue(len(verifiable) == 2)
# As dict with list (should act as logical OR)
kwargs = dict(assert_unique=False, revision_last_before=False, wildcards=True)
dataset = dict(attribute=['timestamp?', 'raw'])
verifiable = filter_datasets(datasets, dataset, None, None,
assert_unique=False, revision_last_before=False,
wildcards=True)
verifiable = filter_datasets(datasets, dataset, None, None, **kwargs)
self.assertEqual(4, len(verifiable))

# Test handling greedy captures of collection parts when there are wildcards at the start
# of the filename pattern.

# Add some identical files that exist in collections and sub-collections
# (i.e. raw_ephys_data, raw_ephys_data/probe00, raw_ephys_data/probe01)
all_datasets = self.one._cache.datasets
meta_datasets = all_datasets[all_datasets.rel_path.str.contains('meta')].copy()
datasets = pd.concat([datasets, meta_datasets])

# Matching *meta should not capture raw_ephys_data/probe00, etc.
verifiable = filter_datasets(datasets, '*.meta', 'raw_ephys_data', None, **kwargs)
expected = ['raw_ephys_data/_spikeglx_ephysData_g0_t0.nidq.meta']
self.assertCountEqual(expected, verifiable.rel_path)
verifiable = filter_datasets(datasets, '*.meta', 'raw_ephys_data/probe??', None, **kwargs)
self.assertEqual(2, len(verifiable))
verifiable = filter_datasets(datasets, '*.meta', 'raw_ephys_data*', None, **kwargs)
self.assertEqual(3, len(verifiable))

def test_list_datasets(self):
"""Test One.list_datasets"""
# test filename
@@ -1974,9 +1998,10 @@ def test_revision_last_before(self):
revision='2020-09-01', assert_unique=False)
self.assertTrue(len(verifiable) == 2)

# Test assert unique
# Remove one of the datasets' revisions to test assert unique on mixed revisions
df_mixed = df.drop((df['revision'] == '2020-01-08').idxmax())
with self.assertRaises(alferr.ALFMultipleRevisionsFound):
filter_revision_last_before(df, revision='2020-09-01', assert_unique=True)
filter_revision_last_before(df_mixed, revision='2020-09-01', assert_consistent=True)

# Test with default revisions
df['default_revision'] = False
@@ -1991,8 +2016,9 @@

# Add unique default revisions
df.iloc[[0, 4], -1] = True
verifiable = filter_revision_last_before(df.copy(), assert_unique=True)
self.assertTrue(len(verifiable) == 2)
with self.assertLogs(logging.getLogger('one.util'), 10): # should log mixed revisions
verifiable = filter_revision_last_before(df.copy(), assert_unique=True)
self.assertEqual(2, len(verifiable))
self.assertCountEqual(verifiable['rel_path'], df['rel_path'].iloc[[0, 4]])

# Add multiple default revisions
143 changes: 105 additions & 38 deletions one/util.py
@@ -1,10 +1,12 @@
"""Decorators and small standalone functions for api module."""
import re
import logging
import urllib.parse
from functools import wraps
import fnmatch
import warnings
from functools import wraps, partial
from typing import Sequence, Union, Iterable, Optional, List
from collections.abc import Mapping
import fnmatch
from datetime import datetime

import pandas as pd
@@ -307,7 +309,9 @@ def filter_datasets(
revision_last_before : bool
When true and no exact match exists, the (lexicographically) previous revision is used
instead. When false the revision string is matched like collection and filename,
with regular expressions permitted.
with regular expressions permitted. NB: When true and `revision` is None the default
revision is returned which may not be the last revision. If no default is defined, the
last revision is returned.
qc : str, int, one.alf.spec.QC
Returns datasets at or below this QC level. Integer values should correspond to the QC
enumeration NOT the qc category column codes in the pandas table.
@@ -350,6 +354,26 @@
>>> datasets = filter_datasets(all_datasets, qc='PASS', ignore_qc_not_set=True)
Raises
------
one.alf.exceptions.ALFMultipleCollectionsFound
The matching list of datasets has more than one unique collection and `assert_unique` is
True.
one.alf.exceptions.ALFMultipleRevisionsFound
When `revision_last_before` is false, the matching list of datasets has more than one
unique revision. When `revision_last_before` is true, a 'default_revision' column exists,
and no revision is passed, this error means that one or more matching datasets have
multiple revisions specified as the default. This is typically an error in the cache table
itself as all datasets should have one and only one default revision specified.
one.alf.exceptions.ALFMultipleObjectsFound
The matching list of datasets has more than one unique filename and both `assert_unique`
and `revision_last_before` are true.
one.alf.exceptions.ALFError
When both `assert_unique` and `revision_last_before` are true, and a 'default_revision'
column exists but `revision` is None, one or more matching datasets have no default
revision specified. This is typically an error in the cache table itself as all datasets
should have one and only one default revision specified.
Notes
-----
- It is not possible to match datasets that are in a given collection OR NOT in ANY collection.
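
For orientation (not part of the diff), a sketch of handling the exceptions documented above; `all_datasets` is assumed to be a cache datasets table (e.g. `one._cache.datasets`) and the arguments mirror those used in the tests earlier in this commit:

import one.alf.exceptions as alferr
from one.util import filter_datasets

try:
    dsets = filter_datasets(
        all_datasets, '*spikes.times*', 'alf/probe00', None,
        assert_unique=True, wildcards=True, revision_last_before=True)
except alferr.ALFMultipleRevisionsFound:
    # more than one default revision defined for a matching dataset (cache table error)
    raise
except alferr.ALFError:
    # e.g. no default revision defined for a matching dataset
    raise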
@@ -365,9 +389,15 @@
spec_str += _file_spec(**filename)
regex_args.update(**filename)
else:
# Convert to regex is necessary and assert end of string
filename = [fnmatch.translate(x) if wildcards else x + '$' for x in ensure_list(filename)]
spec_str += '|'.join(filename)
# Convert to regex if necessary and assert end of string
flagless_token = re.escape(r'(?s:') # fnmatch.translate may wrap input in flagless group
# If there is a wildcard at the start of the filename we must exclude capture of slashes to
# avoid capture of collection part, e.g. * -> .* -> [^/]+ (one or more non-slash chars)
exclude_slash = partial(re.sub, fr'^({flagless_token})?\.[*?]', r'\g<1>[^/]+')
spec_str += '|'.join(
exclude_slash(fnmatch.translate(x)) if wildcards else x + '$'
for x in ensure_list(filename)
)

# If matching revision name, add to regex string
if not revision_last_before:
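
As an aside (not shown in the diff), a quick sketch of what the wildcard-to-regex translation above produces, assuming CPython 3.8+ where fnmatch.translate wraps the pattern in a flagless `(?s:` group:

import re
import fnmatch
from functools import partial

flagless_token = re.escape(r'(?s:')
exclude_slash = partial(re.sub, fr'^({flagless_token})?\.[*?]', r'\g<1>[^/]+')

pattern = exclude_slash(fnmatch.translate('*.meta'))
# fnmatch.translate('*.meta') -> r'(?s:.*\.meta)\Z'; the substitution turns the leading
# '.*' into '[^/]+' so the wildcard can no longer swallow a collection part
assert re.match(pattern, '_spikeglx_ephysData_g0_t0.nidq.meta')
assert not re.match(pattern, 'probe00/_spikeglx_ephysData_g0_t0.nidq.meta')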
@@ -393,33 +423,33 @@
qc_match &= all_datasets['qc'].ne('NOT_SET')

# Filter datasets on path and QC
match = all_datasets[path_match & qc_match]
match = all_datasets[path_match & qc_match].copy()
if len(match) == 0 or not (revision_last_before or assert_unique):
return match

revisions = [rel_path_parts(x)[1] or '' for x in match.rel_path.values]
# Extract revision to separate column
if 'revision' not in match.columns:
match['revision'] = match.rel_path.map(lambda x: rel_path_parts(x)[1] or '')
if assert_unique:
collections = set(rel_path_parts(x)[0] or '' for x in match.rel_path.values)
if len(collections) > 1:
_list = '"' + '", "'.join(collections) + '"'
raise alferr.ALFMultipleCollectionsFound(_list)
if not revision_last_before:
if filename and len(match) > 1:
if len(set(match['revision'])) > 1:
_list = '"' + '", "'.join(set(match['revision'])) + '"'
raise alferr.ALFMultipleRevisionsFound(_list)
if len(match) > 1:
_list = '"' + '", "'.join(match['rel_path']) + '"'
raise alferr.ALFMultipleObjectsFound(_list)
if len(set(revisions)) > 1:
_list = '"' + '", "'.join(set(revisions)) + '"'
raise alferr.ALFMultipleRevisionsFound(_list)
else:
return match
elif filename and len(set(revisions)) != len(revisions):
_list = '"' + '", "'.join(match['rel_path']) + '"'
raise alferr.ALFMultipleObjectsFound(_list)

return filter_revision_last_before(match, revision, assert_unique=assert_unique)


def filter_revision_last_before(datasets, revision=None, assert_unique=True):
def filter_revision_last_before(
datasets, revision=None, assert_unique=True, assert_consistent=False):
"""
Filter datasets by revision, returning previous revision in ordered list if revision
doesn't exactly match.
@@ -433,43 +463,80 @@ def filter_revision_last_before(datasets, revision=None, assert_unique=True):
assert_unique : bool
When true an alferr.ALFMultipleRevisionsFound exception is raised when multiple
default revisions are found; an alferr.ALFError when no default revision is found.
assert_consistent : bool
When true, an alferr.ALFMultipleRevisionsFound exception is raised if the matching revision
differs between datasets.
Returns
-------
pd.DataFrame
A datasets DataFrame with 0 or 1 row per unique dataset.
Raises
------
one.alf.exceptions.ALFMultipleRevisionsFound
When the 'default_revision' column exists and no revision is passed, this error means that
one or more matching datasets have multiple revisions specified as the default. This is
typically an error in the cache table itself as all datasets should have one and only one
default revision specified.
When `assert_consistent` is True, this error may mean that the matching datasets have
mixed revisions.
one.alf.exceptions.ALFMultipleObjectsFound
The matching list of datasets has more than one unique filename and both `assert_unique`
and `revision_last_before` are true.
one.alf.exceptions.ALFError
When both `assert_unique` and `revision_last_before` are true, and a 'default_revision'
column exists but `revision` is None, one or more matching datasets have no default
revision specified. This is typically an error in the cache table itself as all datasets
should have one and only one default revision specified.
Notes
-----
- When `revision` is not None, the default revision value is not used. If an older revision is
the default one (uncommon), passing in a revision may lead to a newer revision being returned
than if revision is None.
- A view is returned if a revision column is present, otherwise a copy is returned.
"""
def _last_before(df):
"""Takes a DataFrame with only one dataset and multiple revisions, returns matching row"""
if revision is None and 'default_revision' in df.columns:
if assert_unique and sum(df.default_revision) > 1:
revisions = df['revision'][df.default_revision.values]
rev_list = '"' + '", "'.join(revisions) + '"'
raise alferr.ALFMultipleRevisionsFound(rev_list)
if sum(df.default_revision) == 1:
return df[df.default_revision]
if len(df) == 1: # This may be the case when called from load_datasets
return df # It's not the default be there's only one available revision
# default_revision column all False; default isn't copied to remote repository
if revision is None:
dset_name = df['rel_path'].iloc[0]
if assert_unique:
raise alferr.ALFError(f'No default revision for dataset {dset_name}')
else:
logger.warning(f'No default revision for dataset {dset_name}; using most recent')
if 'default_revision' in df.columns:
if assert_unique and sum(df.default_revision) > 1:
revisions = df['revision'][df.default_revision.values]
rev_list = '"' + '", "'.join(revisions) + '"'
raise alferr.ALFMultipleRevisionsFound(rev_list)
if sum(df.default_revision) == 1:
return df[df.default_revision]
if len(df) == 1: # This may be the case when called from load_datasets
return df # It's not the default but there's only one available revision
# default_revision column all False; default isn't copied to remote repository
if assert_unique:
raise alferr.ALFError(f'No default revision for dataset {dset_name}')
warnings.warn(
f'No default revision for dataset {dset_name}; using most recent',
alferr.ALFWarning)
# Compare revisions lexicographically
if assert_unique and len(df['revision'].unique()) > 1:
rev_list = '"' + '", "'.join(df['revision'].unique()) + '"'
raise alferr.ALFMultipleRevisionsFound(rev_list)
# Square brackets forces 1 row DataFrame returned instead of Series
idx = index_last_before(df['revision'].tolist(), revision)
# return df.iloc[slice(0, 0) if idx is None else [idx], :]
# Square brackets forces 1 row DataFrame returned instead of Series
return df.iloc[slice(0, 0) if idx is None else [idx], :]

with pd.option_context('mode.chained_assignment', None): # FIXME Explicitly copy?
datasets['revision'] = [rel_path_parts(x)[1] or '' for x in datasets.rel_path]
# Extract revision to separate column
if 'revision' not in datasets.columns:
with pd.option_context('mode.chained_assignment', None): # FIXME Explicitly copy?
datasets['revision'] = datasets.rel_path.map(lambda x: rel_path_parts(x)[1] or '')
# Group by relative path (sans revision)
groups = datasets.rel_path.str.replace('#.*#/', '', regex=True).values
grouped = datasets.groupby(groups, group_keys=False)
return grouped.apply(_last_before)
filtered = grouped.apply(_last_before)
# Raise if matching revision is different between datasets
if len(filtered['revision'].unique()) > 1:
rev_list = '"' + '", "'.join(filtered['revision'].unique()) + '"'
if assert_consistent:
raise alferr.ALFMultipleRevisionsFound(rev_list)
else:
warnings.warn(f'Multiple revisions: {rev_list}', alferr.ALFWarning)
return filtered


def index_last_before(revisions: List[str], revision: Optional[str]) -> Optional[int]:
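
Putting the above together, a sketch (not part of the commit) of the new mixed-revision behaviour; the two-row DataFrame layout is an assumption modelled on the tests in this diff:

import warnings
import pandas as pd
import one.alf.exceptions as alferr
from one.util import filter_revision_last_before

df = pd.DataFrame({'rel_path': [
    'alf/probe00/#2020-01-01#/spikes.times.npy',
    'alf/probe01/#2020-08-31#/spikes.times.npy']})

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always', alferr.ALFWarning)
    out = filter_revision_last_before(df, revision='2020-09-06', assert_unique=False)
# Each dataset resolves to a different last-before revision, so an ALFWarning with the
# message 'Multiple revisions: ...' is expected; passing assert_consistent=True instead
# raises ALFMultipleRevisionsFound.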
