From 22781dcdbd6394a8ea87609f4326dae58fefb42c Mon Sep 17 00:00:00 2001 From: k1o0 Date: Fri, 25 Oct 2024 15:43:53 +0300 Subject: [PATCH 01/13] Resolves #872 and https://github.com/int-brain-lab/ONE/issues/134 --- .../migrations/0022_project_to_projects.py | 2 +- ..._collection_alter_dataset_hash_and_more.py | 32 +++++++++++++++++++ alyx/data/models.py | 8 ++--- alyx/data/transfers.py | 26 ++++++++------- alyx/data/views.py | 6 ++-- alyx/misc/management/commands/one_cache.py | 24 +++++--------- .../oneoff/2024-03-16-update_test_fixture.py | 11 ++++--- 7 files changed, 69 insertions(+), 40 deletions(-) create mode 100644 alyx/data/migrations/0021_alter_dataset_collection_alter_dataset_hash_and_more.py diff --git a/alyx/actions/migrations/0022_project_to_projects.py b/alyx/actions/migrations/0022_project_to_projects.py index ae408e62..70c7f66d 100644 --- a/alyx/actions/migrations/0022_project_to_projects.py +++ b/alyx/actions/migrations/0022_project_to_projects.py @@ -18,7 +18,7 @@ def project2projects(apps, schema_editor): sessions = Session.objects.exclude(Q(project__isnull=True) | Q(projects=F('project'))) # Check query worked - # from one.util import ensure_list + # from iblutil.util import ensure_list # for session in sessions.values('pk', 'project', 'projects'): # assert session['project'] not in ensure_list(session['projects']) diff --git a/alyx/data/migrations/0021_alter_dataset_collection_alter_dataset_hash_and_more.py b/alyx/data/migrations/0021_alter_dataset_collection_alter_dataset_hash_and_more.py new file mode 100644 index 00000000..6f609b23 --- /dev/null +++ b/alyx/data/migrations/0021_alter_dataset_collection_alter_dataset_hash_and_more.py @@ -0,0 +1,32 @@ +# Generated by Django 5.1.2 on 2024-11-01 17:53 + +import django.core.validators +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('data', '0020_alter_datarepository_timezone'), + ] + + operations = [ + migrations.AlterField( + model_name='dataset', + name='collection', + field=models.CharField(blank=True, default='', help_text='file subcollection or subfolder', max_length=255, validators=[django.core.validators.RegexValidator('^[\\w./-]+$', 'Collections must only contain letters, numbers, hyphens, underscores and forward slashes.')]), + preserve_default=False, + ), + migrations.AlterField( + model_name='dataset', + name='hash', + field=models.CharField(blank=True, default='', help_text='Hash of the data buffer, SHA-1 is 40 hex chars, while md5is 32 hex chars', max_length=64), + preserve_default=False, + ), + migrations.AlterField( + model_name='dataset', + name='version', + field=models.CharField(blank=True, default='', help_text='version of the algorithm generating the file', max_length=64), + preserve_default=False, + ), + ] diff --git a/alyx/data/models.py b/alyx/data/models.py index df205f05..964bc57f 100644 --- a/alyx/data/models.py +++ b/alyx/data/models.py @@ -251,7 +251,7 @@ class Revision(BaseModel): "Revisions must only contain letters, " "numbers, hyphens, underscores and forward slashes.") name = models.CharField(max_length=255, blank=True, help_text="Long name", - unique=True, validators=[name_validator]) + unique=True, null=False, validators=[name_validator]) description = models.CharField(max_length=1023, blank=True) created_datetime = models.DateTimeField(blank=True, null=True, default=timezone.now, help_text="created date") @@ -316,19 +316,19 @@ class Dataset(BaseExperimentalData): md5 = models.UUIDField(blank=True, null=True, help_text="MD5 hash 
of the data buffer") - hash = models.CharField(blank=True, null=True, max_length=64, + hash = models.CharField(blank=True, null=False, max_length=64, help_text=("Hash of the data buffer, SHA-1 is 40 hex chars, while md5" "is 32 hex chars")) # here we usually refer to version as an algorithm version such as ibllib-1.4.2 - version = models.CharField(blank=True, null=True, max_length=64, + version = models.CharField(blank=True, null=False, max_length=64, help_text="version of the algorithm generating the file") # the collection comprises session sub-folders collection_validator = RegexValidator(f"^{ALF_SPEC['collection']}$", "Collections must only contain letters, " "numbers, hyphens, underscores and forward slashes.") - collection = models.CharField(blank=True, null=True, max_length=255, + collection = models.CharField(blank=True, null=False, max_length=255, help_text='file subcollection or subfolder', validators=[collection_validator]) diff --git a/alyx/data/transfers.py b/alyx/data/transfers.py index bc6b744e..010e1c93 100644 --- a/alyx/data/transfers.py +++ b/alyx/data/transfers.py @@ -201,7 +201,7 @@ def _get_name_collection_revision(file, rel_dir_path): fullpath = Path(rel_dir_path).joinpath(file) try: info = folder_parts(fullpath.parent, as_dict=True) - if info['revision'] is not None: + if info['revision']: path_parts = fullpath.parent.parts assert path_parts.index(f"#{info['revision']}#") == len(path_parts) - 1 except AssertionError: @@ -218,25 +218,26 @@ def _get_name_collection_revision(file, rel_dir_path): info['full_path'] = fullpath.as_posix() info['filename'] = fullpath.name info['rel_dir_path'] = '{subject}/{date}/{number}'.format(**info) + info = {k: v or '' for k, v in info.items()} return info, None def _change_default_dataset(session, collection, filename): - dataset = Dataset.objects.filter(session=session, collection=collection, name=filename, - default_dataset=True) + dataset = Dataset.objects.filter( + session=session, collection=collection or '', name=filename, default_dataset=True) if dataset.count() > 0: dataset.update(default_dataset=False) def _check_dataset_protected(session, collection, filename): # Order datasets by the latest revision with the original one last - dataset = Dataset.objects.filter(session=session, collection=collection, - name=filename).order_by( + dataset = Dataset.objects.filter( + session=session, collection=collection or '', name=filename).order_by( F('revision__created_datetime').desc(nulls_last=True)) if dataset.count() == 0: return False, [] else: - protected = any([d.is_protected for d in dataset]) + protected = any(d.is_protected for d in dataset) protected_info = [{d.revision.name if d.revision else '': d.is_protected} for d in dataset] return protected, protected_info @@ -248,8 +249,9 @@ def _create_dataset_file_records( file_size=None, version=None, revision=None, default=None, qc=None): assert session is not None + collection = collection or '' revision_name = f'#{revision.name}#' if revision else '' - relative_path = PurePosixPath(rel_dir_path, collection or '', revision_name, filename) + relative_path = PurePosixPath(rel_dir_path, collection, revision_name, filename) dataset_type = get_dataset_type(filename, DatasetType.objects.all()) data_format = get_data_format(filename) assert dataset_type @@ -286,7 +288,7 @@ def _create_dataset_file_records( # The user doesn't have to be the same when getting an existing dataset, but we still # have to set the created_by field. 
dataset.created_by = user - if version is not None: + if version: dataset.version = version """ if a hash/filesize is provided, label the dataset with it @@ -295,8 +297,8 @@ def _create_dataset_file_records( If the hash doesn't exist and/or can't be verified, assume that the dataset is patched """ is_patched = True - if hash is not None: - if dataset.hash is not None: + if hash: + if dataset.hash: is_patched = not dataset.hash == hash dataset.hash = hash if file_size is not None: @@ -708,8 +710,8 @@ def _ls_globus(file_record, add_uuid=False): data_repository__globus_is_personal=False, data_repository__name__icontains='flatiron').first() if fr_server is None: - logger.warning(str(ds.session) + '/' + (ds.collection or '') + - '/' + ds.name + " doesnt exist on server - skipping") + logger.warning(str(ds.session) + '/' + ds.collection + + '/' + ds.name + " doesn't exist on server - skipping") continue ls_server = _ls_globus(fr_server, add_uuid=True) # if the file is not found on the remote server, do nothing diff --git a/alyx/data/views.py b/alyx/data/views.py index 6e2216c2..1f05d5f5 100644 --- a/alyx/data/views.py +++ b/alyx/data/views.py @@ -306,7 +306,7 @@ def _make_dataset_response(dataset): 'session_users': ','.join(_.username for _ in dataset.session.users.all()), 'session_start_time': dataset.session.start_time, 'collection': dataset.collection, - 'revision': getattr(dataset.revision, 'name', None), + 'revision': getattr(dataset.revision, 'name', ''), 'default': dataset.default_dataset, 'qc': dataset.qc } @@ -572,7 +572,7 @@ def create(self, request): if resp: return resp - if info['revision'] is not None: + if info['revision']: revision, _ = Revision.objects.get_or_create(name=info['revision']) else: revision = None @@ -580,7 +580,7 @@ def create(self, request): dataset, resp = _create_dataset_file_records( collection=info['collection'], rel_dir_path=info['rel_dir_path'], filename=info['filename'], session=session, user=user, repositories=repositories, - exists_in=exists_in, hash=hash, file_size=fsize, version=version, + exists_in=exists_in, hash=hash or '', file_size=fsize, version=version or '', revision=revision, default=default, qc=qc) if resp: return resp diff --git a/alyx/misc/management/commands/one_cache.py b/alyx/misc/management/commands/one_cache.py index 6f5e2166..4b197542 100644 --- a/alyx/misc/management/commands/one_cache.py +++ b/alyx/misc/management/commands/one_cache.py @@ -15,7 +15,7 @@ import pyarrow.parquet as pq import pyarrow as pa from tqdm import tqdm -from one.alf.cache import _metadata +from one.alf.cache import _metadata, SESSIONS_COLUMNS, DATASETS_COLUMNS from one.util import QC_TYPE from one.alf.spec import QC from one.remote.aws import get_s3_virtual_host @@ -32,7 +32,7 @@ from experiments.models import ProbeInsertion logger = logging.getLogger(__name__) -ONE_API_VERSION = '2.7' # Minimum compatible ONE api version +ONE_API_VERSION = '2.10' # Minimum compatible ONE api version def measure_time(func): @@ -345,7 +345,7 @@ def generate_sessions_frame(tags=None) -> pd.DataFrame: if query.count() == 0: logger.warning(f'No datasets associated with sessions found for {tags}, ' f'returning empty dataframe') - return + return pd.DataFrame(columns=SESSIONS_COLUMNS).set_index('id') df = pd.DataFrame.from_records(query.values(*fields).distinct()) logger.debug(f'Raw session frame = {getsizeof(df) / 1024**2} MiB') @@ -376,7 +376,6 @@ def generate_datasets_frame(tags=None, batch_size=100_000) -> pd.DataFrame: """DATASETS_COLUMNS = ( 'id', # uuid str 'eid', # uuid str - 
'session_path', # relative to the root 'rel_path', # relative to the session path, includes the filename 'file_size', # float, bytes, optional 'hash', # sha1/md5 str, recomputed in load function @@ -392,7 +391,7 @@ def generate_datasets_frame(tags=None, batch_size=100_000) -> pd.DataFrame: exists=True, data_repository__name__startswith='aws').values('pk') # Fetch datasets and their related tables - ds = Dataset.objects.select_related('session', 'session__subject', 'session__lab', 'revision') + ds = Dataset.objects if tags: kw = {'tags__name__in' if not isinstance(tags, str) else 'tags__name': tags} ds = ds.prefetch_related('tag').filter(**kw) @@ -403,13 +402,14 @@ def generate_datasets_frame(tags=None, batch_size=100_000) -> pd.DataFrame: if ds.count() == 0: logger.warning(f'No datasets associated with sessions found for {tags}, ' f'returning empty dataframe') - return + return (pd.DataFrame(columns=DATASETS_COLUMNS) + .set_index(['eid', 'id']) + .astype({'qc': QC_TYPE})) # fields to keep from Dataset table fields = ( 'id', 'name', 'file_size', 'hash', 'collection', 'revision__name', 'default_dataset', - 'session__id', 'session__start_time__date', 'session__number', - 'session__subject__nickname', 'session__lab__name', 'exists_flatiron', 'exists_aws', 'qc' + 'session__id', 'qc' ) fields_map = {'session__id': 'eid', 'default_dataset': 'default_revision'} @@ -421,14 +421,6 @@ def generate_datasets_frame(tags=None, batch_size=100_000) -> pd.DataFrame: df = pd.DataFrame.from_records(current_qs.values(*fields)).rename(fields_map, axis=1) df['exists'] = True - # TODO New version without this nonsense - # session_path - globus_path = df.pop('session__lab__name') + '/Subjects' - subject = df.pop('session__subject__nickname') - date = df.pop('session__start_time__date').astype(str) - number = df.pop('session__number').apply(lambda x: str(x).zfill(3)) - df['session_path'] = globus_path.str.cat((subject, date, number), sep='/') - # relative_path revision = map(lambda x: None if not x else f'#{x}#', df.pop('revision__name')) zipped = zip(df.pop('collection'), revision, df.pop('name')) diff --git a/scripts/oneoff/2024-03-16-update_test_fixture.py b/scripts/oneoff/2024-03-16-update_test_fixture.py index 67a6ff14..325ab915 100644 --- a/scripts/oneoff/2024-03-16-update_test_fixture.py +++ b/scripts/oneoff/2024-03-16-update_test_fixture.py @@ -14,12 +14,14 @@ data = json.load(fp) # Get implant weight map -pk2iw = {r['pk']: r['fields']['implant_weight'] for r in filter(lambda r: r['model'] == 'subjects.subject', data)} +pk2iw = {r['pk']: r['fields']['implant_weight'] + for r in filter(lambda r: r['model'] == 'subjects.subject', data)} # Add implant weights to surgeries for record in filter(lambda r: r['model'] == 'actions.surgery', data): # Check if implant surgery - implant = any('implant' in p for p in record['fields'].get('procedures', [])) or 'headplate' in record['fields']['narrative'] + implant = (any('implant' in p for p in record['fields'].get('procedures', [])) or + 'headplate' in record['fields']['narrative']) # Implant weight should be subject's implant weight iw = pk2iw[record['fields']['subject']] if iw is None: # ... 
or a random float rounded to 2 decimal places @@ -33,10 +35,11 @@ # find any with multiple surgeries # from collections import Counter -# counter = Counter(map(lambda r: r['fields']['subject'], filter(lambda r: r['model'] == 'actions.surgery', data))) +# surgeries = filter(lambda r: r['model'] == 'actions.surgery', data) +# counter = Counter(map(lambda r: r['fields']['subject'], surgeries)) # pk, total = counter.most_common()[3] # assert total > 1 -# records = list(filter(lambda r: r['model'] == 'actions.surgery' and r['fields']['subject'] == pk, data)) +# recs = filter(lambda r: r['model'] == 'actions.surgery' and r['fields']['subject'] == pk, data) # Write to file with gzip.open(path, 'wt', encoding='UTF-8') as fp: From f930fbf4767ec477e779535a4b8867fcb3078e8f Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Fri, 1 Nov 2024 21:05:42 +0200 Subject: [PATCH 02/13] one.alf.files -> one.alf.path --- alyx/data/models.py | 2 +- alyx/data/transfers.py | 2 +- requirements.txt | 2 +- requirements_frozen.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/alyx/data/models.py b/alyx/data/models.py index 964bc57f..f8e42909 100644 --- a/alyx/data/models.py +++ b/alyx/data/models.py @@ -452,7 +452,7 @@ def data_url(self): root = self.data_repository.data_url if not root: return None - from one.alf.files import add_uuid_string + from one.alf.path import add_uuid_string return root + add_uuid_string(self.relative_path, self.dataset.pk).as_posix() def save(self, *args, **kwargs): diff --git a/alyx/data/transfers.py b/alyx/data/transfers.py index 010e1c93..ff9eadf4 100644 --- a/alyx/data/transfers.py +++ b/alyx/data/transfers.py @@ -9,7 +9,7 @@ from django.db.models import Case, When, Count, Q, F import globus_sdk import numpy as np -from one.alf.files import add_uuid_string, folder_parts +from one.alf.path import add_uuid_string, folder_parts from one.registration import get_dataset_type from one.alf.spec import QC diff --git a/requirements.txt b/requirements.txt index bfa1e696..b8c6885d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,4 @@ python-magic pytz structlog>=21.5.0 webdavclient3 -ONE-api>=2.7 +ONE-api>=2.11 diff --git a/requirements_frozen.txt b/requirements_frozen.txt index 0016983c..ba9f723d 100644 --- a/requirements_frozen.txt +++ b/requirements_frozen.txt @@ -52,7 +52,7 @@ matplotlib==3.7.5 mccabe==0.7.0 numba==0.58.1 numpy==1.24.4 -ONE-api==2.9.1 +ONE-api==2.11.0 packaging==24.1 pandas==2.0.3 pillow==10.4.0 From 17adae4c2644f2f2839349bb7c6c7e89ab89c910 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Fri, 1 Nov 2024 21:58:21 +0200 Subject: [PATCH 03/13] Do not cascade delete insertions when chronic insertion deleted --- alyx/experiments/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alyx/experiments/models.py b/alyx/experiments/models.py index db42de59..2e7be06f 100644 --- a/alyx/experiments/models.py +++ b/alyx/experiments/models.py @@ -109,7 +109,7 @@ class ProbeInsertion(BaseModel): verbose_name='last updated') datasets = models.ManyToManyField('data.Dataset', blank=True, related_name='probe_insertion') chronic_insertion = models.ForeignKey(ChronicInsertion, blank=True, null=True, - on_delete=models.CASCADE, related_name='probe_insertion') + on_delete=models.SET_NULL, related_name='probe_insertion') def __str__(self): return "%s %s" % (self.name, str(self.session)) From 0d8ba0e7d5959285d06d00531bdb166add6ff41a Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Sat, 2 Nov 2024 00:14:19 +0200 Subject: [PATCH 04/13] file_size 
and number fields uint in cache table --- alyx/misc/management/commands/one_cache.py | 23 +++++++++++----------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/alyx/misc/management/commands/one_cache.py b/alyx/misc/management/commands/one_cache.py index 4b197542..3e73aa9b 100644 --- a/alyx/misc/management/commands/one_cache.py +++ b/alyx/misc/management/commands/one_cache.py @@ -11,6 +11,7 @@ import tempfile import os +import numpy as np import pandas as pd import pyarrow.parquet as pq import pyarrow as pa @@ -351,20 +352,16 @@ def generate_sessions_frame(tags=None) -> pd.DataFrame: logger.debug(f'Raw session frame = {getsizeof(df) / 1024**2} MiB') # Rename, sort fields df['all_projects'] = df['all_projects'].map(lambda x: ','.join(filter(None, set(x)))) + # task_protocol & projects columns may be empty; ensure None -> '' + # id UUID objects -> str; not supported by parquet df = ( (df .rename(lambda x: x.split('__')[0], axis=1) .rename({'start_time': 'date', 'all_projects': 'projects'}, axis=1) .dropna(subset=['number', 'date', 'subject', 'lab']) # Remove dud or base sessions - .sort_values(['date', 'subject', 'number'], ascending=False)) + .sort_values(['date', 'subject', 'number'], ascending=False) + .astype({'number': np.uint16, 'task_protocol': str, 'projects': str, 'id': str})) ) - df['number'] = df['number'].astype(int) # After dropping nans we can convert number to int - # These columns may be empty; ensure None -> '' - for col in ('task_protocol', 'projects'): - df[col] = df[col].astype(str) - - # Convert UUID objects to str: not supported by parquet - df['id'] = df['id'].astype(str) df.set_index('id', inplace=True) logger.debug(f'Final session frame = {getsizeof(df) / 1024 ** 2:.1f} MiB') @@ -404,7 +401,7 @@ def generate_datasets_frame(tags=None, batch_size=100_000) -> pd.DataFrame: f'returning empty dataframe') return (pd.DataFrame(columns=DATASETS_COLUMNS) .set_index(['eid', 'id']) - .astype({'qc': QC_TYPE})) + .astype({'qc': QC_TYPE, 'file_size': np.uint64})) # fields to keep from Dataset table fields = ( @@ -418,7 +415,10 @@ def generate_datasets_frame(tags=None, batch_size=100_000) -> pd.DataFrame: for i in tqdm(paginator.page_range): data = paginator.get_page(i) current_qs = data.object_list - df = pd.DataFrame.from_records(current_qs.values(*fields)).rename(fields_map, axis=1) + df = (pd.DataFrame + .from_records(current_qs.values(*fields)) + .rename(fields_map, axis=1) + .astype({'id': str, 'eid': str, 'file_size': np.uint64})) df['exists'] = True # relative_path @@ -426,8 +426,7 @@ def generate_datasets_frame(tags=None, batch_size=100_000) -> pd.DataFrame: zipped = zip(df.pop('collection'), revision, df.pop('name')) df['rel_path'] = ['/'.join(filter(None, x)) for x in zipped] - # Convert UUIDs to str: not supported by parquet - df[['id', 'eid']] = df[['id', 'eid']].astype(str) + # UUIDs converted to str: not supported by parquet df = df.set_index(['eid', 'id']) # Convert QC enum int to pandas category From 00f3db279ae7b1ee02054514555dc6264a4108c7 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Thu, 14 Nov 2024 13:01:07 +0200 Subject: [PATCH 05/13] Nullable session field in Task serializer; lab name ALF validation --- ..._alter_probeinsertion_chronic_insertion.py | 19 +++++++++++++++++++ alyx/experiments/models.py | 4 ++-- alyx/jobs/admin.py | 11 +++++++---- alyx/jobs/serializers.py | 2 +- alyx/misc/migrations/0011_alter_lab_name.py | 19 +++++++++++++++++++ alyx/misc/models.py | 8 ++++++-- 6 files changed, 54 insertions(+), 9 deletions(-) create mode 
100644 alyx/experiments/migrations/0014_alter_probeinsertion_chronic_insertion.py create mode 100644 alyx/misc/migrations/0011_alter_lab_name.py diff --git a/alyx/experiments/migrations/0014_alter_probeinsertion_chronic_insertion.py b/alyx/experiments/migrations/0014_alter_probeinsertion_chronic_insertion.py new file mode 100644 index 00000000..38ef3daa --- /dev/null +++ b/alyx/experiments/migrations/0014_alter_probeinsertion_chronic_insertion.py @@ -0,0 +1,19 @@ +# Generated by Django 5.1.2 on 2024-11-14 11:01 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('experiments', '0013_remove_trajectoryestimate_unique_trajectory_per_provenance_and_more'), + ] + + operations = [ + migrations.AlterField( + model_name='probeinsertion', + name='chronic_insertion', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='probe_insertion', to='experiments.chronicinsertion'), + ), + ] diff --git a/alyx/experiments/models.py b/alyx/experiments/models.py index 2e7be06f..dfe04724 100644 --- a/alyx/experiments/models.py +++ b/alyx/experiments/models.py @@ -108,8 +108,8 @@ class ProbeInsertion(BaseModel): auto_datetime = models.DateTimeField(auto_now=True, blank=True, null=True, verbose_name='last updated') datasets = models.ManyToManyField('data.Dataset', blank=True, related_name='probe_insertion') - chronic_insertion = models.ForeignKey(ChronicInsertion, blank=True, null=True, - on_delete=models.SET_NULL, related_name='probe_insertion') + chronic_insertion = models.ForeignKey(ChronicInsertion, blank=True, on_delete=models.SET_NULL, + null=True, related_name='probe_insertion') def __str__(self): return "%s %s" % (self.name, str(self.session)) diff --git a/alyx/jobs/admin.py b/alyx/jobs/admin.py index 9195a718..d78cfcf5 100644 --- a/alyx/jobs/admin.py +++ b/alyx/jobs/admin.py @@ -26,16 +26,19 @@ class TaskAdmin(BaseAdmin): def has_change_permission(self, request, obj=None): if request.user.is_superuser: return True - if obj: + if obj and obj.session: return obj.session.lab.name in request.user.lab + else: + return False def session_projects(self, obj): - if obj.session.projects is not None: - return obj.session.projects.name + session = obj.session + if session and session.projects is not None: + return session.projects.name session_projects.short_description = 'projects' def session_task_protocol(self, obj): - return obj.session.task_protocol + return obj.session.task_protocol if obj.session else None session_task_protocol.short_description = 'task_protocol' def session_str(self, obj): diff --git a/alyx/jobs/serializers.py b/alyx/jobs/serializers.py index 552b376c..c3ed820d 100644 --- a/alyx/jobs/serializers.py +++ b/alyx/jobs/serializers.py @@ -12,7 +12,7 @@ class TaskSerializer(serializers.ModelSerializer): ) session = serializers.SlugRelatedField( read_only=False, required=False, slug_field='id', many=False, - queryset=Session.objects.all() + queryset=Session.objects.all(), allow_null=True ) data_repository = serializers.SlugRelatedField( read_only=False, required=False, slug_field='name', many=False, diff --git a/alyx/misc/migrations/0011_alter_lab_name.py b/alyx/misc/migrations/0011_alter_lab_name.py new file mode 100644 index 00000000..bb49e575 --- /dev/null +++ b/alyx/misc/migrations/0011_alter_lab_name.py @@ -0,0 +1,19 @@ +# Generated by Django 5.1.2 on 2024-11-14 11:01 + +import django.core.validators +from django.db import migrations, models + + 
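+# NB: adding a validator does not check or rewrite existing rows; any lab name
+# not matching '^\w+$' must be renamed manually (validators only run on
+# full_clean(), e.g. via the admin forms and REST serializers).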
+class Migration(migrations.Migration): + + dependencies = [ + ('misc', '0010_alter_lab_timezone'), + ] + + operations = [ + migrations.AlterField( + model_name='lab', + name='name', + field=models.CharField(max_length=255, unique=True, validators=[django.core.validators.RegexValidator('^\\w+$', 'Lab name must only contain letters, numbers, and underscores.')]), + ), + ] diff --git a/alyx/misc/models.py b/alyx/misc/models.py index 8236a28f..7671abc2 100644 --- a/alyx/misc/models.py +++ b/alyx/misc/models.py @@ -11,13 +11,14 @@ from django.contrib.auth import get_user_model from django.db import models from django.conf import settings +from django.core import validators from django.contrib.auth.models import AbstractUser from django.contrib.contenttypes.fields import GenericForeignKey from django.contrib.contenttypes.models import ContentType from django.core.files.uploadedfile import InMemoryUploadedFile from django.utils import timezone -from alyx.base import BaseModel, modify_fields +from alyx.base import BaseModel, modify_fields, ALF_SPEC from alyx.settings import TIME_ZONE, UPLOADED_IMAGE_WIDTH, DEFAULT_LAB_PK @@ -70,7 +71,10 @@ def get_allowed_subjects(self, subjects_queryset=None): class Lab(BaseModel): - name = models.CharField(max_length=255, unique=True) + labname_validator = validators.RegexValidator( + f"^{ALF_SPEC['lab']}$", + "Lab name must only contain letters, numbers, and underscores.") + name = models.CharField(max_length=255, unique=True, validators=[labname_validator]) institution = models.CharField(max_length=255, blank=True) address = models.CharField(max_length=255, blank=True) timezone = models.CharField( From 728a52349e15358acb13afce6d6a6e62e7ac6511 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Thu, 14 Nov 2024 14:14:38 +0200 Subject: [PATCH 06/13] Resolves #875 --- README.md | 4 +-- alyx/alyx/settings_template.py | 2 +- docs/gettingstarted.md | 54 ++++++++++++++++++++++++++++++++-- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 4a1a58a3..c18aff6e 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Database for experimental neuroscience laboratories -Documentation: http://alyx.readthedocs.io +Documentation: [Installation and getting started](http://alyx.readthedocs.io), [Alyx usage guide](https://docs.google.com/document/d/1cx3XLZiZRh3lUzhhR_p65BggEqTKpXHUDkUDagvf9Kc/edit?usp=sharing) ## Installation @@ -16,7 +16,7 @@ this setup will work on other systems. Assumptions made are that you have sudo p - installing the Python/Django environment - serving a local database - registering local data -- accessing local data using [ONE]() +- accessing local data using [ONE](https://one.internationalbrainlab.org) ## Contribution diff --git a/alyx/alyx/settings_template.py b/alyx/alyx/settings_template.py index 7d841e9a..f3343e16 100644 --- a/alyx/alyx/settings_template.py +++ b/alyx/alyx/settings_template.py @@ -123,7 +123,7 @@ # See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/ # SECURITY WARNING: don't run with debug turned on in production! -DEBUG = False +DEBUG = True # Production settings: if not DEBUG: diff --git a/docs/gettingstarted.md b/docs/gettingstarted.md index 28f4aee1..eea28505 100644 --- a/docs/gettingstarted.md +++ b/docs/gettingstarted.md @@ -64,6 +64,8 @@ NB: the password above is the postgres database user password. It is used by Dja You can then visit http://localhost:8000/admin, connect as `admin:admin` (ie. username admin and password admin) and update your admin interface password. 
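+
+The admin password can also be reset from the command line using Django's built-in `changepassword` command (assuming the virtual environment is active):
+```shell
+python alyx/manage.py changepassword admin
+```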
+> [!WARNING]
+> Alyx is by default in debug mode, meaning it is not safe to run on the open Web. To run securely, open the `alyx/alyx/settings.py` file and set `DEBUG=False`. This enables HTTPS redirection (SSL certificates required) and various cross-site scripting protections. Debug mode is adequate if running Alyx on a local network or a secure institute intranet.
 
 ### macOS
 
@@ -91,7 +93,8 @@ You can then visit http://localhost:8000/admin, connect as `admin:admin` (ie. us
 * To run the development server, type `python alyx/manage.py runserver`
 * Go to `http://localhost:8000/admin/`
 
-
+> [!WARNING]
+> Alyx is by default in debug mode, meaning it is not safe to run on the open Web. To run securely, open the `alyx/alyx/settings.py` file and set `DEBUG=False`. This enables HTTPS redirection (SSL certificates required) and various cross-site scripting protections. Debug mode is adequate if running Alyx on a local network or a secure institute intranet.
 
 ## Interaction with the database
 
@@ -103,7 +106,7 @@ There are 3 main ways to interact with the database, listed below:
 
 | **Admin Web Page** | web client | anyone | Manual way to input data in the database. This is the preferred method for users needing to add/amend/correct metadata related to subjects. For the local database, this is accessible here: http://localhost:8000/admin.
 | **REST** | web client | anyone | Programmatic way to input data, typically by acquisition software using a dedicated Alyx client [ONE](https://github.com/int-brain-lab/ONE) (Python) or [ALyx-matlab](https://github.com/cortex-lab/alyx-matlab) (Matlab).
 
-
+For detailed information on using the Alyx admin Web interface, see [this Alyx usage guide](https://docs.google.com/document/d/1cx3XLZiZRh3lUzhhR_p65BggEqTKpXHUDkUDagvf9Kc/edit?usp=sharing).
 
 ### Create an experiment, register data and access it locally
 
@@ -208,4 +211,51 @@ print(local_files)
 
 We went straight to the point here, which was to create a session and register data, to go further consult the [One documentation](https://int-brain-lab.github.io/ONE/), in the section "Using one in Alyx".
 
+## Backing up the database
+See [this section](https://docs.google.com/document/d/1cx3XLZiZRh3lUzhhR_p65BggEqTKpXHUDkUDagvf9Kc/edit?tab=t.0#heading=h.dibimc48a9xl) in the Alyx user guide on how to back up and restore the database.
+
+## Updating the database
+The database should be updated each time there is a new Alyx release.
+
+1. Pull the changes from GitHub
+```shell
+cd alyx/ # ensure you are within the Alyx git repository
+git stash # stash any local changes (if required)
+git checkout master # ensure you're on the main branch
+git pull # fetch and merge any new code changes
+git stash pop # restore any local changes (if required)
+```
+
+2. Activate environment - install any new requirements
+```shell
+source ./alyxvenv/bin/activate
+pip install -r requirements.txt
+```
+
+3. Update the database if any scheme changes - we expect no migrations
+```shell
+cd alyx
+./manage.py makemigrations
+./manage.py migrate
+```
+
+4. If new fixtures load in the database:
+```shell
+../scripts/load-init-fixtures.sh
+```
+
+5. If new tables were added, change the postgres permissions
+```shell
+./manage.py set_db_permissions
+./manage.py set_user_permissions
+```
+
+6. If there were updates to the Django version
+```shell
+./manage.py collectstatic --no-input
+```
+
+7.
Restart the Apache server +```shell +sudo service apache2 restart +``` \ No newline at end of file From 65a78ebb5632eca159ff019ba72033baaf8b0010 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Thu, 14 Nov 2024 14:47:40 +0200 Subject: [PATCH 07/13] Improve task change permission check --- alyx/jobs/admin.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/alyx/jobs/admin.py b/alyx/jobs/admin.py index d78cfcf5..f60f02e7 100644 --- a/alyx/jobs/admin.py +++ b/alyx/jobs/admin.py @@ -4,6 +4,7 @@ DropdownFilter, ChoiceDropdownFilter, RelatedDropdownFilter) from jobs.models import Task +from misc.models import Lab from alyx.base import BaseAdmin, get_admin_url @@ -26,8 +27,15 @@ class TaskAdmin(BaseAdmin): def has_change_permission(self, request, obj=None): if request.user.is_superuser: return True - if obj and obj.session: - return obj.session.lab.name in request.user.lab + if obj: + if obj.session: + # Check if session user or member of the same lab + is_session_user = obj.session.users.users.contains(request.user) + return is_session_user or obj.session.lab.name in request.user.lab + else: + # Check if user is member of the lab associated with the task repository + labs = request.user.lab_id() + return any(labs.filter(repositories=obj.data_repository)) else: return False From 15b308dc835aa0163508b249bb26e8a2df7510b1 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Thu, 14 Nov 2024 14:56:47 +0200 Subject: [PATCH 08/13] Bump version --- alyx/alyx/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alyx/alyx/__init__.py b/alyx/alyx/__init__.py index 1069adb2..4703ed28 100644 --- a/alyx/alyx/__init__.py +++ b/alyx/alyx/__init__.py @@ -1 +1 @@ -VERSION = __version__ = '3.0.3' +VERSION = __version__ = '3.1.3' From a8aa9ea1c6163002bf9392d2352746803c360684 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Thu, 14 Nov 2024 18:50:07 +0200 Subject: [PATCH 09/13] Tests --- README.md | 4 +- alyx/actions/serializers.py | 7 +++ alyx/data/models.py | 2 +- alyx/data/serializers.py | 11 +++- alyx/data/tests.py | 108 +++++++++++++++++++++++++++++++++++- alyx/data/tests_rest.py | 8 +-- alyx/jobs/admin.py | 1 - 7 files changed, 129 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index c18aff6e..3d771524 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,6 @@ this setup will work on other systems. 
Assumptions made are that you have sudo p - `./manage.py test` test with migrations (recommended if model changes) - NB: When running tests ensure `DEBUG = True` in the settings.py file (specifically `SECURE_SSL_REDIRECT = True` causes REST tests to fail) -``` -$ /manage.py test -n +```shell +./manage.py test -n ``` diff --git a/alyx/actions/serializers.py b/alyx/actions/serializers.py index 0809e779..75da211d 100644 --- a/alyx/actions/serializers.py +++ b/alyx/actions/serializers.py @@ -100,6 +100,13 @@ class SessionDatasetsSerializer(serializers.ModelSerializer): default_revision = serializers.CharField(source='default_dataset') qc = BaseSerializerEnumField(required=False) + def to_representation(self, instance): + """Override the default to_representation method to null the revision field.""" + representation = super().to_representation(instance) + if representation.get('revision') is None: + representation['revision'] = '' + return representation + class Meta: list_serializer_class = FilterDatasetSerializer model = Dataset diff --git a/alyx/data/models.py b/alyx/data/models.py index f8e42909..6b6674c0 100644 --- a/alyx/data/models.py +++ b/alyx/data/models.py @@ -386,7 +386,7 @@ def __str__(self): def save(self, *args, **kwargs): # when a dataset is saved / created make sure the probe insertion is set in the reverse m2m super(Dataset, self).save(*args, **kwargs) - if self.collection is None: + if not self.collection: return self.clean_fields() # Validate collection field from experiments.models import ProbeInsertion, FOV diff --git a/alyx/data/serializers.py b/alyx/data/serializers.py index c455611f..57f24c0d 100644 --- a/alyx/data/serializers.py +++ b/alyx/data/serializers.py @@ -141,7 +141,7 @@ class DatasetSerializer(serializers.HyperlinkedModelSerializer): hash = serializers.CharField(required=False, allow_null=True) version = serializers.CharField(required=False, allow_null=True) file_size = serializers.IntegerField(required=False, allow_null=True) - collection = serializers.CharField(required=False, allow_null=True) + collection = serializers.CharField(required=False, allow_blank=True, allow_null=True) default_dataset = serializers.BooleanField(required=False, allow_null=True) public = serializers.ReadOnlyField() protected = serializers.ReadOnlyField() @@ -178,7 +178,7 @@ def get_experiment_number(self, obj): def create(self, validated_data): # Get out some useful info # revision = validated_data.get('revision', None) - collection = validated_data.get('collection', None) + collection = validated_data.get('collection', '') name = validated_data.get('name', None) default = validated_data.get('default_dataset', None) session = validated_data.get('session', None) @@ -213,6 +213,13 @@ def create(self, validated_data): return super(DatasetSerializer, self).create(validated_data) + def to_representation(self, instance): + """Override the default to_representation method to null the revision field.""" + representation = super().to_representation(instance) + if representation.get('revision') is None: + representation['revision'] = '' + return representation + class Meta: model = Dataset fields = ('url', 'name', 'created_by', 'created_datetime', diff --git a/alyx/data/tests.py b/alyx/data/tests.py index 4c0c80aa..5f4cced0 100644 --- a/alyx/data/tests.py +++ b/alyx/data/tests.py @@ -1,5 +1,5 @@ from unittest import mock -from pathlib import Path +from pathlib import PurePosixPath from uuid import uuid4 from datetime import datetime, timedelta @@ -8,12 +8,15 @@ from django.db import 
transaction from django.db.utils import IntegrityError from django.db.models import ProtectedError +from rest_framework.response import Response +from one.alf.path import add_uuid_string from data.management.commands import files from data.models import Dataset, DatasetType, Tag, Revision, DataRepository, FileRecord from subjects.models import Subject from actions.models import Session from misc.models import Lab +from data import transfers from data.transfers import get_dataset_type @@ -201,4 +204,105 @@ def _new_delete_client(self, _, gid, **kwargs): @staticmethod def _dataset_uuid_name(dataset): - return f'{Path(dataset.name).stem}.{dataset.pk}{Path(dataset.name).suffix}' + return add_uuid_string(dataset.name, dataset.pk).as_posix() + + +class TestTransfers(TestCase): + """Tests for the data.transfers module.""" + + def setUp(self): + """Create some data repositories and file records to clean up.""" + # Two of these are 'large' datasets that will be removed + dtypes = ['ephysData.raw.ap', 'imaging.frames', 'foo.bar.baz'] + self.dtypes = [DatasetType.objects.create(name=name) for name in dtypes] + # Create two labs + self.labs = [Lab.objects.create(name=f'lab{i}') for i in range(2)] + # Create four repos + repo1 = DataRepository.objects.create( + name='lab0_local0', lab=self.labs[0], globus_is_personal=True, + globus_endpoint_id=uuid4(), globus_path='/mnt/foo/') + repo2 = DataRepository.objects.create( + name='lab0_local1', lab=self.labs[0], globus_is_personal=True, + globus_endpoint_id=uuid4(), globus_path='/mnt/foo/') + repo3 = DataRepository.objects.create( + name='lab1_local', lab=self.labs[1], globus_is_personal=True, + globus_endpoint_id=uuid4(), globus_path='/mnt/foo/') + # NB: name must contain 'flatiron'! + repo_main = DataRepository.objects.create( + name='flatiron', globus_is_personal=False, + globus_endpoint_id=uuid4(), globus_path='/mnt/foo/') + # Create one session per lab + self.subjects = [ + Subject.objects.create( + nickname=f'subject{i}', lab=lab) for i, lab in enumerate(self.labs)] + sessions = [Session.objects.create( + subject=sub, number=1, lab=lab) for lab, sub in zip(self.labs, self.subjects)] + # Create datasets and file records + self.dset_names = ['ephysData.raw.ap.bin', 'imaging.frames.tar.bz2', 'foo.bar.baz'] + self.dsets = [] + for session in sessions: # for one session in each lab, create one of each dataset + self.dsets.extend( + Dataset.objects.create(name=name, session=session, + dataset_type=next(x for x in self.dtypes if x.name in name)) + for name in self.dset_names) + + # Create file record on each lab's local server and main repo + session = 'subject/2020-01-01/001' + self.records = [] # All file records + for d in self.dsets: + for i, repo in enumerate((repo1, repo2, repo3, repo_main)): + if repo.globus_is_personal is False: + rel_path = f'{session}/{TestManagementFiles._dataset_uuid_name(d)}' + if i == 0: + rel_path = 'Data2/' + rel_path + if i == 1: + rel_path = '/' + rel_path + elif repo.lab != d.session.lab: + continue # Don't create file record for dataset if session lab different + else: + rel_path = f'{session}/{d.name}' + self.records.append( + FileRecord.objects.create( + relative_path=rel_path, exists=True, dataset=d, data_repository=repo) + ) + + def test_get_absolute_path(self): + expected = '/mnt/foo/subject/2020-01-01/001/ephysData.raw.ap.bin' + self.assertEqual(expected, transfers._get_absolute_path(self.records[0])) + expected = '/mnt/foo/subject/2020-01-01/001/ephysData.raw.ap.bin' + self.assertEqual(expected, 
transfers._get_absolute_path(self.records[1])) + + def test_get_name_collection_revision(self): + relative_path = PurePosixPath(self.records[0].relative_path) + info, resp = transfers._get_name_collection_revision( + relative_path.name, relative_path.parent.as_posix()) + self.assertIsNone(resp) + expected = { + 'lab': '', 'subject': 'subject', 'date': '2020-01-01', 'number': '001', + 'collection': '', 'revision': '', 'filename': 'ephysData.raw.ap.bin', + 'full_path': 'subject/2020-01-01/001/ephysData.raw.ap.bin', + 'rel_dir_path': 'subject/2020-01-01/001'} + self.assertDictEqual(info, expected) + relative_path = relative_path.parent / 'alf' / '#2020-10-01#' / relative_path.name + expected.update( + {'collection': 'alf', 'revision': '2020-10-01', + 'full_path': relative_path.as_posix()} + ) + info, resp = transfers._get_name_collection_revision( + relative_path.name, relative_path.parent.as_posix()) + self.assertIsNone(resp) + self.assertDictEqual(info, expected) + + relative_path = relative_path.parent / 'invalid' / relative_path.name + info, resp = transfers._get_name_collection_revision( + relative_path.name, relative_path.parent.as_posix()) + self.assertIsNone(info) + self.assertIsInstance(resp, Response) + self.assertEqual(resp.status_code, 400) + self.assertIn('Invalid ALF path', resp.data['detail']) + info, resp = transfers._get_name_collection_revision( + relative_path.name, 'subject/1-1-03/1/@lf') + self.assertIsNone(info) + self.assertIsInstance(resp, Response) + self.assertEqual(resp.status_code, 400) + self.assertIn('Invalid ALF path', resp.data['detail']) diff --git a/alyx/data/tests_rest.py b/alyx/data/tests_rest.py index 43e3a558..c7e8d2c0 100644 --- a/alyx/data/tests_rest.py +++ b/alyx/data/tests_rest.py @@ -142,8 +142,8 @@ def test_dataset(self): r = self.post(reverse('dataset-list'), data) self.ar(r, 201) # Check collection and revision have been set to default values - self.assertEqual(r.data['revision'], None) - self.assertEqual(r.data['collection'], None) + self.assertEqual(r.data['revision'], '') + self.assertEqual(r.data['collection'], '') # Check that it has been set as the default dataset self.assertEqual(r.data['default_dataset'], True) # Check QC value is NOT_SET by default @@ -170,7 +170,7 @@ def test_dataset(self): r = self.post(reverse('dataset-list'), data) self.ar(r, 201) - self.assertEqual(r.data['revision'], None) + self.assertEqual(r.data['revision'], '') self.assertEqual(r.data['collection'], data['collection']) self.assertEqual(r.data['default_dataset'], True) self.assertEqual(r.data['qc'], 'PASS') @@ -189,7 +189,7 @@ def test_dataset(self): self.assertEqual(r['default_dataset'], False) # Make sure if you specify the default dataset flag to false it is indeed false - data['collection'] = None + data['collection'] = '' data['default_dataset'] = False r = self.post(reverse('dataset-list'), data) self.ar(r, 201) diff --git a/alyx/jobs/admin.py b/alyx/jobs/admin.py index f60f02e7..7c2dad0b 100644 --- a/alyx/jobs/admin.py +++ b/alyx/jobs/admin.py @@ -4,7 +4,6 @@ DropdownFilter, ChoiceDropdownFilter, RelatedDropdownFilter) from jobs.models import Task -from misc.models import Lab from alyx.base import BaseAdmin, get_admin_url From 310916a2df23958e042fb27514d99701ab568b04 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Fri, 15 Nov 2024 13:04:35 +0200 Subject: [PATCH 10/13] Tasks management command --- alyx/data/management/commands/files.py | 2 +- alyx/jobs/management/__init__.py | 0 alyx/jobs/management/commands/__init__.py | 0 
 alyx/jobs/management/commands/tasks.py     | 67 +++++++++++++++++++++
 alyx/jobs/tests.py                         | 73 +++++++++++++++++++++++
 docs/gettingstarted.md                     | 47 +--------------
 scripts/auto-update.sh                     | 12 ++--
 7 files changed, 151 insertions(+), 50 deletions(-)
 create mode 100644 alyx/jobs/management/__init__.py
 create mode 100644 alyx/jobs/management/commands/__init__.py
 create mode 100644 alyx/jobs/management/commands/tasks.py

diff --git a/alyx/data/management/commands/files.py b/alyx/data/management/commands/files.py
index 5878af22..61962d85 100644
--- a/alyx/data/management/commands/files.py
+++ b/alyx/data/management/commands/files.py
@@ -113,7 +113,7 @@ def handle(self, *args, **options):
                     "files on local server. Exiting now."))
                 return
             if before is None:
-                self.stdout.write(self.style.ERROR("Date beforeshould be specified: use the "
+                self.stdout.write(self.style.ERROR("Date before should be specified: use the "
                                                    "--before=yyyy-mm-dd flag. Exiting now."))
                 return
             dtypes = ['ephysData.raw.ap', 'ephysData.raw.lf', 'ephysData.raw.nidq',
diff --git a/alyx/jobs/management/__init__.py b/alyx/jobs/management/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/alyx/jobs/management/commands/__init__.py b/alyx/jobs/management/commands/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/alyx/jobs/management/commands/tasks.py b/alyx/jobs/management/commands/tasks.py
new file mode 100644
index 00000000..9cabd8e8
--- /dev/null
+++ b/alyx/jobs/management/commands/tasks.py
@@ -0,0 +1,67 @@
+from datetime import date
+
+from django.core.management import BaseCommand
+
+from jobs.models import Task
+
+
+class Command(BaseCommand):
+    """
+    ./manage.py tasks cleanup --before 2020-01-01 --status=20 --dry
+    ./manage.py tasks cleanup --status=Waiting --dry
+    ./manage.py tasks cleanup --status=~Complete --limit=200
+    """
+    help = 'Manage tasks'
+
+    def add_arguments(self, parser):
+        parser.add_argument('action', help='Action')
+        parser.add_argument('--status', help='Only delete tasks with this status')
+        parser.add_argument('--dry', action='store_true', help='dry run')
+        parser.add_argument('--limit', help='limit to a maximum number of tasks')
+        parser.add_argument('--before', help='select tasks before a given date')
+
+    def handle(self, *args, **options):
+        action = options.get('action')
+
+        dry = options.get('dry')
+        before = options.get('before') or date.today().isoformat()
+
+        if action != 'cleanup':
+            raise ValueError(f'Action "{action}" not recognized')
+
+        before = date.fromisoformat(before)  # validate
+        tasks = Task.objects.filter(datetime__date__lte=before)
+        # Filter status
+        if status := options.get('status'):
+            if status.startswith('~'):
+                status = status[1:]
+                fcn = tasks.exclude
+            else:
+                fcn = tasks.filter
+            if status.isnumeric():
+                status = int(status)
+                if status not in {s[0] for s in Task.STATUS_DATA_SOURCES}:
+                    raise ValueError(f'Status {status} not recognized')
+            else:  # convert status string to int
+                name = status
+                status = next(
+                    (i for i, s in Task.STATUS_DATA_SOURCES if s.casefold() == name.casefold()),
+                    None)
+                if status is None:
+                    raise ValueError(f'Status "{name}" not recognized')
+            tasks = fcn(status=status)
+
+        # Limit
+        if (limit := options.get('limit')) is not None:
+            limit = int(limit)
+            tasks = tasks.order_by('datetime')[:limit]
+
+        self.stdout.write(self.style.SUCCESS(f'Found {tasks.count()} tasks to delete'))
+        if not dry:
+            if limit is None:
+                tasks.delete()
+            else:
+                pks = tasks.values_list('pk', flat=True)
+                Task.objects.filter(pk__in=pks).delete()
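+                # NB: the pk round-trip above is required because Django does
+                # not allow delete() to be called on a sliced queryset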
self.stdout.write(self.style.SUCCESS('Tasks deleted')) + diff --git a/alyx/jobs/tests.py b/alyx/jobs/tests.py index a7892291..1ae169c5 100644 --- a/alyx/jobs/tests.py +++ b/alyx/jobs/tests.py @@ -1,10 +1,15 @@ +from unittest.mock import patch + from django.contrib.auth import get_user_model from django.urls import reverse from django.core.management import call_command +from datetime import datetime, timedelta from actions.models import Session from alyx.base import BaseTests from data.models import DataRepository +from jobs.management.commands import tasks +from jobs.models import Task class APISubjectsTests(BaseTests): @@ -32,3 +37,71 @@ def test_brain_regions_rest_filter(self): 'arguments': {'titi': 'toto', 'tata': 'tutu'}, 'data_repository': 'myrepo'} rep = self.post(reverse('tasks-list'), task_dict) self.assertEqual(rep.status_code, 201) + + +class TestManagementTasks(BaseTests): + """Tests for the tasks management command.""" + + def setUp(self) -> None: + """Create some tasks to clean up.""" + self.n_tasks = 100 + self.command = tasks.Command() + base = datetime.today() + # Create equally-spaced task dates + date_list = [base - timedelta(days=x) for x in range(self.n_tasks)] + with patch('django.db.models.fields.timezone.now') as timezone_mock: + for i, date in enumerate(date_list): + timezone_mock.return_value = date + status = Task.STATUS_DATA_SOURCES[i % len(Task.STATUS_DATA_SOURCES)][0] + Task.objects.create(name=f'task_{i}', status=status) + + def test_cleanup(self): + """Test for cleanup action.""" + # First run in dry mode, expect submit_delete to not be called + n = self.n_tasks - 10 + before_date = (datetime.today() - timedelta(days=n)).date() + with patch.object(self.command.stdout, 'write') as stdout_mock: + self.command.handle(action='cleanup', before=str(before_date), dry=True) + stdout_mock.assert_called() + self.assertIn(f'Found {10} tasks to delete', stdout_mock.call_args.args[0]) + # All tasks should still exist + self.assertEqual(self.n_tasks, Task.objects.count()) + + # Without dry flag, tasks should be removed + self.command.handle(action='cleanup', before=str(before_date)) + # All tasks should still exist + self.assertEqual(n, Task.objects.count()) + self.assertEqual(0, Task.objects.filter(datetime__date__lte=before_date).count()) + + # With status filter as int + n = Task.objects.count() - Task.objects.filter(status=20).count() + self.command.handle(action='cleanup', status='20') + self.assertEqual(n, Task.objects.count()) + self.assertEqual(0, Task.objects.filter(status=20).count()) + + # With status filter as string + n = Task.objects.count() - Task.objects.filter(status=40).count() + self.command.handle(action='cleanup', status='Errored') + self.assertEqual(n, Task.objects.count()) + self.assertEqual(0, Task.objects.filter(status=40).count()) + + # With status filter as string and ~ + n_days = self.n_tasks - 20 + before_date = (datetime.today() - timedelta(days=n_days)).date() + n = Task.objects.count() + n -= Task.objects.exclude(status=45).filter(datetime__date__lte=before_date).count() + self.command.handle(action='cleanup', status='~Abandoned', before=str(before_date)) + self.assertEqual(n, Task.objects.count()) + n_tasks = Task.objects.exclude(status=45).filter(datetime__date__lte=before_date).count() + self.assertEqual(0, n_tasks) + self.assertTrue(Task.objects.filter(status=45, datetime__date__lte=before_date).count()) + + # With status filter as int and ~ with limit + n = Task.objects.exclude(status=60).count() - 5 + 
self.command.handle(action='cleanup', status='~60', limit='5') + self.assertEqual(n, Task.objects.exclude(status=60).count()) + + # Error handling + self.assertRaises(ValueError, self.command.handle, action='cleanup', status='NotAStatus') + self.assertRaises(ValueError, self.command.handle, action='cleanup', status='-1000') + self.assertRaises(ValueError, self.command.handle, action='NotAnAction') diff --git a/docs/gettingstarted.md b/docs/gettingstarted.md index eea28505..c8c7054f 100644 --- a/docs/gettingstarted.md +++ b/docs/gettingstarted.md @@ -212,50 +212,7 @@ print(local_files) We went straight to the point here, which was to create a session and register data, to go further consult the [One documentation](https://int-brain-lab.github.io/ONE/), in the section "Using one in Alyx". ## Backing up the database -See [this section](https://docs.google.com/document/d/1cx3XLZiZRh3lUzhhR_p65BggEqTKpXHUDkUDagvf9Kc/edit?tab=t.0#heading=h.dibimc48a9xl) in the Alyx user guide on how to back up and restore the database. +See [this section](https://docs.google.com/document/d/1cx3XLZiZRh3lUzhhR_p65BggEqTKpXHUDkUDagvf9Kc/edit?tab=t.0#heading=h.dibimc48a9xl) in the Alyx user guide on how to back up and restore the database. There are scripts in `alyx/scripts/templates/` for exporting the database to a sql file and importing from said file. ## Updating the database -The database should be updated each time there is a new Alyx release. - -1. Pull the changes from GitHub -```shell -cd alyx/ # ensure you are within the Alyx git repository -git stash # stash any local changes (if required) -git checkout master # ensure you're on the main branch -git pull # fetch and merge any new code changes -git stash pop # restore any local changes (if required) -``` - -2. Activate environment - install any new requirements -```shell -source ./alyxvenv/bin/activate -pip install -r requirements.txt -``` - -3. Update the database if any scheme changes - we expect no migrations -```shell -cd alyx -./manage.py makemigrations -./manage.py migrate -``` - -4. If new fixtures load in the database: -```shell -../scripts/load-init-fixtures.sh -``` - -5. If new tables were added, change the postgres permissions -```shell -./manage.py set_db_permissions -./manage.py set_user_permissions -``` - -6. If there were updates to the Django version -```shell -./manage.py collectstatic --no-input -``` - -7. Restart the Apache server -```shell -sudo service apache2 restart -``` \ No newline at end of file +The database should be updated each time there is a new Alyx release. There is an update script in `alyx/scripts/auto-update.sh`, although you may need to change the source and cd command paths. 
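+
+As a rough sketch, a typical update then boils down to the following (the install path and virtualenv location below are assumptions — adjust them to your deployment):
+```shell
+cd /var/www/alyx-main            # root of the Alyx clone
+source ./alyxvenv/bin/activate   # activate the Alyx Python environment
+./scripts/auto-update.sh         # pull, migrate, collect static files, reload Apache
+```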
diff --git a/scripts/auto-update.sh b/scripts/auto-update.sh index d8c4e76f..5b301ba9 100644 --- a/scripts/auto-update.sh +++ b/scripts/auto-update.sh @@ -5,13 +5,17 @@ cd /var/www/alyx-main/alyx git stash git pull git stash pop -# 3/ update database if scheme changes +# 3/ install any new requirements +pip install -r requirements.txt +# 4/ update database if scheme changes ./manage.py makemigrations ./manage.py migrate -# 4/ If new fixtures load them in the database +# 5/ If new fixtures load them in the database ../scripts/load-init-fixtures.sh -# 5/ if new tables change the postgres permissions +# 6/ if new tables change the postgres permissions ./manage.py set_db_permissions ./manage.py set_user_permissions -# 6/ restart the apache server +# 7/ if there were updates to the Django version collect the static files +./manage.py collectstatic --no-input +# 8/ restart the apache server sudo service apache2 reload From a0f5ba56378b0fc4af8dfb8b10235d456105a650 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Fri, 15 Nov 2024 13:57:26 +0200 Subject: [PATCH 11/13] Argument for removing signed-off session tasks --- alyx/jobs/management/commands/tasks.py | 7 ++++++- alyx/jobs/tests.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/alyx/jobs/management/commands/tasks.py b/alyx/jobs/management/commands/tasks.py index 9cabd8e8..d91b1a2e 100644 --- a/alyx/jobs/management/commands/tasks.py +++ b/alyx/jobs/management/commands/tasks.py @@ -19,6 +19,8 @@ def add_arguments(self, parser): parser.add_argument('--dry', action='store_true', help='dry run') parser.add_argument('--limit', help='limit to a maximum number of tasks') parser.add_argument('--before', help='select tasks before a given date') + parser.add_argument('--signed-off', action='store_true', + help='select tasks associated with signed-off sessions') def handle(self, *args, **options): action = options.get('action') @@ -51,6 +53,10 @@ def handle(self, *args, **options): raise ValueError(f'Status "{status}" not recognized') tasks = fcn(status=status) + # Filter signed-off + if options.get('signed_off'): + tasks = tasks.filter(session__json__sign_off_checklist__sign_off_date__isnull=False) + # Limit if (limit := options.get('limit')) is not None: limit = int(limit) @@ -64,4 +70,3 @@ def handle(self, *args, **options): pks = tasks.values_list('pk', flat=True) Task.objects.filter(pk__in=pks).delete() self.stdout.write(self.style.SUCCESS('Tasks deleted')) - diff --git a/alyx/jobs/tests.py b/alyx/jobs/tests.py index 1ae169c5..a2145c0a 100644 --- a/alyx/jobs/tests.py +++ b/alyx/jobs/tests.py @@ -10,6 +10,8 @@ from data.models import DataRepository from jobs.management.commands import tasks from jobs.models import Task +from subjects.models import Subject +from misc.models import Lab class APISubjectsTests(BaseTests): @@ -54,6 +56,15 @@ def setUp(self) -> None: timezone_mock.return_value = date status = Task.STATUS_DATA_SOURCES[i % len(Task.STATUS_DATA_SOURCES)][0] Task.objects.create(name=f'task_{i}', status=status) + # Create a session for testing signed-off filter + lab = Lab.objects.create(name='lab') + subject = Subject.objects.create(name='586', lab=lab) + json_data = {'sign_off_checklist': {'sign_off_date': datetime.today().isoformat()}} + self.session = Session.objects.create( + subject=subject, number=1, json=json_data, type='Experiment') + t = Task.objects.first() + t.session = self.session + t.save() def test_cleanup(self): """Test for cleanup action.""" @@ -73,6 +84,11 @@ def test_cleanup(self): 
self.assertEqual(n, Task.objects.count()) self.assertEqual(0, Task.objects.filter(datetime__date__lte=before_date).count()) + # With signed-off filter + assert (n := self.session.tasks.count()) > 0 + self.command.handle(action='cleanup', signed_off=True) + self.assertEqual(0, self.session.tasks.count()) + # With status filter as int n = Task.objects.count() - Task.objects.filter(status=20).count() self.command.handle(action='cleanup', status='20') From 78d347e3ac8cca7d6176d57539292ad02c2e9af7 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Fri, 15 Nov 2024 15:24:53 +0200 Subject: [PATCH 12/13] Minor test improvements --- alyx/data/tests.py | 10 +++++----- alyx/jobs/tests.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/alyx/data/tests.py b/alyx/data/tests.py index 5f4cced0..27e58388 100644 --- a/alyx/data/tests.py +++ b/alyx/data/tests.py @@ -253,14 +253,14 @@ def setUp(self): for i, repo in enumerate((repo1, repo2, repo3, repo_main)): if repo.globus_is_personal is False: rel_path = f'{session}/{TestManagementFiles._dataset_uuid_name(d)}' - if i == 0: - rel_path = 'Data2/' + rel_path - if i == 1: - rel_path = '/' + rel_path elif repo.lab != d.session.lab: continue # Don't create file record for dataset if session lab different else: rel_path = f'{session}/{d.name}' + if i == 0: + rel_path = 'Data2/' + rel_path + if i == 1: + rel_path = '/' + rel_path self.records.append( FileRecord.objects.create( relative_path=rel_path, exists=True, dataset=d, data_repository=repo) @@ -280,7 +280,7 @@ def test_get_name_collection_revision(self): expected = { 'lab': '', 'subject': 'subject', 'date': '2020-01-01', 'number': '001', 'collection': '', 'revision': '', 'filename': 'ephysData.raw.ap.bin', - 'full_path': 'subject/2020-01-01/001/ephysData.raw.ap.bin', + 'full_path': 'Data2/subject/2020-01-01/001/ephysData.raw.ap.bin', 'rel_dir_path': 'subject/2020-01-01/001'} self.assertDictEqual(info, expected) relative_path = relative_path.parent / 'alf' / '#2020-10-01#' / relative_path.name diff --git a/alyx/jobs/tests.py b/alyx/jobs/tests.py index a2145c0a..e8900bd3 100644 --- a/alyx/jobs/tests.py +++ b/alyx/jobs/tests.py @@ -119,5 +119,5 @@ def test_cleanup(self): # Error handling self.assertRaises(ValueError, self.command.handle, action='cleanup', status='NotAStatus') - self.assertRaises(ValueError, self.command.handle, action='cleanup', status='-1000') + self.assertRaises(ValueError, self.command.handle, action='cleanup', status='1000') self.assertRaises(ValueError, self.command.handle, action='NotAnAction') From f3f1ee31ded253fea19792ee43b74245383e6a71 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Fri, 15 Nov 2024 17:33:06 +0200 Subject: [PATCH 13/13] Cache generation tests --- alyx/experiments/tests_rest.py | 2 +- alyx/jobs/tests.py | 2 +- alyx/misc/management/commands/one_cache.py | 2 +- alyx/misc/tests.py | 64 ++++++++++++++++++++++ 4 files changed, 67 insertions(+), 3 deletions(-) diff --git a/alyx/experiments/tests_rest.py b/alyx/experiments/tests_rest.py index eac771f4..f7e2dc9f 100644 --- a/alyx/experiments/tests_rest.py +++ b/alyx/experiments/tests_rest.py @@ -407,7 +407,7 @@ def setUp(self): self.client.login(username='test', password='test') # self.session = Session.objects.first() lab = Lab.objects.create(name='lab') - subject = Subject.objects.create(name='586', lab=lab) + subject = Subject.objects.create(nickname='586', lab=lab) self.session = Session.objects.create(subject=subject, number=1) # need to add imaging procedure 
self.session.procedures.add(ProcedureType.objects.get_or_create(name='Imaging')[0]) diff --git a/alyx/jobs/tests.py b/alyx/jobs/tests.py index e8900bd3..247c84cd 100644 --- a/alyx/jobs/tests.py +++ b/alyx/jobs/tests.py @@ -58,7 +58,7 @@ def setUp(self) -> None: Task.objects.create(name=f'task_{i}', status=status) # Create a session for testing signed-off filter lab = Lab.objects.create(name='lab') - subject = Subject.objects.create(name='586', lab=lab) + subject = Subject.objects.create(nickname='586', lab=lab) json_data = {'sign_off_checklist': {'sign_off_date': datetime.today().isoformat()}} self.session = Session.objects.create( subject=subject, number=1, json=json_data, type='Experiment') diff --git a/alyx/misc/management/commands/one_cache.py b/alyx/misc/management/commands/one_cache.py index 3e73aa9b..cce20f19 100644 --- a/alyx/misc/management/commands/one_cache.py +++ b/alyx/misc/management/commands/one_cache.py @@ -418,7 +418,7 @@ def generate_datasets_frame(tags=None, batch_size=100_000) -> pd.DataFrame: df = (pd.DataFrame .from_records(current_qs.values(*fields)) .rename(fields_map, axis=1) - .astype({'id': str, 'eid': str, 'file_size': np.uint64})) + .astype({'id': str, 'eid': str, 'file_size': 'UInt64'})) df['exists'] = True # relative_path diff --git a/alyx/misc/tests.py b/alyx/misc/tests.py index 33ba0b80..25470485 100644 --- a/alyx/misc/tests.py +++ b/alyx/misc/tests.py @@ -1,9 +1,17 @@ +import zipfile +from pathlib import Path from datetime import datetime, timedelta +import tempfile import unittest from django.test import TestCase +from one.alf.spec import QC +from one.alf.cache import DATASETS_COLUMNS, SESSIONS_COLUMNS +import pandas as pd from subjects.models import Subject from misc.models import Housing, HousingSubject, CageType, LabMember, Lab +from actions.models import Session +from data.models import Dataset, DatasetType, DataRepository, FileRecord, DataFormat SKIP_ONE_CACHE = False try: @@ -145,6 +153,62 @@ def test_move_subject(self): @unittest.skipIf(SKIP_ONE_CACHE, 'Missing dependencies') class ONECache(TestCase): """Tests for misc.management.commands.one_cache""" + fixtures = [ + 'data.datarepositorytype.json', 'data.datasettype.json', + 'data.dataformat.json', 'misc.lab.json' + ] + + def setUp(self): + self.command = one_cache.Command() + tmp = tempfile.TemporaryDirectory() + self.addCleanup(tmp.cleanup) + self.tmp = Path(tmp.name) + # Create some sessions and datasets + lab = Lab.objects.first() + subject = Subject.objects.create(nickname='586', lab=lab) + repo = DataRepository.objects.create( + name='flatiron', globus_path='foo', lab=lab, globus_is_personal=True) + for i in range(5): + session = Session.objects.create( + subject=subject, number=i + 1, type='Experiment', task_protocol='foo', qc=QC.PASS) + for d in ('foo.bar.npy', 'bar.baz.bin'): + dtype, _ = DatasetType.objects.get_or_create(name=Path(d).stem) + format = DataFormat.objects.get(name=Path(d).suffix[1:]) + dataset = Dataset.objects.create( + session=session, dataset_type=dtype, collection='alf', qc=QC.PASS, + name=d, data_format=format, file_size=(1024 * i) or None) + p = (f'{session.subject.nickname}/{session.start_time.date()}' + f'/{session.number:03d}/alf/{d}') + FileRecord.objects.create( + relative_path=p, dataset=dataset, data_repository=repo, exists=True) + + def test_generate_tables(self): + """Test ONE cache table generation.""" + # Check table name validation + self.assertRaises(ValueError, self.command.handle, verbosity=1, tables=('foo',)) + # Check table generation + 
self.command.handle( + destination=str(self.tmp), compress=False, verbosity=1, + tables=('sessions', 'datasets') + ) + self.assertCountEqual( + ['date_created', 'origin', 'min_api_version'], self.command.metadata) + tables = sorted(self.tmp.glob('*.pqt')) + self.assertEqual(len(tables), 2) + datasets, sessions = pd.read_parquet(tables[0]), pd.read_parquet(tables[1]) + self.assertCountEqual( + datasets.reset_index().columns, DATASETS_COLUMNS + ('default_revision',)) + self.assertTrue(all(datasets['rel_path'].str.startswith('alf/'))) + self.assertCountEqual(sessions.reset_index().columns, SESSIONS_COLUMNS) + # Test QC and compression + self.command.handle( + destination=str(self.tmp), compress=True, verbosity=1, tables=('sessions',), qc=True) + zip_file = self.tmp / 'cache.zip' + self.assertTrue(zip_file.exists()) + cache_info = self.tmp / 'cache_info.json' + self.assertTrue(cache_info.exists()) + zip = zipfile.ZipFile(zip_file) + self.assertCountEqual(['sessions.pqt', 'cache_info.json', 'QC.json'], zip.namelist()) def test_s3_filesystem(self): """Test the _s3_filesystem function"""
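For reference, the behaviour exercised by these tests can be driven directly through the command handlers. A minimal sketch mirroring the calls in the tests above; the destination path is hypothetical, and the keyword arguments follow the handler options exercised there:

```python
# Minimal sketch mirroring the tests above; run inside `./manage.py shell`
# so Django is configured. The destination path is a placeholder.
from jobs.management.commands import tasks
from misc.management.commands import one_cache

# Dry-run removal of tasks attached to signed-off sessions
tasks.Command().handle(action='cleanup', signed_off=True, dry=True)

# Write compressed ONE cache tables, including the QC records
one_cache.Command().handle(
    destination='/tmp/one_cache', compress=True, verbosity=1,
    tables=('sessions', 'datasets'), qc=True)
```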