Enforce data frame dtypes
k1o0 committed Nov 3, 2024
1 parent 6ee3dac commit c8da662
Showing 4 changed files with 67 additions and 19 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -7,10 +7,12 @@ This version drops support for python 3.9 and below, and ONE is now in remote mo
- supports python >= 3.10 only
- OneAlyx uses remote mode by default, instead of auto
- OneAlyx.search now updates the cache tables in remote mode as paginated sessions are accessed
- Datasets table file_size column nullable by default

### Added

- one.alf.cache.remove_cache_table_files and One.\_remove_cache_table_files for deleting cache table files
- one.alf.cache.EMPTY_DATASETS_FRAME and EMPTY_SESSIONS_FRAME vars for table column, index, and dtype template

## [2.11.0]
This version deprecates one.alf.files in preparation for replacing with one.alf.path in version 3.
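The new EMPTY_DATASETS_FRAME and EMPTY_SESSIONS_FRAME variables act as a single source of truth for the cache tables' columns, index names, and dtypes. Below is a minimal sketch of how such a template can be defined and applied, assuming a simplified column set; the real definitions live in one.alf.cache and may differ.

```python
import pandas as pd

# Hypothetical template with a reduced column set; the actual columns, index
# names and dtypes are defined in one.alf.cache.
QC_TYPE = pd.CategoricalDtype(
    categories=['NOT_SET', 'PASS', 'WARNING', 'FAIL'], ordered=True)

EMPTY_DATASETS_FRAME = (
    pd.DataFrame(columns=['eid', 'id', 'rel_path', 'file_size', 'exists', 'qc'])
    .astype({'eid': object, 'id': object, 'rel_path': object,
             'file_size': 'UInt64',  # nullable: missing sizes become <NA>
             'exists': bool, 'qc': QC_TYPE})
    .set_index(['eid', 'id'])
)

# Any freshly built records frame can then be coerced to the template's schema
records = pd.DataFrame([{'eid': 'e1', 'id': 'd1', 'rel_path': 'alf/obj.attr.npy',
                         'file_size': None, 'exists': True, 'qc': 'NOT_SET'}])
datasets = (records
            .set_index(EMPTY_DATASETS_FRAME.index.names)
            .astype(EMPTY_DATASETS_FRAME.dtypes))
assert datasets.dtypes.equals(EMPTY_DATASETS_FRAME.dtypes)
```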
21 changes: 12 additions & 9 deletions one/api.py
@@ -310,30 +310,33 @@ def _update_cache_from_records(self, strict=False, **kwargs):
raise KeyError(f'Table "{table}" not in cache')
if isinstance(records, pd.Series):
records = pd.DataFrame([records])
records.index.set_names(self._cache[table].index.names, inplace=True)
if not strict:
# Deal with case where there are extra columns in the cache
extra_columns = set(self._cache[table].columns) - set(records.columns)
extra_columns = list(set(self._cache[table].columns) - set(records.columns))
# Convert these columns to nullable, if required
cache_columns = self._cache[table][extra_columns]
self._cache[table][extra_columns] = cache_columns.convert_dtypes()
column_ids = map(list(self._cache[table].columns).index, extra_columns)
for col, n in sorted(zip(extra_columns, column_ids), key=lambda x: x[1]):
val = records.get('exists', True) if col.startswith('exists_') else np.nan
dtype = self._cache[table][col].dtype
nan = getattr(dtype, 'na_value', np.nan)
val = records.get('exists', True) if col.startswith('exists_') else nan
records.insert(n, col, val)
# Drop any extra columns in the records that aren't in cache table
to_drop = set(records.columns) - set(self._cache[table].columns)
records.drop(to_drop, axis=1, inplace=True)
records = records.reindex(columns=self._cache[table].columns)
assert set(self._cache[table].columns) == set(records.columns)
records = records.astype(self._cache[table].dtypes)
# Update existing rows
to_update = records.index.isin(self._cache[table].index)
self._cache[table].loc[records.index[to_update], :] = records[to_update]
# Assign new rows
to_assign = records[~to_update]
if isinstance(self._cache[table].index, pd.MultiIndex) and not to_assign.empty:
# Concatenate and sort (no other way for non-unique index within MultiIndex)
frames = filter(lambda x: not x.empty, [self._cache[table], to_assign])
self._cache[table] = pd.concat(frames).sort_index()
else:
for index, record in to_assign.iterrows():
self._cache[table].loc[index, :] = record[self._cache[table].columns].values
frames = [self._cache[table], to_assign]
# Concatenate and sort
self._cache[table] = pd.concat(frames).sort_index()
updated = datetime.now()
self._cache['_meta']['modified_time'] = updated
return updated
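In this hunk, columns present in the cache but missing from the incoming records are first converted to pandas' nullable dtypes, filled with the dtype's own na_value (pd.NA) rather than np.nan, and the records are then reindexed and cast to the cache's dtypes before merging. The following is a self-contained sketch of that fill-and-enforce pattern with made-up columns, not the library's actual cache tables.

```python
import numpy as np
import pandas as pd

# Toy "cache" with nullable dtypes and a toy incoming record frame
cache = pd.DataFrame({'exists': [True], 'exists_aws': [True], 'file_size': [123]},
                     index=['dset-0']).convert_dtypes()
records = pd.DataFrame({'exists': [False]}, index=['dset-1'])

# Columns the cache has but the incoming records lack
extra = list(set(cache.columns) - set(records.columns))
for col in extra:
    dtype = cache[col].dtype
    # Nullable extension dtypes expose `na_value` (pd.NA); plain NumPy dtypes
    # do not, so fall back to np.nan for them.
    na = getattr(dtype, 'na_value', np.nan)
    records[col] = records['exists'] if col.startswith('exists_') else na

# Align column order and coerce to the cache's dtypes before merging
records = records.reindex(columns=cache.columns).astype(cache.dtypes)
assert records.dtypes.equals(cache.dtypes)
```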
12 changes: 6 additions & 6 deletions one/converters.py
@@ -20,7 +20,7 @@
from iblutil.util import Bunch, Listable, ensure_list

from one.alf.spec import is_session_path, is_uuid_string
from one.alf.cache import QC_TYPE, EMPTY_DATASETS_FRAME
from one.alf.cache import EMPTY_DATASETS_FRAME
from one.alf.path import (
get_session_path, add_uuid_string, session_path_parts, get_alf_path, remove_uuid_string)

@@ -783,7 +783,8 @@ def _to_record(d):
return session, EMPTY_DATASETS_FRAME.copy()
records = map(_to_record, ses['data_dataset_session_related'])
index = ['eid', 'id']
datasets = pd.DataFrame(records).set_index(index).sort_index().astype({'qc': QC_TYPE})
dtypes = EMPTY_DATASETS_FRAME.dtypes
datasets = pd.DataFrame(records).astype(dtypes).set_index(index).sort_index()
return session, datasets


@@ -829,8 +830,7 @@ def datasets2records(datasets, additional=None) -> pd.DataFrame:
rec[field] = d.get(field)
records.append(rec)

index = ['eid', 'id']
if not records:
keys = (*index, 'file_size', 'hash', 'session_path', 'rel_path', 'default_revision', 'qc')
return pd.DataFrame(columns=keys).set_index(index)
return pd.DataFrame(records).set_index(index).sort_index().astype({'qc': QC_TYPE})
return EMPTY_DATASETS_FRAME
index = EMPTY_DATASETS_FRAME.index.names
return pd.DataFrame(records).set_index(index).sort_index().astype(EMPTY_DATASETS_FRAME.dtypes)
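With the hard-coded column list and the QC_TYPE import gone, both the empty and non-empty return paths now derive their schema from the same template, so callers always see identical columns, index names, and dtypes. A hedged sketch of the idea; to_datasets_frame and its signature are illustrative, not the library's API.

```python
import pandas as pd

def to_datasets_frame(records: list[dict], template: pd.DataFrame) -> pd.DataFrame:
    """Illustrative helper: convert REST-style record dicts to a typed frame."""
    if not records:
        # Empty input: the template already carries the columns, index and dtypes
        return template.copy()
    frame = pd.DataFrame(records).set_index(template.index.names).sort_index()
    # Enforce the template's dtypes so the schema matches the empty case
    return frame.astype(template.dtypes)
```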
51 changes: 47 additions & 4 deletions one/tests/test_one.py
@@ -55,6 +55,7 @@
from one.converters import datasets2records
from one.alf import spec
from one.alf.files import get_alf_path
from one.alf.cache import EMPTY_DATASETS_FRAME, EMPTY_SESSIONS_FRAME
from . import util
from . import OFFLINE_ONLY, TEST_DB_1, TEST_DB_2 # 1 = TestAlyx; 2 = OpenAlyx

@@ -910,8 +911,28 @@ def test_save_cache(self):
self.assertFalse(lock_file.exists(), 'failed to remove stale lock file')
time_mock.sleep.assert_called()

def test_reset_cache(self):
"""Test One._reset_cache method, namely that cache types are correct."""
# Assert cache dtypes are indeed what are expected
self.one._reset_cache()
self.assertCountEqual(['datasets', 'sessions', '_meta'], self.one._cache.keys())
self.assertTrue(self.one._cache.datasets.empty)
self.assertCountEqual(EMPTY_SESSIONS_FRAME.columns, self.one._cache.sessions.columns)
self.assertTrue(self.one._cache.sessions.empty)
self.assertCountEqual(EMPTY_DATASETS_FRAME.columns, self.one._cache.datasets.columns)
# Check sessions data frame types
sessions_types = EMPTY_SESSIONS_FRAME.reset_index().dtypes.to_dict()
s_types = self.one._cache.sessions.reset_index().dtypes.to_dict()
self.assertDictEqual(sessions_types, s_types)
# Check datasets data frame types
datasets_types = EMPTY_DATASETS_FRAME.reset_index().dtypes.to_dict()
d_types = self.one._cache.datasets.reset_index().dtypes.to_dict()
self.assertDictEqual(datasets_types, d_types)

def test_update_cache_from_records(self):
"""Test One._update_cache_from_records"""
sessions_types = self.one._cache.sessions.reset_index().dtypes.to_dict()
datasets_types = self.one._cache.datasets.reset_index().dtypes.to_dict()
# Update with single record (pandas.Series), one exists, one doesn't
session = self.one._cache.sessions.iloc[0].squeeze()
session.name = str(uuid4()) # New record
@@ -921,24 +942,33 @@ def test_update_cache_from_records(self):
self.assertTrue(session.name in self.one._cache.sessions.index)
updated, = dataset['exists'] == self.one._cache.datasets.loc[dataset.name, 'exists']
self.assertTrue(updated)
# Check that the updated data frame has kept its original dtypes
types = self.one._cache.sessions.reset_index().dtypes.to_dict()
self.assertDictEqual(sessions_types, types)
types = self.one._cache.datasets.reset_index().dtypes.to_dict()
self.assertDictEqual(datasets_types, types)

# Update a number of records
datasets = self.one._cache.datasets.iloc[:3].copy()
datasets.loc[:, 'exists'] = ~datasets.loc[:, 'exists']
# Make one of the datasets a new record
idx = datasets.index.values
idx[-1] = (idx[-1][0], str(uuid4()))
datasets.index = pd.MultiIndex.from_tuples(idx)
datasets.index = pd.MultiIndex.from_tuples(idx, names=('eid', 'id'))
self.one._update_cache_from_records(datasets=datasets)
self.assertTrue(idx[-1] in self.one._cache.datasets.index)
verifiable = self.one._cache.datasets.loc[datasets.index.values, 'exists']
self.assertTrue(np.all(verifiable == datasets.loc[:, 'exists']))
# Check that the updated data frame has kept its original dtypes
types = self.one._cache.datasets.reset_index().dtypes.to_dict()
self.assertDictEqual(datasets_types, types)

# Check behaviour when columns don't match
datasets.loc[:, 'exists'] = ~datasets.loc[:, 'exists']
datasets['extra_column'] = True
self.one._cache.datasets['foo_bar'] = 12 # this column is missing in our new records
self.one._cache.datasets['new_column'] = False
expected_datasets_types = self.one._cache.datasets.reset_index().dtypes.to_dict()
self.addCleanup(self.one._cache.datasets.drop, 'foo_bar', axis=1, inplace=True)
# An exception is exists_* as the Alyx cache contains exists_aws and exists_flatiron
# These should simply be filled with the values of exists as Alyx won't return datasets
@@ -955,18 +985,27 @@ def test_update_cache_from_records(self):
# If the extra column does not start with 'exists' it should be set to NaN
verifiable = self.one._cache.datasets.loc[datasets.index.values, 'foo_bar']
self.assertTrue(np.isnan(verifiable).all())
# Check that the missing columns were updated to nullable fields
expected_datasets_types.update(
foo_bar=pd.Int64Dtype(), exists_aws=pd.BooleanDtype(), new_column=pd.BooleanDtype())
types = self.one._cache.datasets.reset_index().dtypes.to_dict()
self.assertDictEqual(expected_datasets_types, types)

# Check fringe cases
with self.assertRaises(KeyError):
self.one._update_cache_from_records(unknown=datasets)
self.assertIsNone(self.one._update_cache_from_records(datasets=None))
# Absent cache table
self.one.load_cache(tables_dir='/foo')
sessions_types = self.one._cache.sessions.reset_index().dtypes.to_dict()
datasets_types = self.one._cache.datasets.reset_index().dtypes.to_dict()
self.one._update_cache_from_records(sessions=session, datasets=dataset)
self.assertTrue(all(self.one._cache.sessions == pd.DataFrame([session])))
self.assertEqual(1, len(self.one._cache.datasets))
self.assertEqual(self.one._cache.datasets.squeeze().name, dataset.name)
self.assertCountEqual(self.one._cache.datasets.squeeze().to_dict(), dataset.to_dict())
types = self.one._cache.datasets.reset_index().dtypes.to_dict()
self.assertDictEqual(datasets_types, types)

def test_save_loaded_ids(self):
"""Test One.save_loaded_ids and logic within One._check_filesystem"""
@@ -1334,7 +1373,7 @@ def test_list_aggregates(self):
# specifically /public in FI openalyx file rec
mock_ds = [
{'url': '3ef042c6-82a4-426c-9aa9-be3b48d86652',
'session': None, 'file_size': '', 'hash': '',
'session': None, 'file_size': None, 'hash': '',
'file_records': [{'data_url': 'https://ibl-brain-wide-map-public.s3.amazonaws.com/'
'aggregates/Subjects/cortexlab/KS051/'
'trials.table.3ef042c6-82a4-426c-9aa9-be3b48d86652.pqt',
Expand All @@ -1343,7 +1382,7 @@ def test_list_aggregates(self):
'default_dataset': True,
'qc': 'NOT_SET'},
{'url': '7bdb08d6-b166-43d8-8981-816cf616d291',
'session': None, 'file_size': '', 'hash': '',
'session': None, 'file_size': None, 'hash': '',
'file_records': [{'data_url': 'https://ibl.flatironinstitute.org/'
'public/aggregates/Subjects/mrsicflogellab/IBL_005/'
'trials.table.7bdb08d6-b166-43d8-8981-816cf616d291.pqt',
Expand All @@ -1353,7 +1392,11 @@ def test_list_aggregates(self):
'qc': 'NOT_SET'},
]
with mock.patch.object(self.one.alyx, 'rest', return_value=mock_ds):
self.assertEqual(len(self.one.list_aggregates('subjects')), 2)
dsets = self.one.list_aggregates('subjects')
self.assertEqual(len(dsets), 2)
# Should handle null file_size values correctly
self.assertIsInstance(dsets.file_size.dtype, pd.UInt64Dtype)
self.assertTrue(dsets.file_size.isna().all())

def test_load_aggregate(self):
"""Test OneAlyx.load_aggregate"""
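The updated mock records return file_size as None, which the test expects to surface as NA values in a nullable UInt64 column. A quick sketch of why the nullable dtype matters here:

```python
import pandas as pd

# With the nullable UInt64 extension dtype, a missing size stays in an integer column
sizes = pd.Series([12345, None], dtype='UInt64')
assert isinstance(sizes.dtype, pd.UInt64Dtype)
assert sizes.isna().tolist() == [False, True]

# Without it, None forces an upcast to float64, losing the integer type
assert pd.Series([12345, None]).dtype == 'float64'
```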
