Merge pull request #320 from Renumics/feature/230-rebuild-old-style-h5-datasets

Feature/230 rebuild old style h5 datasets
Renumics · Oct 27, 2023 · d483207 · d483207
2 parents d6e9c10 + 867f3e1
commit d483207
Show file tree

Hide file tree

Showing 3 changed files with 61 additions and 4 deletions.
diff --git a/renumics/spotlight/backend/exceptions.py b/renumics/spotlight/backend/exceptions.py
@@ -169,3 +169,15 @@ def __init__(self) -> None:
             "Filebrowsing is not allowed.",
             status.HTTP_403_FORBIDDEN,
         )
+
+
+class H5DatasetOutdated(Problem):
+    """
+    H5 Dataset is outdated.
+
+    Signals that the dataset does not use new-style string H5 references;
+    the message tells the user to update it with `dataset.rebuild()`.
+    """
+
+    def __init__(self) -> None:
+        # Positional arguments passed to `Problem`: a short summary, an
+        # explanatory message including the remedy, and the HTTP status code.
+        # NOTE(review): parameter meaning is inferred from the values passed
+        # here; `Problem.__init__` is defined elsewhere - confirm.
+        super().__init__(
+            "H5 Dataset outdated",
+            "Only new-style string H5 references supported. Update your "
+            "dataset using `dataset.rebuild()`.",
+            status.HTTP_500_INTERNAL_SERVER_ERROR,
+        )
diff --git a/renumics/spotlight/dataset/__init__.py b/renumics/spotlight/dataset/__init__.py
@@ -31,8 +31,9 @@
 from loguru import logger
 from typing_extensions import TypeGuard
 
+from renumics.spotlight import dtypes as spotlight_dtypes
 from renumics.spotlight.__version__ import __version__
-from .pandas import create_typed_series, infer_dtypes, is_string_mask, prepare_column
+from renumics.spotlight.dtypes.conversion import prepare_path_or_url
 from renumics.spotlight.typing import (
     BoolType,
     IndexType,
@@ -42,10 +43,8 @@
     is_integer,
     is_iterable,
 )
-from renumics.spotlight.dtypes.conversion import prepare_path_or_url
-from renumics.spotlight import dtypes as spotlight_dtypes
-
 from . import exceptions
+from .pandas import create_typed_series, infer_dtypes, is_string_mask, prepare_column
 from .typing import (
     OutputType,
     ExternalOutputType,
@@ -1908,6 +1907,49 @@ def rename_column(self, old_name: str, new_name: str) -> None:
         self._column_names.add(new_name)
         self._update_generation_id()
 
+    def rebuild(self) -> None:
+        """
+        Update old-style columns in the dataset.
+
+        A column is treated as old-style if it is either
+
+        * an embedding column whose H5 dtype is not a variable-length dtype
+          with a float, signed or unsigned integer base dtype, or
+        * an array, 1D-sequence or file-based column whose H5 dtype is not a
+          string dtype.
+
+        Each such column is appended again under a temporary name with the
+        same dtype, values and column attributes; the original column is then
+        deleted and the new column is renamed to the original name.
+
+        Be aware that this can take some time and memory. It is useful to
+        call `prune` after `rebuild`.
+        """
+        # Rebuilding modifies the dataset. `_assert_is_writable` is defined
+        # elsewhere; presumably it rejects datasets not opened for writing.
+        self._assert_is_writable()
+
+        # First pass: only collect the names of old-style columns. The
+        # dataset itself is modified in the second loop below.
+        old_columns = []
+        for name in self._column_names:
+            h5_dataset: h5py.Dataset = self._h5_file[name]
+            dtype = self._get_dtype(h5_dataset)
+            if spotlight_dtypes.is_embedding_dtype(dtype):
+                # `check_vlen_dtype` yields the base dtype of a
+                # variable-length H5 dtype, or `None` if it is not one.
+                vlen_dtype = h5py.check_vlen_dtype(h5_dataset.dtype)
+                if (
+                    vlen_dtype is None
+                    or not isinstance(vlen_dtype, np.dtype)
+                    or vlen_dtype.kind not in "fiu"
+                ):
+                    # Non-vlen embedding columns (or vlen columns whose base
+                    # dtype is not float/signed int/unsigned int).
+                    old_columns.append(name)
+            elif (
+                spotlight_dtypes.is_array_dtype(dtype)
+                or spotlight_dtypes.is_sequence_1d_dtype(dtype)
+                or spotlight_dtypes.is_filebased_dtype(dtype)
+            ) and h5py.check_string_dtype(h5_dataset.dtype) is None:
+                # Non-string complex dtype columns
+                old_columns.append(name)
+
+        # Second pass: recreate every collected column.
+        for name in old_columns:
+            # Find a temporary column name that is not in use yet by
+            # appending underscores to the original name.
+            new_name = name
+            while new_name in self._column_names:
+                new_name += "_"
+            # Copy dtype, values and attributes into the temporary column.
+            # NOTE(review): presumably `append_column` stores the values in
+            # the current (new-style) layout - confirm against its definition.
+            self.append_column(
+                new_name,
+                self.get_dtype(name),
+                self[name],
+                **self.get_column_attributes(name),
+            )
+            # Replace the old column with the rebuilt one under its old name.
+            del self[name]
+            self.rename_column(new_name, name)
+            logger.info(f"Column {name} rebuilt")
+
     def prune(self) -> None:
         """
         Rebuild the whole dataset with the same content.

diff --git a/renumics/spotlight_plugins/core/hdf5_data_source.py b/renumics/spotlight_plugins/core/hdf5_data_source.py
@@ -12,6 +12,7 @@
 
 from renumics.spotlight.data_source import DataSource, datasource
 from renumics.spotlight.backend.exceptions import (
+    H5DatasetOutdated,
     NoTableFileFound,
     CouldNotOpenTableFile,
 )
@@ -49,6 +50,8 @@ def read_column(
             raw_values = np.array([x.decode("utf-8") for x in raw_values])
 
         if self._is_ref_column(column):
+            if not is_string_dtype:
+                raise H5DatasetOutdated()
             assert is_string_dtype, "Only new-style string h5 references supported."
             normalized_values = np.empty(len(raw_values), dtype=object)
             normalized_values[:] = [