diff --git a/renumics/spotlight/backend/exceptions.py b/renumics/spotlight/backend/exceptions.py
index cb8ee678..6bbfa155 100644
--- a/renumics/spotlight/backend/exceptions.py
+++ b/renumics/spotlight/backend/exceptions.py
@@ -169,3 +169,15 @@ def __init__(self) -> None:
             "Filebrowsing is not allowed.",
             status.HTTP_403_FORBIDDEN,
         )
+
+
+class H5DatasetOutdated(Problem):
+    """Raised when an H5 dataset still uses old-style (non-string) references."""
+
+    def __init__(self) -> None:
+        super().__init__(
+            "H5 Dataset outdated",
+            "Only new-style string H5 references supported. Update your "
+            "dataset using `dataset.rebuild()`.",
+            status.HTTP_500_INTERNAL_SERVER_ERROR,
+        )
diff --git a/renumics/spotlight/dataset/__init__.py b/renumics/spotlight/dataset/__init__.py
index e586499d..195d5153 100644
--- a/renumics/spotlight/dataset/__init__.py
+++ b/renumics/spotlight/dataset/__init__.py
@@ -31,8 +31,9 @@
 from loguru import logger
 from typing_extensions import TypeGuard
 
+from renumics.spotlight import dtypes as spotlight_dtypes
 from renumics.spotlight.__version__ import __version__
-from .pandas import create_typed_series, infer_dtypes, is_string_mask, prepare_column
+from renumics.spotlight.dtypes.conversion import prepare_path_or_url
 from renumics.spotlight.typing import (
     BoolType,
     IndexType,
@@ -42,10 +43,8 @@
     is_integer,
     is_iterable,
 )
-from renumics.spotlight.dtypes.conversion import prepare_path_or_url
-from renumics.spotlight import dtypes as spotlight_dtypes
-
 from . import exceptions
+from .pandas import create_typed_series, infer_dtypes, is_string_mask, prepare_column
 from .typing import (
     OutputType,
     ExternalOutputType,
@@ -1908,6 +1907,57 @@ def rename_column(self, old_name: str, new_name: str) -> None:
         self._column_names.add(new_name)
         self._update_generation_id()
 
+    def rebuild(self) -> None:
+        """
+        Rewrite old-style columns of the dataset in the current H5 layout.
+
+        Rebuilt are embedding columns not stored as variable-length numeric
+        data, and array, 1D-sequence and file-based columns not stored as
+        strings. All other columns are left untouched.
+
+        Every affected column is read and written anew, so this can take
+        some time and memory. Calling `prune` after `rebuild` is
+        recommended.
+        """
+        self._assert_is_writable()
+
+        old_columns = []
+        for name in self._column_names:
+            h5_dataset: h5py.Dataset = self._h5_file[name]
+            dtype = self._get_dtype(h5_dataset)
+            if spotlight_dtypes.is_embedding_dtype(dtype):
+                vlen_dtype = h5py.check_vlen_dtype(h5_dataset.dtype)
+                if (
+                    vlen_dtype is None
+                    or not isinstance(vlen_dtype, np.dtype)
+                    or vlen_dtype.kind not in "fiu"
+                ):
+                    # Non-vlen embedding columns
+                    old_columns.append(name)
+            elif (
+                spotlight_dtypes.is_array_dtype(dtype)
+                or spotlight_dtypes.is_sequence_1d_dtype(dtype)
+                or spotlight_dtypes.is_filebased_dtype(dtype)
+            ) and h5py.check_string_dtype(h5_dataset.dtype) is None:
+                # Non-string complex dtype columns
+                old_columns.append(name)
+
+        for name in old_columns:
+            # Write the data under a free temporary name first, so that the
+            # old column is deleted only after the new one has been written.
+            new_name = name
+            while new_name in self._column_names:
+                new_name += "_"
+            self.append_column(
+                new_name,
+                self.get_dtype(name),
+                self[name],
+                **self.get_column_attributes(name),
+            )
+            del self[name]
+            self.rename_column(new_name, name)
+            logger.info(f"Column {name} rebuilt")
+
     def prune(self) -> None:
         """
         Rebuild the whole dataset with the same content.
diff --git a/renumics/spotlight_plugins/core/hdf5_data_source.py b/renumics/spotlight_plugins/core/hdf5_data_source.py
index 1e277155..05e149c5 100644
--- a/renumics/spotlight_plugins/core/hdf5_data_source.py
+++ b/renumics/spotlight_plugins/core/hdf5_data_source.py
@@ -12,6 +12,7 @@
 
 from renumics.spotlight.data_source import DataSource, datasource
 from renumics.spotlight.backend.exceptions import (
+    H5DatasetOutdated,
     NoTableFileFound,
     CouldNotOpenTableFile,
 )
@@ -49,6 +50,7 @@ def read_column(
             raw_values = np.array([x.decode("utf-8") for x in raw_values])
 
         if self._is_ref_column(column):
-            assert is_string_dtype, "Only new-style string h5 references supported."
+            if not is_string_dtype:
+                raise H5DatasetOutdated()
             normalized_values = np.empty(len(raw_values), dtype=object)
             normalized_values[:] = [