Skip to content

Commit

Permalink
Merge pull request #320 from Renumics/feature/230-rebuild-old-style-h…
Browse files Browse the repository at this point in the history
…5-datasets

Feature/230 rebuild old style h5 datasets
  • Loading branch information
druzsan authored Oct 27, 2023
2 parents d6e9c10 + 867f3e1 commit d483207
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 4 deletions.
12 changes: 12 additions & 0 deletions renumics/spotlight/backend/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,15 @@ def __init__(self) -> None:
"Filebrowsing is not allowed.",
status.HTTP_403_FORBIDDEN,
)


class H5DatasetOutdated(Problem):
    """
    Problem reported when an H5 dataset still uses old-style references.

    The detail message points the user to `dataset.rebuild()` as the way to
    update such a dataset. The problem is reported with an HTTP 500 status.
    """

    def __init__(self) -> None:
        title = "H5 Dataset outdated"
        detail = (
            "Only new-style string H5 references supported. Update your "
            "dataset using `dataset.rebuild()`."
        )
        super().__init__(title, detail, status.HTTP_500_INTERNAL_SERVER_ERROR)
50 changes: 46 additions & 4 deletions renumics/spotlight/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@
from loguru import logger
from typing_extensions import TypeGuard

from renumics.spotlight import dtypes as spotlight_dtypes
from renumics.spotlight.__version__ import __version__
from .pandas import create_typed_series, infer_dtypes, is_string_mask, prepare_column
from renumics.spotlight.dtypes.conversion import prepare_path_or_url
from renumics.spotlight.typing import (
BoolType,
IndexType,
Expand All @@ -42,10 +43,8 @@
is_integer,
is_iterable,
)
from renumics.spotlight.dtypes.conversion import prepare_path_or_url
from renumics.spotlight import dtypes as spotlight_dtypes

from . import exceptions
from .pandas import create_typed_series, infer_dtypes, is_string_mask, prepare_column
from .typing import (
OutputType,
ExternalOutputType,
Expand Down Expand Up @@ -1908,6 +1907,49 @@ def rename_column(self, old_name: str, new_name: str) -> None:
self._column_names.add(new_name)
self._update_generation_id()

def rebuild(self) -> None:
    """
    Re-create all columns that are still stored in an old-style layout.

    A column is treated as old-style when either

    * it has an embedding dtype, but its H5 dtype is not a variable-length
      dtype with a float, signed or unsigned integer base, or
    * it has an array, 1D-sequence or file-based dtype, but its H5 dtype
      is not a string dtype.

    Every such column is copied into a temporary column (the original name
    with underscores appended until it is unused), the original column is
    deleted and the temporary column is renamed back.

    Be aware that this can take some time and memory. It is useful to call
    `prune` after `rebuild`.
    """
    self._assert_is_writable()

    # First pass: only collect names, so that the set of column names is
    # not modified while it is being iterated.
    outdated = []
    for name in self._column_names:
        h5_dataset: h5py.Dataset = self._h5_file[name]
        dtype = self._get_dtype(h5_dataset)

        if spotlight_dtypes.is_embedding_dtype(dtype):
            base_dtype = h5py.check_vlen_dtype(h5_dataset.dtype)
            is_numeric_vlen = (
                base_dtype is not None
                and isinstance(base_dtype, np.dtype)
                and base_dtype.kind in "fiu"
            )
            if not is_numeric_vlen:
                # Non-vlen embedding columns
                outdated.append(name)
            continue

        is_complex_dtype = (
            spotlight_dtypes.is_array_dtype(dtype)
            or spotlight_dtypes.is_sequence_1d_dtype(dtype)
            or spotlight_dtypes.is_filebased_dtype(dtype)
        )
        if is_complex_dtype and h5py.check_string_dtype(h5_dataset.dtype) is None:
            # Non-string complex dtype columns
            outdated.append(name)

    # Second pass: copy -> delete -> rename for each outdated column.
    for name in outdated:
        # Find a temporary column name that does not clash with existing ones.
        temp_name = name
        while temp_name in self._column_names:
            temp_name += "_"

        column_dtype = self.get_dtype(name)
        column_values = self[name]
        column_attributes = self.get_column_attributes(name)
        self.append_column(
            temp_name, column_dtype, column_values, **column_attributes
        )

        del self[name]
        self.rename_column(temp_name, name)
        logger.info(f"Column {name} rebuilt")

def prune(self) -> None:
"""
Rebuild the whole dataset with the same content.
Expand Down
3 changes: 3 additions & 0 deletions renumics/spotlight_plugins/core/hdf5_data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from renumics.spotlight.data_source import DataSource, datasource
from renumics.spotlight.backend.exceptions import (
H5DatasetOutdated,
NoTableFileFound,
CouldNotOpenTableFile,
)
Expand Down Expand Up @@ -49,6 +50,8 @@ def read_column(
raw_values = np.array([x.decode("utf-8") for x in raw_values])

if self._is_ref_column(column):
if not is_string_dtype:
raise H5DatasetOutdated()
assert is_string_dtype, "Only new-style string h5 references supported."
normalized_values = np.empty(len(raw_values), dtype=object)
normalized_values[:] = [
Expand Down

0 comments on commit d483207

Please sign in to comment.