From 05630b4adba5f2a20c3b187206d1d7e881032d67 Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Fri, 22 Sep 2023 14:53:26 +0200 Subject: [PATCH 1/8] Enhance array and embedding dtypes, better guess HF dtypes from features --- .github/workflows/ci.yml | 2 +- renumics/spotlight/data_store.py | 48 ++++++----- renumics/spotlight/dtypes/__init__.py | 42 +++++++-- .../core/huggingface_datasource.py | 73 +++++++++------- scripts/create_hf_dataset.py | 85 +++++++++++++++++++ 5 files changed, 189 insertions(+), 61 deletions(-) create mode 100755 scripts/create_hf_dataset.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8c6ef304..691a85f0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -376,7 +376,7 @@ jobs: with: python-version: ${{ matrix.python-version }} install-dependencies: false - - name: Cache pip cache folder + - name: '♻️ Cache pip cache folder' uses: actions/cache@v3 with: path: ${{ steps.setup-poetry.outputs.pip-cache-dir }} diff --git a/renumics/spotlight/data_store.py b/renumics/spotlight/data_store.py index 7d70f9bc..90f6ed5c 100644 --- a/renumics/spotlight/data_store.py +++ b/renumics/spotlight/data_store.py @@ -14,9 +14,12 @@ from renumics.spotlight.data_source.data_source import ColumnMetadata from renumics.spotlight.io import audio from renumics.spotlight.dtypes import ( + ArrayDType, CategoryDType, DType, DTypeMap, + EmbeddingDType, + is_array_dtype, is_audio_dtype, is_category_dtype, is_file_dtype, @@ -29,7 +32,6 @@ video_dtype, mesh_dtype, embedding_dtype, - array_dtype, window_dtype, sequence_1d_dtype, ) @@ -163,7 +165,10 @@ def _update_dtypes(self) -> None: def _guess_dtype(self, col: str) -> DType: intermediate_dtype = self._data_source.intermediate_dtypes[col] - fallback_dtype = _intermediate_to_semantic_dtype(intermediate_dtype) + semantic_dtype = _intermediate_to_semantic_dtype(intermediate_dtype) + + if is_array_dtype(intermediate_dtype): + return semantic_dtype sample_values = self._data_source.get_column_values(col, slice(10)) sample_dtypes = [_guess_value_dtype(value) for value in sample_values] @@ -171,12 +176,26 @@ def _guess_dtype(self, col: str) -> DType: try: mode_dtype = statistics.mode(sample_dtypes) except statistics.StatisticsError: - return fallback_dtype + return semantic_dtype - return mode_dtype or fallback_dtype + return mode_dtype or semantic_dtype def _intermediate_to_semantic_dtype(intermediate_dtype: DType) -> DType: + if is_array_dtype(intermediate_dtype): + if intermediate_dtype.shape == (2,): + return window_dtype + if intermediate_dtype.ndim == 1 and intermediate_dtype.shape[0] is not None: + return EmbeddingDType(intermediate_dtype.shape[0]) + if intermediate_dtype.ndim == 1 and intermediate_dtype.shape[0] is None: + return sequence_1d_dtype + if intermediate_dtype.ndim == 2 and ( + intermediate_dtype.shape[0] == 2 or intermediate_dtype.shape[1] == 2 + ): + return sequence_1d_dtype + if intermediate_dtype.ndim == 3 and intermediate_dtype.shape[-1] in (1, 3, 4): + return image_dtype + return intermediate_dtype if is_file_dtype(intermediate_dtype): return str_dtype if is_mixed_dtype(intermediate_dtype): @@ -208,7 +227,7 @@ def _guess_value_dtype(value: Any) -> Optional[DType]: if isinstance(value, trimesh.Trimesh): return mesh_dtype if isinstance(value, np.ndarray): - return _infer_array_dtype(value) + return ArrayDType(value.shape) if isinstance(value, bytes) or (is_pathtype(value) and os.path.isfile(value)): kind = filetype.guess(value) @@ -227,22 +246,5 @@ def _guess_value_dtype(value: Any) -> 
Optional[DType]: except (TypeError, ValueError): pass else: - return _infer_array_dtype(value) + return ArrayDType(value.shape) return None - - -def _infer_array_dtype(value: np.ndarray) -> DType: - """ - Infer dtype of a numpy array - """ - if value.ndim == 3: - if value.shape[-1] in (1, 3, 4): - return image_dtype - elif value.ndim == 2: - if value.shape[0] == 2 or value.shape[1] == 2: - return sequence_1d_dtype - elif value.ndim == 1: - if len(value) == 2: - return window_dtype - return embedding_dtype - return array_dtype diff --git a/renumics/spotlight/dtypes/__init__.py b/renumics/spotlight/dtypes/__init__.py index 7348097c..0e24ea10 100644 --- a/renumics/spotlight/dtypes/__init__.py +++ b/renumics/spotlight/dtypes/__init__.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Any, Dict, Iterable, Optional, Union +from typing import Any, Dict, Iterable, Optional, Tuple, Union import numpy as np from typing_extensions import TypeGuard @@ -80,6 +80,38 @@ def inverted_categories(self) -> Optional[Dict[int, str]]: return self._inverted_categories +class ArrayDType(DType): + """ + Array dtype with optional shape. + """ + + shape: Optional[Tuple[Optional[int], ...]] + + def __init__(self, shape: Optional[Tuple[Optional[int], ...]] = None): + super().__init__("array") + self.shape = shape + + @property + def ndim(self) -> int: + if self.shape is None: + return 0 + return len(self.shape) + + +class EmbeddingDType(DType): + """ + Embedding dtype with optional length. + """ + + length: Optional[int] + + def __init__(self, length: Optional[int] = None): + super().__init__("Embedding") + if length is not None and length < 0: + raise ValueError(f"Length must be non-negative, but {length} received.") + self.length = length + + class Sequence1DDType(DType): """ 1D-sequence dtype with predefined axis labels. 
@@ -131,10 +163,10 @@ def register_dtype(dtype: DType, aliases: Optional[list] = None) -> None: window_dtype = DType("Window") """Window dtype""" register_dtype(window_dtype, [Window]) -embedding_dtype = DType("Embedding") +embedding_dtype = EmbeddingDType() """Embedding dtype""" register_dtype(embedding_dtype, [Embedding]) -array_dtype = DType("array") +array_dtype = ArrayDType() """numpy array dtype""" register_dtype(array_dtype, [np.ndarray]) image_dtype = DType("Image") @@ -195,7 +227,7 @@ def is_category_dtype(dtype: DType) -> TypeGuard[CategoryDType]: return dtype.name == "Category" -def is_array_dtype(dtype: DType) -> bool: +def is_array_dtype(dtype: DType) -> TypeGuard[ArrayDType]: return dtype.name == "array" @@ -203,7 +235,7 @@ def is_window_dtype(dtype: DType) -> bool: return dtype.name == "Window" -def is_embedding_dtype(dtype: DType) -> bool: +def is_embedding_dtype(dtype: DType) -> TypeGuard[EmbeddingDType]: return dtype.name == "Embedding" diff --git a/renumics/spotlight_plugins/core/huggingface_datasource.py b/renumics/spotlight_plugins/core/huggingface_datasource.py index 49a50f63..40ad817b 100644 --- a/renumics/spotlight_plugins/core/huggingface_datasource.py +++ b/renumics/spotlight_plugins/core/huggingface_datasource.py @@ -2,18 +2,28 @@ import datasets import numpy as np -from renumics.spotlight import dtypes from renumics.spotlight.data_source import DataSource from renumics.spotlight.data_source.decorator import datasource from renumics.spotlight.dtypes import ( + ArrayDType, + CategoryDType, DType, DTypeMap, + audio_dtype, + bool_dtype, + datetime_dtype, + float_dtype, + image_dtype, + int_dtype, + file_dtype, + bytes_dtype, is_array_dtype, is_embedding_dtype, is_file_dtype, is_float_dtype, is_int_dtype, + str_dtype, ) from renumics.spotlight.data_source.data_source import ColumnMetadata @@ -84,7 +94,7 @@ def get_uid(self) -> str: return self._dataset._fingerprint def get_name(self) -> str: - return self._dataset.builder_name + return f"🤗 Dataset {self._dataset.builder_name or ''}" def get_column_values( self, @@ -141,13 +151,9 @@ def get_column_metadata(self, _: str) -> ColumnMetadata: def _guess_semantic_dtype(feature: _FeatureType) -> Optional[DType]: if isinstance(feature, datasets.Audio): - return dtypes.audio_dtype + return audio_dtype if isinstance(feature, datasets.Image): - return dtypes.image_dtype - if isinstance(feature, datasets.Sequence): - if isinstance(feature.feature, datasets.Value): - if feature.length != -1: - return dtypes.embedding_dtype + return image_dtype return None @@ -155,55 +161,58 @@ def _get_intermediate_dtype(feature: _FeatureType) -> DType: if isinstance(feature, datasets.Value): hf_dtype = cast(datasets.Value, feature).dtype if hf_dtype == "bool": - return dtypes.bool_dtype + return bool_dtype elif hf_dtype.startswith("int"): - return dtypes.int_dtype + return int_dtype elif hf_dtype.startswith("uint"): - return dtypes.int_dtype + return int_dtype elif hf_dtype.startswith("float"): - return dtypes.float_dtype + return float_dtype elif hf_dtype.startswith("time32"): - return dtypes.datetime_dtype + return datetime_dtype elif hf_dtype.startswith("time64"): - return dtypes.datetime_dtype + return datetime_dtype elif hf_dtype.startswith("timestamp"): - return dtypes.datetime_dtype + return datetime_dtype elif hf_dtype.startswith("date32"): - return dtypes.datetime_dtype + return datetime_dtype elif hf_dtype.startswith("date64"): - return dtypes.datetime_dtype + return datetime_dtype elif hf_dtype.startswith("duration"): - return 
dtypes.float_dtype + return float_dtype elif hf_dtype.startswith("decimal"): - return dtypes.float_dtype + return float_dtype elif hf_dtype == "binary": - return dtypes.bytes_dtype + return bytes_dtype elif hf_dtype == "large_binary": - return dtypes.bytes_dtype + return bytes_dtype elif hf_dtype == "string": - return dtypes.str_dtype + return str_dtype elif hf_dtype == "large_string": - return dtypes.str_dtype + return str_dtype else: raise UnsupportedFeature(feature) elif isinstance(feature, datasets.ClassLabel): - return dtypes.CategoryDType(categories=cast(datasets.ClassLabel, feature).names) + return CategoryDType(categories=cast(datasets.ClassLabel, feature).names) elif isinstance(feature, datasets.Audio): - return dtypes.file_dtype + return file_dtype elif isinstance(feature, datasets.Image): - return dtypes.file_dtype + return file_dtype elif isinstance(feature, datasets.Sequence): inner_dtype = _get_intermediate_dtype(feature.feature) if is_int_dtype(inner_dtype) or is_float_dtype(inner_dtype): - return dtypes.array_dtype - else: - return dtypes.str_dtype + return ArrayDType((None if feature.length == -1 else feature.length,)) + if is_array_dtype(inner_dtype): + return ArrayDType( + (None if feature.length == -1 else feature.length, *inner_dtype.shape) + ) + return str_dtype elif isinstance(feature, dict): if len(feature) == 2 and "bytes" in feature and "path" in feature: - return dtypes.file_dtype + return file_dtype else: - return dtypes.str_dtype + return str_dtype elif isinstance(feature, datasets.Translation): - return dtypes.str_dtype + return str_dtype else: raise UnsupportedFeature(feature) diff --git a/scripts/create_hf_dataset.py b/scripts/create_hf_dataset.py new file mode 100755 index 00000000..fdcb77fa --- /dev/null +++ b/scripts/create_hf_dataset.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 + +""" +This script creates multimodal Hugging Face dataset to test Spotlight on. 
+""" + +import datasets +import numpy as np + +from renumics.spotlight import dtypes + + +def random_values( + dtype: dtypes.DType, num_rows: int, optional: bool = False +) -> np.ndarray: + if dtypes.is_bool_dtype(dtype): + values = np.random.randint(0, 2, num_rows, bool) + elif dtypes.is_int_dtype(dtype): + values = np.random.randint(0, 2, num_rows, bool) + elif dtypes.is_float_dtype(dtype): + values = np.random.normal(0, 100, num_rows) + # elif dtypes.is_str_dtype(dtype): + # str_lengths = np.random.randint(0, 100, num_rows) + # null_indices = np.random.randint(0, num_rows, num_rows // 10) + # str_lengths[null_indices] = 0 + # all_letters = np.array( + # list(string.ascii_letters + string.digits + string.punctuation) + # ) + else: + raise NotImplementedError + + if not optional: + return values + + null_indices = np.random.randint(0, num_rows, num_rows // 10) + if np.issubdtype(values.dtype, np.floating): + values[null_indices] = np.nan + else: + values = values.astype(object) + values[null_indices] = None + return values + + +def create_hf_dataset(num_rows: int) -> None: + ds = datasets.Dataset.from_dict( + { + # "bool": random_values(dtypes.bool_dtype, num_rows), + # "int": random_values(dtypes.int_dtype, num_rows), + # "float": random_values(dtypes.float_dtype, num_rows), + "embedding": [[1, 2, 3, 4], [1, 6, 3, 7], [-1, -2, -3, -4]], + "sequence_1d": [[1, 2, 3, 4], [1, 6, 3], [-1, -2, -3, -4, 10]], + "sequence_2d": [ + [[1, 2, 3, 4], [-1, 3, 1, 6]], + [[1, -3, 10], [1, 6, 3]], + [[-10, 0, 10], [-1, -2, -3]], + ], + "sequence_2d_t": [[[5, 3], [2, 5], [10, 8]], [], [[-1, 1], [1, 10]]], + }, + features=datasets.Features( + { + "embedding": datasets.Sequence( + feature=datasets.Value("float64"), length=4 + ), + "sequence_1d": datasets.Sequence(feature=datasets.Value("float64")), + "sequence_2d": datasets.Sequence( + feature=datasets.Sequence(feature=datasets.Value("float64")), + length=2, + ), + "sequence_2d_t": datasets.Sequence( + feature=datasets.Sequence( + feature=datasets.Value("float64"), length=2 + ), + ), + } + ), + # info=datasets.DatasetInfo(), + # split=datasets.NamedSplit, + ) + ds.save_to_disk("build/datasets/hf") + print(ds.features) + + +if __name__ == "__main__": + np.random.seed(42) + create_hf_dataset(100) From 498fc48e37914e8545742f71fa16ad6fd66fe7c7 Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Mon, 25 Sep 2023 11:50:47 +0200 Subject: [PATCH 2/8] Support HF sequences, arrays and lists, interpret them as sequences, arrays if possible --- .../core/huggingface_datasource.py | 49 +++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/renumics/spotlight_plugins/core/huggingface_datasource.py b/renumics/spotlight_plugins/core/huggingface_datasource.py index 40ad817b..0206160c 100644 --- a/renumics/spotlight_plugins/core/huggingface_datasource.py +++ b/renumics/spotlight_plugins/core/huggingface_datasource.py @@ -135,11 +135,29 @@ def get_column_values( if isinstance(feature, datasets.Sequence): if is_array_dtype(intermediate_dtype): - return raw_values.to_numpy() + values = [ + _convert_object_array(value) for value in raw_values.to_numpy() + ] + return_array = np.empty(len(values), dtype=object) + return_array[:] = values + return return_array if is_embedding_dtype(intermediate_dtype): return raw_values.to_numpy() return np.array([str(value) for value in raw_values]) + if isinstance( + feature, + (datasets.Array2D, datasets.Array3D, datasets.Array4D, datasets.Array5D), + ): + if is_array_dtype(intermediate_dtype): + values = [ + 
_convert_object_array(value) for value in raw_values.to_numpy() + ] + return_array = np.empty(len(values), dtype=object) + return_array[:] = values + return return_array + return np.array([str(value) for value in raw_values]) + if isinstance(feature, datasets.Translation): return np.array([str(value) for value in raw_values]) @@ -203,10 +221,29 @@ def _get_intermediate_dtype(feature: _FeatureType) -> DType: if is_int_dtype(inner_dtype) or is_float_dtype(inner_dtype): return ArrayDType((None if feature.length == -1 else feature.length,)) if is_array_dtype(inner_dtype): - return ArrayDType( - (None if feature.length == -1 else feature.length, *inner_dtype.shape) + shape = ( + None if feature.length == -1 else feature.length, + *inner_dtype.shape, ) + if shape.count(None) > 1: + return str_dtype + return ArrayDType(shape) return str_dtype + elif isinstance(feature, list): + inner_dtype = _get_intermediate_dtype(feature[0]) + if is_int_dtype(inner_dtype) or is_float_dtype(inner_dtype): + return ArrayDType((None,)) + if is_array_dtype(inner_dtype): + shape = (None, *inner_dtype.shape) + if shape.count(None) > 1: + return str_dtype + return ArrayDType(shape) + return str_dtype + elif isinstance( + feature, + (datasets.Array2D, datasets.Array3D, datasets.Array4D, datasets.Array5D), + ): + return ArrayDType(feature.shape) elif isinstance(feature, dict): if len(feature) == 2 and "bytes" in feature and "path" in feature: return file_dtype @@ -216,3 +253,9 @@ def _get_intermediate_dtype(feature: _FeatureType) -> DType: return str_dtype else: raise UnsupportedFeature(feature) + + +def _convert_object_array(value: np.ndarray) -> np.ndarray: + if value.dtype.type is np.object_: + return np.array([_convert_object_array(x) for x in value]) + return value From e90ba76fc7f336ff551ea98e9d9053c23ee23925 Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Mon, 25 Sep 2023 11:51:21 +0200 Subject: [PATCH 3/8] Test more HF dtypes --- scripts/create_hf_dataset.py | 82 ++++++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/scripts/create_hf_dataset.py b/scripts/create_hf_dataset.py index fdcb77fa..db3006fa 100755 --- a/scripts/create_hf_dataset.py +++ b/scripts/create_hf_dataset.py @@ -6,6 +6,7 @@ import datasets import numpy as np +from renumics import spotlight from renumics.spotlight import dtypes @@ -44,20 +45,74 @@ def random_values( def create_hf_dataset(num_rows: int) -> None: ds = datasets.Dataset.from_dict( { - # "bool": random_values(dtypes.bool_dtype, num_rows), + "bool": [True, False, False], + "int": [-1, 1, 100000], + "uint": [1, 1, 30000], + "float": [1.0, float("nan"), 1000], + "string": ["foo", "barbaz", ""], + "label": ["foo", "bar", "foo"], # "int": random_values(dtypes.int_dtype, num_rows), # "float": random_values(dtypes.float_dtype, num_rows), "embedding": [[1, 2, 3, 4], [1, 6, 3, 7], [-1, -2, -3, -4]], - "sequence_1d": [[1, 2, 3, 4], [1, 6, 3], [-1, -2, -3, -4, 10]], + # HF sequence as Spotlight sequence + "sequence_1d": [[1, 2, 3, 4], [1, 6, 3], [-1, -2, float("nan"), -4, 10]], "sequence_2d": [ [[1, 2, 3, 4], [-1, 3, 1, 6]], [[1, -3, 10], [1, 6, 3]], [[-10, 0, 10], [-1, -2, -3]], ], "sequence_2d_t": [[[5, 3], [2, 5], [10, 8]], [], [[-1, 1], [1, 10]]], + # HF sequence as Spotlight array + "sequence_2d_array": [ + [[1, 2, 3, 4], [-1, 3, 1, 6], [1, 2, 4, 4]], + [[1, -3, 10], [1, 6, 3], [1, float("nan"), 4]], + [[-10, 0, 10], [-1, -2, -3], [1, 2, 4]], + ], + "sequence_3d_array": [ + [[[1, 2, 3, 4], [-1, 3, 1, 6], [1, 2, 4, 4]]], + [[[1, -3, 10], [1, 6, 
3], [1, float("nan"), 4]]], + [[[-10, 0, 10], [-1, -2, -3], [1, 2, 4]]], + ], + # HF 2D array as Spotlight sequence + "array_2d_sequence": [ + [[1, 2, 3], [-1, 3, 1]], + [[1, -3, 10], [1, 6, 3]], + [[-10, 0, 10], [-1, -2, -3]], + ], + "array_2d_t_sequence": [ + [[5, 3], [2, 5], [10, 8]], + [[float("nan"), 1], [1, 1], [2, 2]], + [[-1, 1], [1, 10], [10, 1]], + ], + "array_2d_vlen_sequence": [ + [[5, 3], [2, 5], [10, 8]], + [], + [[-1, 1], [1, 10]], + ], + # HF 4D array as Spotlight array + "array_4d": [ + [[[[1.0, 1.0, -10.0]]], [[[-1.0, 1.0, -1.0]]], [[[2.0, 1.0, 1.0]]]], + [ + [[[2.0, -3.0, 0.0]]], + [[[3.0, 6.0, -2.0]]], + [[[4.0, float("nan"), 2.0]]], + [[[4.0, float("nan"), 2.0]]], + ], + [[[[3.0, 10.0, 10.0]]], [[[6.0, 3.0, -3.0]]], [[[4.0, 4.0, 4.0]]]], + ], + # HF list as Spotlight embedding + "list_sequence": [[1, 2, 3, 4], [1, 6, 3, 7], [-1, -2, -3, -4]], }, features=datasets.Features( { + "bool": datasets.Value("bool"), + "int": datasets.Value("int32"), + "uint": datasets.Value("uint16"), + "float": datasets.Value("float64"), + "string": datasets.Value("string"), + "label": datasets.ClassLabel( + num_classes=4, names=["foo", "bar", "baz", "barbaz"] + ), "embedding": datasets.Sequence( feature=datasets.Value("float64"), length=4 ), @@ -71,15 +126,36 @@ def create_hf_dataset(num_rows: int) -> None: feature=datasets.Value("float64"), length=2 ), ), + "sequence_2d_array": datasets.Sequence( + feature=datasets.Sequence(feature=datasets.Value("float64")), + length=3, + ), + "sequence_3d_array": datasets.Sequence( + feature=datasets.Sequence( + feature=datasets.Sequence(feature=datasets.Value("float64")), + length=3, + ), + length=1, + ), + "array_2d_sequence": datasets.Array2D(shape=(2, 3), dtype="float64"), + "array_2d_t_sequence": datasets.Array2D(shape=(3, 2), dtype="float64"), + "array_2d_vlen_sequence": datasets.Array2D( + shape=(None, 2), dtype="float64" + ), + "array_4d": datasets.Array4D(shape=(None, 1, 1, 3), dtype="float64"), + "list_sequence": [datasets.Value("float64")], } ), # info=datasets.DatasetInfo(), # split=datasets.NamedSplit, ) - ds.save_to_disk("build/datasets/hf") + ds.save_to_disk("./build/datasets/hf") print(ds.features) if __name__ == "__main__": np.random.seed(42) create_hf_dataset(100) + + ds = datasets.load_from_disk("./build/datasets/hf") + spotlight.show(ds) From d922975529d2266fbb2f26442d08e539ce24d002 Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Tue, 26 Sep 2023 15:16:17 +0200 Subject: [PATCH 4/8] Test HF dataset in integration tests --- .../core/huggingface_datasource.py | 17 +- scripts/create_hf_dataset.py | 161 ------------------ tests/integration/huggingface/__init__.py | 0 tests/integration/huggingface/conftest.py | 17 ++ tests/integration/huggingface/dataset.py | 146 ++++++++++++++++ tests/integration/huggingface/test_hf.py | 37 ++++ 6 files changed, 214 insertions(+), 164 deletions(-) delete mode 100755 scripts/create_hf_dataset.py create mode 100644 tests/integration/huggingface/__init__.py create mode 100644 tests/integration/huggingface/conftest.py create mode 100644 tests/integration/huggingface/dataset.py create mode 100644 tests/integration/huggingface/test_hf.py diff --git a/renumics/spotlight_plugins/core/huggingface_datasource.py b/renumics/spotlight_plugins/core/huggingface_datasource.py index 0206160c..14e921bf 100644 --- a/renumics/spotlight_plugins/core/huggingface_datasource.py +++ b/renumics/spotlight_plugins/core/huggingface_datasource.py @@ -161,6 +161,15 @@ def get_column_values( if isinstance(feature, 
datasets.Translation): return np.array([str(value) for value in raw_values]) + if isinstance(feature, datasets.Value): + hf_dtype = feature.dtype + if hf_dtype.startswith("duration"): + return raw_values.to_numpy().astype(int) + if hf_dtype.startswith("time32") or hf_dtype.startswith("time64"): + return raw_values.to_numpy().astype(str) + if hf_dtype.startswith("timestamp[ns"): + return raw_values.to_numpy().astype(int) + return raw_values.to_numpy() def get_column_metadata(self, _: str) -> ColumnMetadata: @@ -187,17 +196,19 @@ def _get_intermediate_dtype(feature: _FeatureType) -> DType: elif hf_dtype.startswith("float"): return float_dtype elif hf_dtype.startswith("time32"): - return datetime_dtype + return str_dtype elif hf_dtype.startswith("time64"): - return datetime_dtype + return str_dtype elif hf_dtype.startswith("timestamp"): + if hf_dtype.startswith("timestamp[ns"): + return int_dtype return datetime_dtype elif hf_dtype.startswith("date32"): return datetime_dtype elif hf_dtype.startswith("date64"): return datetime_dtype elif hf_dtype.startswith("duration"): - return float_dtype + return int_dtype elif hf_dtype.startswith("decimal"): return float_dtype elif hf_dtype == "binary": diff --git a/scripts/create_hf_dataset.py b/scripts/create_hf_dataset.py deleted file mode 100755 index db3006fa..00000000 --- a/scripts/create_hf_dataset.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env python3 - -""" -This script creates multimodal Hugging Face dataset to test Spotlight on. -""" - -import datasets -import numpy as np -from renumics import spotlight - -from renumics.spotlight import dtypes - - -def random_values( - dtype: dtypes.DType, num_rows: int, optional: bool = False -) -> np.ndarray: - if dtypes.is_bool_dtype(dtype): - values = np.random.randint(0, 2, num_rows, bool) - elif dtypes.is_int_dtype(dtype): - values = np.random.randint(0, 2, num_rows, bool) - elif dtypes.is_float_dtype(dtype): - values = np.random.normal(0, 100, num_rows) - # elif dtypes.is_str_dtype(dtype): - # str_lengths = np.random.randint(0, 100, num_rows) - # null_indices = np.random.randint(0, num_rows, num_rows // 10) - # str_lengths[null_indices] = 0 - # all_letters = np.array( - # list(string.ascii_letters + string.digits + string.punctuation) - # ) - else: - raise NotImplementedError - - if not optional: - return values - - null_indices = np.random.randint(0, num_rows, num_rows // 10) - if np.issubdtype(values.dtype, np.floating): - values[null_indices] = np.nan - else: - values = values.astype(object) - values[null_indices] = None - return values - - -def create_hf_dataset(num_rows: int) -> None: - ds = datasets.Dataset.from_dict( - { - "bool": [True, False, False], - "int": [-1, 1, 100000], - "uint": [1, 1, 30000], - "float": [1.0, float("nan"), 1000], - "string": ["foo", "barbaz", ""], - "label": ["foo", "bar", "foo"], - # "int": random_values(dtypes.int_dtype, num_rows), - # "float": random_values(dtypes.float_dtype, num_rows), - "embedding": [[1, 2, 3, 4], [1, 6, 3, 7], [-1, -2, -3, -4]], - # HF sequence as Spotlight sequence - "sequence_1d": [[1, 2, 3, 4], [1, 6, 3], [-1, -2, float("nan"), -4, 10]], - "sequence_2d": [ - [[1, 2, 3, 4], [-1, 3, 1, 6]], - [[1, -3, 10], [1, 6, 3]], - [[-10, 0, 10], [-1, -2, -3]], - ], - "sequence_2d_t": [[[5, 3], [2, 5], [10, 8]], [], [[-1, 1], [1, 10]]], - # HF sequence as Spotlight array - "sequence_2d_array": [ - [[1, 2, 3, 4], [-1, 3, 1, 6], [1, 2, 4, 4]], - [[1, -3, 10], [1, 6, 3], [1, float("nan"), 4]], - [[-10, 0, 10], [-1, -2, -3], [1, 2, 4]], - ], - 
"sequence_3d_array": [ - [[[1, 2, 3, 4], [-1, 3, 1, 6], [1, 2, 4, 4]]], - [[[1, -3, 10], [1, 6, 3], [1, float("nan"), 4]]], - [[[-10, 0, 10], [-1, -2, -3], [1, 2, 4]]], - ], - # HF 2D array as Spotlight sequence - "array_2d_sequence": [ - [[1, 2, 3], [-1, 3, 1]], - [[1, -3, 10], [1, 6, 3]], - [[-10, 0, 10], [-1, -2, -3]], - ], - "array_2d_t_sequence": [ - [[5, 3], [2, 5], [10, 8]], - [[float("nan"), 1], [1, 1], [2, 2]], - [[-1, 1], [1, 10], [10, 1]], - ], - "array_2d_vlen_sequence": [ - [[5, 3], [2, 5], [10, 8]], - [], - [[-1, 1], [1, 10]], - ], - # HF 4D array as Spotlight array - "array_4d": [ - [[[[1.0, 1.0, -10.0]]], [[[-1.0, 1.0, -1.0]]], [[[2.0, 1.0, 1.0]]]], - [ - [[[2.0, -3.0, 0.0]]], - [[[3.0, 6.0, -2.0]]], - [[[4.0, float("nan"), 2.0]]], - [[[4.0, float("nan"), 2.0]]], - ], - [[[[3.0, 10.0, 10.0]]], [[[6.0, 3.0, -3.0]]], [[[4.0, 4.0, 4.0]]]], - ], - # HF list as Spotlight embedding - "list_sequence": [[1, 2, 3, 4], [1, 6, 3, 7], [-1, -2, -3, -4]], - }, - features=datasets.Features( - { - "bool": datasets.Value("bool"), - "int": datasets.Value("int32"), - "uint": datasets.Value("uint16"), - "float": datasets.Value("float64"), - "string": datasets.Value("string"), - "label": datasets.ClassLabel( - num_classes=4, names=["foo", "bar", "baz", "barbaz"] - ), - "embedding": datasets.Sequence( - feature=datasets.Value("float64"), length=4 - ), - "sequence_1d": datasets.Sequence(feature=datasets.Value("float64")), - "sequence_2d": datasets.Sequence( - feature=datasets.Sequence(feature=datasets.Value("float64")), - length=2, - ), - "sequence_2d_t": datasets.Sequence( - feature=datasets.Sequence( - feature=datasets.Value("float64"), length=2 - ), - ), - "sequence_2d_array": datasets.Sequence( - feature=datasets.Sequence(feature=datasets.Value("float64")), - length=3, - ), - "sequence_3d_array": datasets.Sequence( - feature=datasets.Sequence( - feature=datasets.Sequence(feature=datasets.Value("float64")), - length=3, - ), - length=1, - ), - "array_2d_sequence": datasets.Array2D(shape=(2, 3), dtype="float64"), - "array_2d_t_sequence": datasets.Array2D(shape=(3, 2), dtype="float64"), - "array_2d_vlen_sequence": datasets.Array2D( - shape=(None, 2), dtype="float64" - ), - "array_4d": datasets.Array4D(shape=(None, 1, 1, 3), dtype="float64"), - "list_sequence": [datasets.Value("float64")], - } - ), - # info=datasets.DatasetInfo(), - # split=datasets.NamedSplit, - ) - ds.save_to_disk("./build/datasets/hf") - print(ds.features) - - -if __name__ == "__main__": - np.random.seed(42) - create_hf_dataset(100) - - ds = datasets.load_from_disk("./build/datasets/hf") - spotlight.show(ds) diff --git a/tests/integration/huggingface/__init__.py b/tests/integration/huggingface/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/huggingface/conftest.py b/tests/integration/huggingface/conftest.py new file mode 100644 index 00000000..5f6f089b --- /dev/null +++ b/tests/integration/huggingface/conftest.py @@ -0,0 +1,17 @@ +""" +Pytest Fixtures for Hugging Face tests +""" + +import datasets +import pytest + +from .dataset import create_hf_dataset + + +@pytest.fixture +def dataset() -> datasets.Dataset: + """ + H5 Dataset for tests + """ + + return create_hf_dataset() diff --git a/tests/integration/huggingface/dataset.py b/tests/integration/huggingface/dataset.py new file mode 100644 index 00000000..3b81abb6 --- /dev/null +++ b/tests/integration/huggingface/dataset.py @@ -0,0 +1,146 @@ +""" +Data for Hugging Face tests +""" + +import datetime + +import datasets + + +DATA = { + 
"bool": [True, False, False], + "int": [-1, 1, 100000], + "uint": [1, 1, 30000], + "float": [1.0, float("nan"), 1000], + "string": ["foo", "barbaz", ""], + "label": ["foo", "bar", "foo"], + "binary": [b"foo", b"bar", b""], + "duration": [-1, 2, 10], + "decimal": [1.0, 3.0, 1000], + "date": [datetime.date.min, datetime.date(2001, 2, 15), datetime.date.max], + "time": [ + datetime.time.min, + datetime.time(14, 24, 15, 2672), + datetime.time.max, + ], + "timestamp": [ + datetime.datetime(1970, 2, 15, 14, 24, 15, 2672), + datetime.datetime(2001, 2, 15, 14, 24, 15, 2672), + datetime.datetime(2170, 2, 15, 14, 24, 15, 2672), + ], + "timestamp_ns": [ + datetime.datetime(1970, 2, 15, 14, 24, 15, 2672), + datetime.datetime(2001, 2, 15, 14, 24, 15, 2672), + datetime.datetime(2170, 2, 15, 14, 24, 15, 2672), + ], + "embedding": [[1, 2, 3, 4], [1, 6, 3, 7], [-1, -2, -3, -4]], + "audio": [ + "data/audio/mono/gs-16b-1c-44100hz.mp3", + "data/audio/1.wav", + "data/audio/stereo/gs-16b-2c-44100hz.ogg", + ], + "image": [ + "data/images/nature-256p.ico", + "data/images/sea-360p.gif", + "data/images/nature-360p.jpg", + ], + # HF sequence as Spotlight sequence + "sequence_1d": [[1, 2, 3, 4], [1, 6, 3], [-1, -2, float("nan"), -4, 10]], + "sequence_2d": [ + [[1, 2, 3, 4], [-1, 3, 1, 6]], + [[1, -3, 10], [1, 6, 3]], + [[-10, 0, 10], [-1, -2, -3]], + ], + "sequence_2d_t": [[[5, 3], [2, 5], [10, 8]], [], [[-1, 1], [1, 10]]], + # HF sequence as Spotlight array + "sequence_2d_array": [ + [[1, 2, 3, 4], [-1, 3, 1, 6], [1, 2, 4, 4]], + [[1, -3, 10], [1, 6, 3], [1, float("nan"), 4]], + [[-10, 0, 10], [-1, -2, -3], [1, 2, 4]], + ], + "sequence_3d_array": [ + [[[1, 2, 3, 4], [-1, 3, 1, 6], [1, 2, 4, 4]]], + [[[1, -3, 10], [1, 6, 3], [1, float("nan"), 4]]], + [[[-10, 0, 10], [-1, -2, -3], [1, 2, 4]]], + ], + # HF 2D array as Spotlight sequence + "array_2d_sequence": [ + [[1, 2, 3], [-1, 3, 1]], + [[1, -3, 10], [1, 6, 3]], + [[-10, 0, 10], [-1, -2, -3]], + ], + "array_2d_t_sequence": [ + [[5, 3], [2, 5], [10, 8]], + [[float("nan"), 1], [1, 1], [2, 2]], + [[-1, 1], [1, 10], [10, 1]], + ], + "array_2d_vlen_sequence": [ + [[5, 3], [2, 5], [10, 8]], + [], + [[-1, 1], [1, 10]], + ], + # HF 4D array as Spotlight array + "array_4d": [ + [[[[1.0, 1.0, -10.0]]], [[[-1.0, 1.0, -1.0]]], [[[2.0, 1.0, 1.0]]]], + [ + [[[2.0, -3.0, 0.0]]], + [[[3.0, 6.0, -2.0]]], + [[[4.0, float("nan"), 2.0]]], + [[[4.0, float("nan"), 2.0]]], + ], + [[[[3.0, 10.0, 10.0]]], [[[6.0, 3.0, -3.0]]], [[[4.0, 4.0, 4.0]]]], + ], + # HF list as Spotlight embedding + "list_sequence": [[1, 2, 3], [1, 6, 3, 7, 8], [-1, -2, -3, -4]], +} + +FEATURES = { + "bool": datasets.Value("bool"), + "int": datasets.Value("int32"), + "uint": datasets.Value("uint16"), + "float": datasets.Value("float64"), + "string": datasets.Value("string"), + "label": datasets.ClassLabel(num_classes=4, names=["foo", "bar", "baz", "barbaz"]), + "binary": datasets.Value("binary"), + "duration": datasets.Value("duration[s]"), + "decimal": datasets.Value("decimal128(10, 2)"), + "date": datasets.Value("date32"), + "time": datasets.Value("time64[us]"), + "timestamp": datasets.Value("timestamp[us]"), + "timestamp_ns": datasets.Value("timestamp[ns]"), + "audio": datasets.Audio(), + "image": datasets.Image(), + "embedding": datasets.Sequence(feature=datasets.Value("float64"), length=4), + "sequence_1d": datasets.Sequence(feature=datasets.Value("float64")), + "sequence_2d": datasets.Sequence( + feature=datasets.Sequence(feature=datasets.Value("float64")), + length=2, + ), + "sequence_2d_t": datasets.Sequence( 
+ feature=datasets.Sequence(feature=datasets.Value("float64"), length=2), + ), + "sequence_2d_array": datasets.Sequence( + feature=datasets.Sequence(feature=datasets.Value("float64")), + length=3, + ), + "sequence_3d_array": datasets.Sequence( + feature=datasets.Sequence( + feature=datasets.Sequence(feature=datasets.Value("float64")), + length=3, + ), + length=1, + ), + "array_2d_sequence": datasets.Array2D(shape=(2, 3), dtype="float64"), + "array_2d_t_sequence": datasets.Array2D(shape=(3, 2), dtype="float64"), + "array_2d_vlen_sequence": datasets.Array2D(shape=(None, 2), dtype="float64"), + "array_4d": datasets.Array4D(shape=(None, 1, 1, 3), dtype="float64"), + "list_sequence": [datasets.Value("float64")], +} + + +def create_hf_dataset() -> datasets.Dataset: + ds = datasets.Dataset.from_dict( + DATA, + features=datasets.Features(FEATURES), + ) + return ds diff --git a/tests/integration/huggingface/test_hf.py b/tests/integration/huggingface/test_hf.py new file mode 100644 index 00000000..9380380f --- /dev/null +++ b/tests/integration/huggingface/test_hf.py @@ -0,0 +1,37 @@ +""" +Integration Test on API level for h5 data sources +""" +import pytest +import httpx + +import datasets + +from renumics import spotlight + +from .dataset import DATA + + +def test_get_table_returns_http_ok(dataset: datasets.Dataset) -> None: + """ + Ensure /api/table/ returns a valid response + """ + viewer = spotlight.show(dataset, no_browser=True, wait=False) + response = httpx.Client(base_url=viewer.url).get("/api/table/") + viewer.close() + assert response.status_code == 200 + + +@pytest.mark.parametrize("col", DATA.keys()) +def test_get_cell_returns_http_ok(dataset: str, col: str) -> None: + """ + Serve h5 dataset and get cell data for dtype + """ + viewer = spotlight.show(dataset, no_browser=True, wait=False) + gen_id = ( + httpx.Client(base_url=viewer.url).get("/api/table/").json()["generation_id"] + ) + response = httpx.Client(base_url=viewer.url).get( + f"/api/table/{col}/0?generation_id={gen_id}" + ) + viewer.close() + assert response.status_code == 200 From ba66e064d004eecc56d38dd525e44cb3b8c9058a Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Tue, 26 Sep 2023 15:31:03 +0200 Subject: [PATCH 5/8] Fix types --- renumics/spotlight/data_store.py | 2 ++ renumics/spotlight_plugins/core/huggingface_datasource.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/renumics/spotlight/data_store.py b/renumics/spotlight/data_store.py index 90f6ed5c..42867e5a 100644 --- a/renumics/spotlight/data_store.py +++ b/renumics/spotlight/data_store.py @@ -183,6 +183,8 @@ def _guess_dtype(self, col: str) -> DType: def _intermediate_to_semantic_dtype(intermediate_dtype: DType) -> DType: if is_array_dtype(intermediate_dtype): + if intermediate_dtype.shape is None: + return intermediate_dtype if intermediate_dtype.shape == (2,): return window_dtype if intermediate_dtype.ndim == 1 and intermediate_dtype.shape[0] is not None: diff --git a/renumics/spotlight_plugins/core/huggingface_datasource.py b/renumics/spotlight_plugins/core/huggingface_datasource.py index 14e921bf..afcfd20d 100644 --- a/renumics/spotlight_plugins/core/huggingface_datasource.py +++ b/renumics/spotlight_plugins/core/huggingface_datasource.py @@ -186,7 +186,7 @@ def _guess_semantic_dtype(feature: _FeatureType) -> Optional[DType]: def _get_intermediate_dtype(feature: _FeatureType) -> DType: if isinstance(feature, datasets.Value): - hf_dtype = cast(datasets.Value, feature).dtype + hf_dtype = datasets.Value.dtype if hf_dtype == "bool": return 
bool_dtype elif hf_dtype.startswith("int"): From 1fe3647d2ea2891253b5f291f7ae7fe52f4ec5e5 Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Tue, 26 Sep 2023 15:39:55 +0200 Subject: [PATCH 6/8] Fix types --- renumics/spotlight_plugins/core/huggingface_datasource.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/renumics/spotlight_plugins/core/huggingface_datasource.py b/renumics/spotlight_plugins/core/huggingface_datasource.py index afcfd20d..ca52841b 100644 --- a/renumics/spotlight_plugins/core/huggingface_datasource.py +++ b/renumics/spotlight_plugins/core/huggingface_datasource.py @@ -232,6 +232,8 @@ def _get_intermediate_dtype(feature: _FeatureType) -> DType: if is_int_dtype(inner_dtype) or is_float_dtype(inner_dtype): return ArrayDType((None if feature.length == -1 else feature.length,)) if is_array_dtype(inner_dtype): + if inner_dtype.shape is None: + return str_dtype shape = ( None if feature.length == -1 else feature.length, *inner_dtype.shape, @@ -245,6 +247,8 @@ def _get_intermediate_dtype(feature: _FeatureType) -> DType: if is_int_dtype(inner_dtype) or is_float_dtype(inner_dtype): return ArrayDType((None,)) if is_array_dtype(inner_dtype): + if inner_dtype.shape is None: + return str_dtype shape = (None, *inner_dtype.shape) if shape.count(None) > 1: return str_dtype From a3a675d195cd52487201d84db82efb9179d00a1f Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Tue, 26 Sep 2023 15:46:39 +0200 Subject: [PATCH 7/8] Infer unknown HF features as string --- renumics/spotlight_plugins/core/huggingface_datasource.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/renumics/spotlight_plugins/core/huggingface_datasource.py b/renumics/spotlight_plugins/core/huggingface_datasource.py index ca52841b..77e3aa22 100644 --- a/renumics/spotlight_plugins/core/huggingface_datasource.py +++ b/renumics/spotlight_plugins/core/huggingface_datasource.py @@ -266,8 +266,7 @@ def _get_intermediate_dtype(feature: _FeatureType) -> DType: return str_dtype elif isinstance(feature, datasets.Translation): return str_dtype - else: - raise UnsupportedFeature(feature) + return str_dtype def _convert_object_array(value: np.ndarray) -> np.ndarray: From 03e3e0f71b5f42e8e77691d227694e30e1ce7f67 Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Tue, 26 Sep 2023 15:52:59 +0200 Subject: [PATCH 8/8] Fallback to string dtype for unsupported Hugging Face features --- renumics/spotlight_plugins/core/huggingface_datasource.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/renumics/spotlight_plugins/core/huggingface_datasource.py b/renumics/spotlight_plugins/core/huggingface_datasource.py index 77e3aa22..870245cb 100644 --- a/renumics/spotlight_plugins/core/huggingface_datasource.py +++ b/renumics/spotlight_plugins/core/huggingface_datasource.py @@ -26,6 +26,7 @@ str_dtype, ) from renumics.spotlight.data_source.data_source import ColumnMetadata +from renumics.spotlight.logging import logger _FeatureType = Union[ @@ -186,7 +187,7 @@ def _guess_semantic_dtype(feature: _FeatureType) -> Optional[DType]: def _get_intermediate_dtype(feature: _FeatureType) -> DType: if isinstance(feature, datasets.Value): - hf_dtype = datasets.Value.dtype + hf_dtype = feature.dtype if hf_dtype == "bool": return bool_dtype elif hf_dtype.startswith("int"): @@ -220,7 +221,8 @@ def _get_intermediate_dtype(feature: _FeatureType) -> DType: elif hf_dtype == "large_string": return str_dtype else: - raise UnsupportedFeature(feature) + logger.warning(f"Unsupported Hugging Face value dtype: {hf_dtype}.") + 
return str_dtype elif isinstance(feature, datasets.ClassLabel): return CategoryDType(categories=cast(datasets.ClassLabel, feature).names) elif isinstance(feature, datasets.Audio): @@ -266,6 +268,7 @@ def _get_intermediate_dtype(feature: _FeatureType) -> DType: return str_dtype elif isinstance(feature, datasets.Translation): return str_dtype + logger.warning(f"Unsupported Hugging Face feature: {feature}.") return str_dtype
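
For reference, a minimal sketch in the spirit of the removed scripts/create_hf_dataset.py: it builds a tiny Hugging Face dataset covering the sequence and array features handled by this series and opens it in Spotlight. The column names and the dtype mappings noted in the comments are illustrative assumptions derived from the diffs above, not verified output of the patched code.

"""
Sketch (not part of the patch series): exercise the new shape-based dtype
guessing. With these patches applied, a fixed-length float Sequence is
expected to be guessed as Embedding, a variable-length one as a 1D sequence,
and a 2-row Array2D as a 1D sequence as well.
"""

import datasets

from renumics import spotlight

ds = datasets.Dataset.from_dict(
    {
        "embedding": [[1.0, 2.0, 3.0, 4.0], [1.0, 6.0, 3.0, 7.0]],
        "sequence_1d": [[1.0, 2.0, 3.0], [-1.0, -2.0, -3.0, -4.0]],
        "array_2d": [
            [[1.0, 2.0, 3.0], [-1.0, 3.0, 1.0]],
            [[1.0, -3.0, 10.0], [1.0, 6.0, 3.0]],
        ],
    },
    features=datasets.Features(
        {
            # length=4 -> intermediate ArrayDType((4,)) -> EmbeddingDType(4)
            "embedding": datasets.Sequence(datasets.Value("float64"), length=4),
            # variable length -> intermediate ArrayDType((None,)) -> Sequence1D
            "sequence_1d": datasets.Sequence(datasets.Value("float64")),
            # shape (2, 3) -> intermediate ArrayDType((2, 3)) -> Sequence1D
            "array_2d": datasets.Array2D(shape=(2, 3), dtype="float64"),
        }
    ),
)

spotlight.show(ds)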