diff --git a/renumics/spotlight/app_config.py b/renumics/spotlight/app_config.py index 0e62bf06..15bcf217 100644 --- a/renumics/spotlight/app_config.py +++ b/renumics/spotlight/app_config.py @@ -4,9 +4,8 @@ from dataclasses import dataclass from pathlib import Path -from typing import List, Optional, Union +from typing import List, Optional, Union, Any -import pandas as pd from renumics.spotlight.layout.nodes import Layout from renumics.spotlight.analysis.typing import DataIssue @@ -20,7 +19,7 @@ class AppConfig: """ # dataset - dataset: Optional[Union[Path, pd.DataFrame]] = None + dataset: Any = None dtypes: Optional[DTypeMap] = None project_root: Optional[Path] = None diff --git a/renumics/spotlight/data_source/data_source.py b/renumics/spotlight/data_source/data_source.py index 50536a74..a4971312 100644 --- a/renumics/spotlight/data_source/data_source.py +++ b/renumics/spotlight/data_source/data_source.py @@ -89,10 +89,11 @@ def check_generation_id(self, generation_id: int) -> None: if self.get_generation_id() != generation_id: raise GenerationIDMismatch() + @property @abstractmethod - def guess_dtypes(self) -> DTypeMap: + def semantic_dtypes(self) -> DTypeMap: """ - Guess data source's dtypes. + Semantic dtypes for viewer. """ @abstractmethod diff --git a/renumics/spotlight/data_store.py b/renumics/spotlight/data_store.py index 26a8d3e0..7d70f9bc 100644 --- a/renumics/spotlight/data_store.py +++ b/renumics/spotlight/data_store.py @@ -1,7 +1,12 @@ import hashlib import io -from typing import List, Optional, Set, Union, cast +import os +import statistics +from typing import Any, List, Optional, Set, Union, cast import numpy as np +import filetype +import trimesh +import PIL.Image from renumics.spotlight.cache import external_data_cache from renumics.spotlight.data_source import DataSource @@ -10,13 +15,33 @@ from renumics.spotlight.io import audio from renumics.spotlight.dtypes import ( CategoryDType, + DType, DTypeMap, is_audio_dtype, is_category_dtype, + is_file_dtype, is_str_dtype, + is_mixed_dtype, + is_bytes_dtype, str_dtype, + audio_dtype, + image_dtype, + video_dtype, + mesh_dtype, + embedding_dtype, + array_dtype, + window_dtype, + sequence_1d_dtype, ) +from renumics.spotlight.typing import is_iterable, is_pathtype +from renumics.spotlight.media.mesh import Mesh +from renumics.spotlight.media.video import Video +from renumics.spotlight.media.audio import Audio +from renumics.spotlight.media.image import Image +from renumics.spotlight.media.sequence_1d import Sequence1D +from renumics.spotlight.media.embedding import Embedding + class DataStore: _data_source: DataSource @@ -102,7 +127,14 @@ def get_waveform(self, column_name: str, index: int) -> Optional[np.ndarray]: return waveform def _update_dtypes(self) -> None: - guessed_dtypes = self._data_source.guess_dtypes() + guessed_dtypes = self._data_source.semantic_dtypes.copy() + + # guess missing dtypes from intermediate dtypes + for col, dtype in self._data_source.intermediate_dtypes.items(): + if col not in guessed_dtypes: + guessed_dtypes[col] = self._guess_dtype(col) + + # merge guessed semantic dtypes with user dtypes dtypes = { **guessed_dtypes, **{ @@ -111,6 +143,8 @@ def _update_dtypes(self) -> None: if column_name in guessed_dtypes }, } + + # determine categories for _automatic_ CategoryDtypes for column_name, dtype in dtypes.items(): if ( is_category_dtype(dtype) @@ -124,4 +158,91 @@ def _update_dtypes(self) -> None: ] category_names = sorted(cast(Set[str], set(converted_values))) dtypes[column_name] = CategoryDType(category_names) + self._dtypes = dtypes + + def _guess_dtype(self, col: str) -> DType: + intermediate_dtype = self._data_source.intermediate_dtypes[col] + fallback_dtype = _intermediate_to_semantic_dtype(intermediate_dtype) + + sample_values = self._data_source.get_column_values(col, slice(10)) + sample_dtypes = [_guess_value_dtype(value) for value in sample_values] + + try: + mode_dtype = statistics.mode(sample_dtypes) + except statistics.StatisticsError: + return fallback_dtype + + return mode_dtype or fallback_dtype + + +def _intermediate_to_semantic_dtype(intermediate_dtype: DType) -> DType: + if is_file_dtype(intermediate_dtype): + return str_dtype + if is_mixed_dtype(intermediate_dtype): + return str_dtype + if is_bytes_dtype(intermediate_dtype): + return str_dtype + else: + return intermediate_dtype + + +def _guess_value_dtype(value: Any) -> Optional[DType]: + """ + Infer dtype for value + """ + if isinstance(value, Embedding): + return embedding_dtype + if isinstance(value, Sequence1D): + return sequence_1d_dtype + if isinstance(value, Image): + return image_dtype + if isinstance(value, Audio): + return audio_dtype + if isinstance(value, Video): + return video_dtype + if isinstance(value, Mesh): + return mesh_dtype + if isinstance(value, PIL.Image.Image): + return image_dtype + if isinstance(value, trimesh.Trimesh): + return mesh_dtype + if isinstance(value, np.ndarray): + return _infer_array_dtype(value) + + if isinstance(value, bytes) or (is_pathtype(value) and os.path.isfile(value)): + kind = filetype.guess(value) + if kind is not None: + mime_group = kind.mime.split("/")[0] + if mime_group == "image": + return image_dtype + if mime_group == "audio": + return audio_dtype + if mime_group == "video": + return video_dtype + return str_dtype + if is_iterable(value): + try: + value = np.asarray(value, dtype=float) + except (TypeError, ValueError): + pass + else: + return _infer_array_dtype(value) + return None + + +def _infer_array_dtype(value: np.ndarray) -> DType: + """ + Infer dtype of a numpy array + """ + if value.ndim == 3: + if value.shape[-1] in (1, 3, 4): + return image_dtype + elif value.ndim == 2: + if value.shape[0] == 2 or value.shape[1] == 2: + return sequence_1d_dtype + elif value.ndim == 1: + if len(value) == 2: + return window_dtype + return embedding_dtype + return array_dtype diff --git a/renumics/spotlight/dataset/__init__.py b/renumics/spotlight/dataset/__init__.py index 612bba74..99828ddf 100644 --- a/renumics/spotlight/dataset/__init__.py +++ b/renumics/spotlight/dataset/__init__.py @@ -245,7 +245,7 @@ def _user_column_attributes(dtype: spotlight_dtypes.DType) -> Dict[str, Type]: if spotlight_dtypes.is_sequence_1d_dtype(dtype): attribute_names["x_label"] = str attribute_names["y_label"] = str - if spotlight_dtypes.is_file_dtype(dtype): + if spotlight_dtypes.is_filebased_dtype(dtype): attribute_names["lookup"] = dict attribute_names["external"] = bool if spotlight_dtypes.is_audio_dtype(dtype): @@ -758,7 +758,7 @@ def from_pandas( column = prepare_column(column, dtype) - if workdir is not None and spotlight_dtypes.is_file_dtype(dtype): + if workdir is not None and spotlight_dtypes.is_filebased_dtype(dtype): # For file-based data types, relative paths should be resolved. str_mask = is_string_mask(column) column[str_mask] = column[str_mask].apply( @@ -777,7 +777,7 @@ def from_pandas( else: values = column.to_numpy() - if spotlight_dtypes.is_file_dtype(dtype): + if spotlight_dtypes.is_filebased_dtype(dtype): attrs["external"] = False # type: ignore attrs["lookup"] = False # type: ignore @@ -2435,7 +2435,7 @@ def _append_column( elif spotlight_dtypes.is_sequence_1d_dtype(dtype): attrs["x_label"] = dtype.x_label attrs["y_label"] = dtype.y_label - elif spotlight_dtypes.is_file_dtype(dtype): + elif spotlight_dtypes.is_filebased_dtype(dtype): lookup = attrs.get("lookup", None) if is_iterable(lookup) and not isinstance(lookup, dict): # Assume that we can keep all the lookup values in memory. @@ -3002,7 +3002,7 @@ def _encode_value( if self._is_ref_column(column): value = cast(RefColumnInputType, value) self._assert_valid_value_type(value, dtype, column_name) - if spotlight_dtypes.is_file_dtype(dtype) and isinstance(value, str): + if spotlight_dtypes.is_filebased_dtype(dtype) and isinstance(value, str): try: return self._find_lookup_ref(value, column) except KeyError: diff --git a/renumics/spotlight/dtypes/__init__.py b/renumics/spotlight/dtypes/__init__.py index d7acf3ac..7348097c 100644 --- a/renumics/spotlight/dtypes/__init__.py +++ b/renumics/spotlight/dtypes/__init__.py @@ -97,12 +97,14 @@ def __init__(self, x_label: str = "x", y_label: str = "y"): ALIASES: Dict[Any, DType] = {} -def register_dtype(dtype: DType, aliases: list) -> None: - for alias in aliases: - assert dtype.name.lower() not in ALIASES - ALIASES[dtype.name.lower()] = dtype - assert alias not in ALIASES - ALIASES[alias] = dtype +def register_dtype(dtype: DType, aliases: Optional[list] = None) -> None: + assert dtype.name.lower() not in ALIASES + ALIASES[dtype.name.lower()] = dtype + + if aliases is not None: + for alias in aliases: + assert alias not in ALIASES + ALIASES[alias] = dtype bool_dtype = DType("bool") @@ -150,9 +152,13 @@ def register_dtype(dtype: DType, aliases: list) -> None: video_dtype = DType("Video") """Video dtype""" register_dtype(video_dtype, [Video]) + mixed_dtype = DType("mixed") """Unknown or mixed dtype""" +file_dtype = DType("file") +"""File Dtype (bytes or str(path))""" + DTypeMap = Dict[str, DType] @@ -221,9 +227,21 @@ def is_video_dtype(dtype: DType) -> bool: return dtype.name == "Video" +def is_bytes_dtype(dtype: DType) -> bool: + return dtype.name == "bytes" + + +def is_mixed_dtype(dtype: DType) -> bool: + return dtype.name == "mixed" + + def is_scalar_dtype(dtype: DType) -> bool: return dtype.name in ("bool", "int", "float") def is_file_dtype(dtype: DType) -> bool: - return dtype.name in ("Audio", "Image", "Video", "Mesh") + return dtype.name == "file" + + +def is_filebased_dtype(dtype: DType) -> bool: + return dtype.name in ("Audio", "Image", "Video", "Mesh", "file") diff --git a/renumics/spotlight/io/pandas.py b/renumics/spotlight/io/pandas.py index 3cba1736..4cf84f9e 100644 --- a/renumics/spotlight/io/pandas.py +++ b/renumics/spotlight/io/pandas.py @@ -320,7 +320,7 @@ def prepare_column(column: pd.Series, dtype: dtypes.DType) -> pd.Series: str_mask = is_string_mask(column) column[str_mask] = column[str_mask].apply(try_literal_eval) - if dtypes.is_file_dtype(dtype): + if dtypes.is_filebased_dtype(dtype): dict_mask = column.map(type) == dict column[dict_mask] = column[dict_mask].apply(prepare_hugging_face_dict) diff --git a/renumics/spotlight/viewer.py b/renumics/spotlight/viewer.py index f50d7c52..b7630b17 100644 --- a/renumics/spotlight/viewer.py +++ b/renumics/spotlight/viewer.py @@ -139,10 +139,8 @@ def show( project_root = dataset else: project_root = dataset.parent - elif isinstance(dataset, pd.DataFrame) or dataset is None: - project_root = None else: - raise TypeError("Dataset has invalid type") + project_root = None if folder: project_root = Path(folder) diff --git a/renumics/spotlight_plugins/core/__init__.py b/renumics/spotlight_plugins/core/__init__.py index 10953077..f741a4dd 100644 --- a/renumics/spotlight_plugins/core/__init__.py +++ b/renumics/spotlight_plugins/core/__init__.py @@ -21,7 +21,11 @@ def __register__() -> None: """ register data sources """ - from . import pandas_data_source, hdf5_data_source # noqa: F401 + from . import ( + pandas_data_source, # noqa: F401 + hdf5_data_source, # noqa: F401 + huggingface_datasource, # noqa: F401 + ) def __activate__(app: SpotlightApp) -> None: diff --git a/renumics/spotlight_plugins/core/hdf5_data_source.py b/renumics/spotlight_plugins/core/hdf5_data_source.py index 677432e7..2acc2678 100644 --- a/renumics/spotlight_plugins/core/hdf5_data_source.py +++ b/renumics/spotlight_plugins/core/hdf5_data_source.py @@ -107,12 +107,13 @@ def column_names(self) -> List[str]: @property def intermediate_dtypes(self) -> DTypeMap: - return self.guess_dtypes() + return self.semantic_dtypes def __len__(self) -> int: return len(self._table) - def guess_dtypes(self) -> DTypeMap: + @property + def semantic_dtypes(self) -> DTypeMap: return { column_name: create_dtype(self._table.get_dtype(column_name)) for column_name in self.column_names diff --git a/renumics/spotlight_plugins/core/huggingface_datasource.py b/renumics/spotlight_plugins/core/huggingface_datasource.py new file mode 100644 index 00000000..49a50f63 --- /dev/null +++ b/renumics/spotlight_plugins/core/huggingface_datasource.py @@ -0,0 +1,209 @@ +from typing import List, Optional, Union, cast + +import datasets +import numpy as np +from renumics.spotlight import dtypes + +from renumics.spotlight.data_source import DataSource +from renumics.spotlight.data_source.decorator import datasource +from renumics.spotlight.dtypes import ( + DType, + DTypeMap, + is_array_dtype, + is_embedding_dtype, + is_file_dtype, + is_float_dtype, + is_int_dtype, +) +from renumics.spotlight.data_source.data_source import ColumnMetadata + + +_FeatureType = Union[ + datasets.Value, + datasets.ClassLabel, + datasets.Sequence, + datasets.Array2D, + datasets.Array3D, + datasets.Array4D, + datasets.Array5D, + datasets.Audio, + datasets.Image, + datasets.Translation, + dict, + list, +] + + +class UnsupportedFeature(Exception): + """ + We encountered an unsupported datasets Feature + """ + + def __init__(self, feature: _FeatureType) -> None: + super().__init__(f"Unsupported HuggingFace Feature: {type(feature)}") + + +@datasource(datasets.Dataset) +class HuggingfaceDataSource(DataSource): + _dataset: datasets.Dataset + _intermediate_dtypes: DTypeMap + _guessed_dtypes: DTypeMap + + def __init__(self, source: datasets.Dataset): + super().__init__(source) + self._dataset = source + self._intermediate_dtypes = { + col: _get_intermediate_dtype(feat) + for col, feat in self._dataset.features.items() + } + self._guessed_dtypes = {} + for col, feat in self._dataset.features.items(): + guessed_dtype = _guess_semantic_dtype(feat) + if guessed_dtype: + self._guessed_dtypes[col] = guessed_dtype + + @property + def column_names(self) -> List[str]: + return self._dataset.column_names + + @property + def intermediate_dtypes(self) -> DTypeMap: + return self._intermediate_dtypes + + def __len__(self) -> int: + return len(self._dataset) + + @property + def semantic_dtypes(self) -> DTypeMap: + return self._guessed_dtypes + + def get_generation_id(self) -> int: + return 0 + + def get_uid(self) -> str: + return self._dataset._fingerprint + + def get_name(self) -> str: + return self._dataset.builder_name + + def get_column_values( + self, + column_name: str, + indices: Union[List[int], np.ndarray, slice] = slice(None), + ) -> np.ndarray: + intermediate_dtype = self._intermediate_dtypes[column_name] + + if isinstance(indices, slice): + if indices == slice(None): + raw_values = self._dataset.data[column_name] + else: + actual_indices = list(range(len(self._dataset)))[indices] + raw_values = self._dataset.data[column_name].take(actual_indices) + else: + raw_values = self._dataset.data[column_name].take(indices) + + feature = self._dataset.features[column_name] + + if isinstance(feature, datasets.Audio) or isinstance(feature, datasets.Image): + return np.array( + [ + value["path"].as_py() + if value["bytes"].as_py() is None + else value["bytes"].as_py() + for value in raw_values + ], + dtype=object, + ) + + if isinstance(feature, dict): + if is_file_dtype(intermediate_dtype): + return np.array( + [value["bytes"].as_py() for value in raw_values], dtype=object + ) + else: + return np.array([str(value) for value in raw_values]) + + if isinstance(feature, datasets.Sequence): + if is_array_dtype(intermediate_dtype): + return raw_values.to_numpy() + if is_embedding_dtype(intermediate_dtype): + return raw_values.to_numpy() + return np.array([str(value) for value in raw_values]) + + if isinstance(feature, datasets.Translation): + return np.array([str(value) for value in raw_values]) + + return raw_values.to_numpy() + + def get_column_metadata(self, _: str) -> ColumnMetadata: + return ColumnMetadata(nullable=True, editable=False) + + +def _guess_semantic_dtype(feature: _FeatureType) -> Optional[DType]: + if isinstance(feature, datasets.Audio): + return dtypes.audio_dtype + if isinstance(feature, datasets.Image): + return dtypes.image_dtype + if isinstance(feature, datasets.Sequence): + if isinstance(feature.feature, datasets.Value): + if feature.length != -1: + return dtypes.embedding_dtype + return None + + +def _get_intermediate_dtype(feature: _FeatureType) -> DType: + if isinstance(feature, datasets.Value): + hf_dtype = cast(datasets.Value, feature).dtype + if hf_dtype == "bool": + return dtypes.bool_dtype + elif hf_dtype.startswith("int"): + return dtypes.int_dtype + elif hf_dtype.startswith("uint"): + return dtypes.int_dtype + elif hf_dtype.startswith("float"): + return dtypes.float_dtype + elif hf_dtype.startswith("time32"): + return dtypes.datetime_dtype + elif hf_dtype.startswith("time64"): + return dtypes.datetime_dtype + elif hf_dtype.startswith("timestamp"): + return dtypes.datetime_dtype + elif hf_dtype.startswith("date32"): + return dtypes.datetime_dtype + elif hf_dtype.startswith("date64"): + return dtypes.datetime_dtype + elif hf_dtype.startswith("duration"): + return dtypes.float_dtype + elif hf_dtype.startswith("decimal"): + return dtypes.float_dtype + elif hf_dtype == "binary": + return dtypes.bytes_dtype + elif hf_dtype == "large_binary": + return dtypes.bytes_dtype + elif hf_dtype == "string": + return dtypes.str_dtype + elif hf_dtype == "large_string": + return dtypes.str_dtype + else: + raise UnsupportedFeature(feature) + elif isinstance(feature, datasets.ClassLabel): + return dtypes.CategoryDType(categories=cast(datasets.ClassLabel, feature).names) + elif isinstance(feature, datasets.Audio): + return dtypes.file_dtype + elif isinstance(feature, datasets.Image): + return dtypes.file_dtype + elif isinstance(feature, datasets.Sequence): + inner_dtype = _get_intermediate_dtype(feature.feature) + if is_int_dtype(inner_dtype) or is_float_dtype(inner_dtype): + return dtypes.array_dtype + else: + return dtypes.str_dtype + elif isinstance(feature, dict): + if len(feature) == 2 and "bytes" in feature and "path" in feature: + return dtypes.file_dtype + else: + return dtypes.str_dtype + elif isinstance(feature, datasets.Translation): + return dtypes.str_dtype + else: + raise UnsupportedFeature(feature) diff --git a/renumics/spotlight_plugins/core/pandas_data_source.py b/renumics/spotlight_plugins/core/pandas_data_source.py index 4d2bf172..cde0e908 100644 --- a/renumics/spotlight_plugins/core/pandas_data_source.py +++ b/renumics/spotlight_plugins/core/pandas_data_source.py @@ -124,7 +124,8 @@ def intermediate_dtypes(self) -> DTypeMap: def __len__(self) -> int: return len(self._df) - def guess_dtypes(self) -> DTypeMap: + @property + def semantic_dtypes(self) -> DTypeMap: return { str(column_name): infer_dtype(self.df[column_name]) for column_name in self.df diff --git a/src/datatypes.ts b/src/datatypes.ts index 6ac98b87..29ed7bca 100644 --- a/src/datatypes.ts +++ b/src/datatypes.ts @@ -35,7 +35,7 @@ export type IntegerDataType = BaseDataType<'int'>; export type FloatDataType = BaseDataType<'float'>; export type BooleanDataType = BaseDataType<'bool'>; export type DateTimeDataType = BaseDataType<'datetime'>; -export type ArrayDataType = BaseDataType<'array'>; +export type ArrayDataType = BaseDataType<'array', true>; export type WindowDataType = BaseDataType<'Window'>; export type StringDataType = BaseDataType<'str', true>; export type EmbeddingDataType = BaseDataType<'Embedding', true>; diff --git a/src/stores/dataset/columnFactory.ts b/src/stores/dataset/columnFactory.ts index 3ec20678..26018d58 100644 --- a/src/stores/dataset/columnFactory.ts +++ b/src/stores/dataset/columnFactory.ts @@ -11,7 +11,6 @@ function makeDatatype(column: Column): DataType { case 'float': case 'bool': case 'Window': - case 'array': case 'datetime': return { kind, @@ -20,6 +19,7 @@ function makeDatatype(column: Column): DataType { optional: column.optional, }; case 'str': + case 'array': case 'Embedding': return { kind,