From 29ce91bbbde79b7585a6d41a1e2b9761829ae31a Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Mon, 16 Oct 2023 15:21:38 +0200 Subject: [PATCH 1/8] Remove unused API model --- renumics/spotlight/data_source/data_source.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/renumics/spotlight/data_source/data_source.py b/renumics/spotlight/data_source/data_source.py index d06133f3..695acb5d 100644 --- a/renumics/spotlight/data_source/data_source.py +++ b/renumics/spotlight/data_source/data_source.py @@ -6,7 +6,6 @@ import pandas as pd import numpy as np -from pydantic.dataclasses import dataclass from renumics.spotlight.dataset.exceptions import ( ColumnExistsError, @@ -30,17 +29,6 @@ class ColumnMetadata: tags: List[str] = dataclasses.field(default_factory=list) -@dataclass -class CellsUpdate: - """ - A dataset's cell update. - """ - - value: Any - author: str - edited_at: str - - class DataSource(ABC): """abstract base class for different data sources""" @@ -61,7 +49,7 @@ def column_names(self) -> List[str]: @abstractmethod def intermediate_dtypes(self) -> DTypeMap: """ - The dtypes of intermediate values + The dtypes of intermediate values. Values for all columns must be filled. """ @property @@ -94,7 +82,7 @@ def check_generation_id(self, generation_id: int) -> None: @abstractmethod def semantic_dtypes(self) -> DTypeMap: """ - Semantic dtypes for viewer. + Semantic dtypes for viewer. Some values may be not present. 
""" @abstractmethod From f056ab8da32839cae4c451cf5f3a8f5af4a8b852 Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Tue, 17 Oct 2023 09:56:30 +0200 Subject: [PATCH 2/8] Infer pandas dtypes in data store --- renumics/spotlight/data_store.py | 64 ++++++++++++------- renumics/spotlight/io/pandas.py | 11 ---- .../core/pandas_data_source.py | 31 +++++---- 3 files changed, 57 insertions(+), 49 deletions(-) diff --git a/renumics/spotlight/data_store.py b/renumics/spotlight/data_store.py index 42867e5a..416d592c 100644 --- a/renumics/spotlight/data_store.py +++ b/renumics/spotlight/data_store.py @@ -19,13 +19,16 @@ DType, DTypeMap, EmbeddingDType, + array_dtype, is_array_dtype, is_audio_dtype, is_category_dtype, + is_embedding_dtype, is_file_dtype, is_str_dtype, is_mixed_dtype, is_bytes_dtype, + is_window_dtype, str_dtype, audio_dtype, image_dtype, @@ -130,6 +133,8 @@ def get_waveform(self, column_name: str, index: int) -> Optional[np.ndarray]: def _update_dtypes(self) -> None: guessed_dtypes = self._data_source.semantic_dtypes.copy() + print(self._data_source.intermediate_dtypes) + print(guessed_dtypes) # guess missing dtypes from intermediate dtypes for col, dtype in self._data_source.intermediate_dtypes.items(): @@ -171,33 +176,32 @@ def _guess_dtype(self, col: str) -> DType: return semantic_dtype sample_values = self._data_source.get_column_values(col, slice(10)) - sample_dtypes = [_guess_value_dtype(value) for value in sample_values] - - try: - mode_dtype = statistics.mode(sample_dtypes) - except statistics.StatisticsError: + sample_dtypes: List[DType] = [] + for value in sample_values: + guessed_dtype = _guess_value_dtype(value) + if guessed_dtype is not None: + sample_dtypes.append(guessed_dtype) + if not sample_dtypes: return semantic_dtype - return mode_dtype or semantic_dtype + mode_dtype = statistics.mode(sample_dtypes) + # For windows and embeddings, at least sample values must be aligned. 
+ if is_window_dtype(mode_dtype) and any( + not is_window_dtype(dtype) for dtype in sample_dtypes + ): + return array_dtype + if is_embedding_dtype(mode_dtype) and any( + (not is_embedding_dtype(dtype)) or dtype.length != mode_dtype.length + for dtype in sample_dtypes + ): + return array_dtype + + return mode_dtype def _intermediate_to_semantic_dtype(intermediate_dtype: DType) -> DType: if is_array_dtype(intermediate_dtype): - if intermediate_dtype.shape is None: - return intermediate_dtype - if intermediate_dtype.shape == (2,): - return window_dtype - if intermediate_dtype.ndim == 1 and intermediate_dtype.shape[0] is not None: - return EmbeddingDType(intermediate_dtype.shape[0]) - if intermediate_dtype.ndim == 1 and intermediate_dtype.shape[0] is None: - return sequence_1d_dtype - if intermediate_dtype.ndim == 2 and ( - intermediate_dtype.shape[0] == 2 or intermediate_dtype.shape[1] == 2 - ): - return sequence_1d_dtype - if intermediate_dtype.ndim == 3 and intermediate_dtype.shape[-1] in (1, 3, 4): - return image_dtype - return intermediate_dtype + return _guess_array_dtype(intermediate_dtype) if is_file_dtype(intermediate_dtype): return str_dtype if is_mixed_dtype(intermediate_dtype): @@ -248,5 +252,21 @@ def _guess_value_dtype(value: Any) -> Optional[DType]: except (TypeError, ValueError): pass else: - return ArrayDType(value.shape) + return _guess_array_dtype(ArrayDType(value.shape)) return None + + +def _guess_array_dtype(dtype: ArrayDType) -> DType: + if dtype.shape is None: + return dtype + if dtype.shape == (2,): + return window_dtype + if dtype.ndim == 1 and dtype.shape[0] is not None: + return EmbeddingDType(dtype.shape[0]) + if dtype.ndim == 1 and dtype.shape[0] is None: + return sequence_1d_dtype + if dtype.ndim == 2 and (dtype.shape[0] == 2 or dtype.shape[1] == 2): + return sequence_1d_dtype + if dtype.ndim == 3 and dtype.shape[-1] in (1, 3, 4): + return image_dtype + return dtype diff --git a/renumics/spotlight/io/pandas.py 
b/renumics/spotlight/io/pandas.py index 4cf84f9e..5663fc9a 100644 --- a/renumics/spotlight/io/pandas.py +++ b/renumics/spotlight/io/pandas.py @@ -58,17 +58,6 @@ def create_typed_series( return pd.Series([] if values is None else values, dtype=pandas_dtype) -def is_empty(value: Any) -> bool: - """ - Check if value is `NA` or an empty string. - """ - if is_iterable(value): - # `pd.isna` with an iterable argument returns an iterable result. But - # an iterable cannot be NA or empty string by default. - return False - return pd.isna(value) or value == "" - - def try_literal_eval(x: str) -> Any: """ Try to evaluate a literal expression, otherwise return value as is. diff --git a/renumics/spotlight_plugins/core/pandas_data_source.py b/renumics/spotlight_plugins/core/pandas_data_source.py index 430a69ca..7a02c94d 100644 --- a/renumics/spotlight_plugins/core/pandas_data_source.py +++ b/renumics/spotlight_plugins/core/pandas_data_source.py @@ -7,10 +7,9 @@ import numpy as np import pandas as pd import datasets -from renumics.spotlight import dtypes +from renumics.spotlight import dtypes from renumics.spotlight.io.pandas import ( - infer_dtype, prepare_hugging_face_dict, stringify_columns, try_literal_eval, @@ -23,7 +22,6 @@ from renumics.spotlight.backend.exceptions import DatasetColumnsNotUnique from renumics.spotlight.dataset.exceptions import ColumnNotExistsError from renumics.spotlight.data_source.exceptions import InvalidDataSource -from renumics.spotlight.dtypes import DTypeMap @datasource(pd.DataFrame) @@ -41,6 +39,7 @@ class PandasDataSource(DataSource): _uid: str _df: pd.DataFrame _name: str + _intermediate_dtypes: dtypes.DTypeMap def __init__(self, source: Union[Path, pd.DataFrame]): if isinstance(source, Path): @@ -99,7 +98,9 @@ def __init__(self, source: Union[Path, pd.DataFrame]): raise DatasetColumnsNotUnique() self._generation_id = 0 self._uid = str(id(df)) + print(df.dtypes) self._df = df.convert_dtypes() + print(self._df.dtypes) self._intermediate_dtypes = 
{ # TODO: convert column name col: _determine_intermediate_dtype(self._df[col]) @@ -118,18 +119,15 @@ def df(self) -> pd.DataFrame: return self._df.copy() @property - def intermediate_dtypes(self) -> DTypeMap: + def intermediate_dtypes(self) -> dtypes.DTypeMap: return self._intermediate_dtypes def __len__(self) -> int: return len(self._df) @property - def semantic_dtypes(self) -> DTypeMap: - return { - str(column_name): infer_dtype(self.df[column_name]) - for column_name in self.df - } + def semantic_dtypes(self) -> dtypes.DTypeMap: + return {} def get_generation_id(self) -> int: return self._generation_id @@ -167,12 +165,14 @@ def get_column_values( if pd.api.types.is_categorical_dtype(column): return column.cat.codes if pd.api.types.is_string_dtype(column): - values = column.to_numpy() - na_mask = column.isna() - values[na_mask] = None - return values + column = column.astype(object).mask(column.isna(), None) + str_mask = column.map(type) == str + column[str_mask] = column[str_mask].apply(try_literal_eval) + dict_mask = column.map(type) == dict + column[dict_mask] = column[dict_mask].apply(prepare_hugging_face_dict) + return column.to_numpy() if pd.api.types.is_object_dtype(column): - column = column.mask(column.isna(), None) + column = column.astype(object).mask(column.isna(), None) str_mask = column.map(type) == str column[str_mask] = column[str_mask].apply(try_literal_eval) dict_mask = column.map(type) == dict @@ -222,5 +222,4 @@ def _determine_intermediate_dtype(column: pd.Series) -> dtypes.DType: return dtypes.datetime_dtype if pd.api.types.is_string_dtype(column): return dtypes.str_dtype - else: - return dtypes.mixed_dtype + return dtypes.mixed_dtype From 44c86e566dd0a29882986f5c0b92e90d2fe98a3f Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Tue, 17 Oct 2023 10:00:57 +0200 Subject: [PATCH 3/8] Remove debug prints --- renumics/spotlight/data_store.py | 2 -- renumics/spotlight_plugins/core/pandas_data_source.py | 2 -- 2 files changed, 4 deletions(-) diff 
--git a/renumics/spotlight/data_store.py b/renumics/spotlight/data_store.py index 416d592c..6f2bd862 100644 --- a/renumics/spotlight/data_store.py +++ b/renumics/spotlight/data_store.py @@ -133,8 +133,6 @@ def get_waveform(self, column_name: str, index: int) -> Optional[np.ndarray]: def _update_dtypes(self) -> None: guessed_dtypes = self._data_source.semantic_dtypes.copy() - print(self._data_source.intermediate_dtypes) - print(guessed_dtypes) # guess missing dtypes from intermediate dtypes for col, dtype in self._data_source.intermediate_dtypes.items(): diff --git a/renumics/spotlight_plugins/core/pandas_data_source.py b/renumics/spotlight_plugins/core/pandas_data_source.py index 7a02c94d..3eee6ebe 100644 --- a/renumics/spotlight_plugins/core/pandas_data_source.py +++ b/renumics/spotlight_plugins/core/pandas_data_source.py @@ -98,9 +98,7 @@ def __init__(self, source: Union[Path, pd.DataFrame]): raise DatasetColumnsNotUnique() self._generation_id = 0 self._uid = str(id(df)) - print(df.dtypes) self._df = df.convert_dtypes() - print(self._df.dtypes) self._intermediate_dtypes = { # TODO: convert column name col: _determine_intermediate_dtype(self._df[col]) From e57108d6ec3fdaa6495e8b37b9f774144e41218e Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Mon, 23 Oct 2023 10:42:32 +0200 Subject: [PATCH 4/8] Implement __eq__ and __hash__ for dtypes --- renumics/spotlight/dtypes/__init__.py | 44 +++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/renumics/spotlight/dtypes/__init__.py b/renumics/spotlight/dtypes/__init__.py index 0e24ea10..ae96cd7e 100644 --- a/renumics/spotlight/dtypes/__init__.py +++ b/renumics/spotlight/dtypes/__init__.py @@ -36,6 +36,14 @@ def __init__(self, name: str): def __str__(self) -> str: return self.name + def __eq__(self, other: Any) -> bool: + if isinstance(other, DType): + return other._name == self._name + return False + + def __hash__(self) -> int: + return hash(self._name) + @property def name(self) -> 
str: return self._name @@ -53,8 +61,10 @@ def __init__( self, categories: Optional[Union[Iterable[str], Dict[str, int]]] = None ): super().__init__("Category") - if isinstance(categories, dict) or categories is None: - self._categories = categories + if isinstance(categories, dict): + self._categories = dict(sorted(categories.items(), key=lambda x: x[1])) + elif categories is None: + self._categories = None else: self._categories = { category: code for code, category in enumerate(categories) @@ -71,6 +81,20 @@ def __init__( category: code for code, category in self._inverted_categories.items() } + def __eq__(self, other: Any) -> bool: + if isinstance(other, CategoryDType): + return other._name == self._name and other._categories == self._categories + return False + + def __hash__(self) -> int: + if self._categories is None: + return hash(self._name) ^ hash(None) + return ( + hash(self._name) + ^ hash(tuple(self._categories.keys())) + ^ hash(tuple(self._categories.values())) + ) + @property def categories(self) -> Optional[Dict[str, int]]: return self._categories @@ -91,6 +115,14 @@ def __init__(self, shape: Optional[Tuple[Optional[int], ...]] = None): super().__init__("array") self.shape = shape + def __eq__(self, other: Any) -> bool: + if isinstance(other, ArrayDType): + return other._name == self._name and other.shape == self.shape + return False + + def __hash__(self) -> int: + return hash(self._name) ^ hash(self.shape) + @property def ndim(self) -> int: if self.shape is None: @@ -111,6 +143,14 @@ def __init__(self, length: Optional[int] = None): raise ValueError(f"Length must be non-negative, but {length} received.") self.length = length + def __eq__(self, other: Any) -> bool: + if isinstance(other, EmbeddingDType): + return other._name == self._name and other.length == self.length + return False + + def __hash__(self) -> int: + return hash(self._name) ^ hash(self.length) + class Sequence1DDType(DType): """ From 01eccccc9c1debfabddb61833de2b151df1f7436 Mon Sep 
17 00:00:00 2001 From: Alexander Druz Date: Mon, 23 Oct 2023 13:40:46 +0200 Subject: [PATCH 5/8] Move `spotlight.io.pandas` to `spotlight.dataset.pandas` --- renumics/spotlight/dataset/__init__.py | 10 +- renumics/spotlight/dtypes/__init__.py | 16 +- renumics/spotlight/io/__init__.py | 16 + renumics/spotlight/io/pandas.py | 316 ------------------ .../core/pandas_data_source.py | 8 +- tests/integration/dataset/test_dataset.py | 2 +- 6 files changed, 34 insertions(+), 334 deletions(-) delete mode 100644 renumics/spotlight/io/pandas.py diff --git a/renumics/spotlight/dataset/__init__.py b/renumics/spotlight/dataset/__init__.py index 99828ddf..e586499d 100644 --- a/renumics/spotlight/dataset/__init__.py +++ b/renumics/spotlight/dataset/__init__.py @@ -32,12 +32,7 @@ from typing_extensions import TypeGuard from renumics.spotlight.__version__ import __version__ -from renumics.spotlight.io.pandas import ( - infer_dtypes, - prepare_column, - is_string_mask, - stringify_columns, -) +from .pandas import create_typed_series, infer_dtypes, is_string_mask, prepare_column from renumics.spotlight.typing import ( BoolType, IndexType, @@ -47,7 +42,6 @@ is_integer, is_iterable, ) -from renumics.spotlight.io.pandas import create_typed_series from renumics.spotlight.dtypes.conversion import prepare_path_or_url from renumics.spotlight import dtypes as spotlight_dtypes @@ -738,7 +732,7 @@ def from_pandas( df = df.reset_index(level=df.index.names) # type: ignore else: df = df.copy() - df.columns = pd.Index(stringify_columns(df)) + df.columns = pd.Index([str(column) for column in df.columns]) if dtypes is None: dtypes = {} diff --git a/renumics/spotlight/dtypes/__init__.py b/renumics/spotlight/dtypes/__init__.py index ae96cd7e..63910215 100644 --- a/renumics/spotlight/dtypes/__init__.py +++ b/renumics/spotlight/dtypes/__init__.py @@ -9,6 +9,8 @@ __all__ = [ "CategoryDType", + "ArrayDType", + "EmbeddingDType", "Sequence1DDType", "bool_dtype", "int_dtype", @@ -83,7 +85,7 @@ def __init__( 
def __eq__(self, other: Any) -> bool: if isinstance(other, CategoryDType): - return other._name == self._name and other._categories == self._categories + return other._categories == self._categories return False def __hash__(self) -> int: @@ -117,7 +119,7 @@ def __init__(self, shape: Optional[Tuple[Optional[int], ...]] = None): def __eq__(self, other: Any) -> bool: if isinstance(other, ArrayDType): - return other._name == self._name and other.shape == self.shape + return other.shape == self.shape return False def __hash__(self) -> int: @@ -145,7 +147,7 @@ def __init__(self, length: Optional[int] = None): def __eq__(self, other: Any) -> bool: if isinstance(other, EmbeddingDType): - return other._name == self._name and other.length == self.length + return other.length == self.length return False def __hash__(self) -> int: @@ -165,6 +167,14 @@ def __init__(self, x_label: str = "x", y_label: str = "y"): self.x_label = x_label self.y_label = y_label + def __eq__(self, other: Any) -> bool: + if isinstance(other, Sequence1DDType): + return other.x_label == self.x_label and other.y_label == self.y_label + return False + + def __hash__(self) -> int: + return hash(self._name) ^ hash(self.x_label) ^ hash(self.y_label) + ALIASES: Dict[Any, DType] = {} diff --git a/renumics/spotlight/io/__init__.py b/renumics/spotlight/io/__init__.py index 2d2a6d26..8162a843 100644 --- a/renumics/spotlight/io/__init__.py +++ b/renumics/spotlight/io/__init__.py @@ -1,6 +1,9 @@ """ Reading and writing of different data formats. 
""" +import ast +from contextlib import suppress +from typing import Any from .audio import ( get_format_codec, @@ -19,6 +22,8 @@ decode_gltf_arrays, encode_gltf_array, ) +from .huggingface import prepare_hugging_face_dict + __all__ = [ "get_format_codec", @@ -34,4 +39,15 @@ "check_gltf", "decode_gltf_arrays", "encode_gltf_array", + "prepare_hugging_face_dict", + "try_literal_eval", ] + + +def try_literal_eval(x: str) -> Any: + """ + Try to evaluate a literal expression, otherwise return value as is. + """ + with suppress(Exception): + return ast.literal_eval(x) + return x diff --git a/renumics/spotlight/io/pandas.py b/renumics/spotlight/io/pandas.py deleted file mode 100644 index 5663fc9a..00000000 --- a/renumics/spotlight/io/pandas.py +++ /dev/null @@ -1,316 +0,0 @@ -""" -This module contains helpers for importing `pandas.DataFrame`s. -""" - -import ast -import os.path -import statistics -from contextlib import suppress -from typing import Any, Dict, List, Optional, Sequence, Union - -import PIL.Image -import filetype -import trimesh -import numpy as np -import pandas as pd - -from renumics.spotlight.dtypes import ( - Audio, - Embedding, - Image, - Mesh, - Sequence1D, - Video, -) -from renumics.spotlight.media.exceptions import UnsupportedDType -from renumics.spotlight.typing import is_iterable, is_pathtype -from renumics.spotlight import dtypes - - -def create_typed_series( - dtype: dtypes.DType, values: Optional[Union[Sequence, np.ndarray]] = None -) -> pd.Series: - if dtypes.is_category_dtype(dtype): - if values is None or len(values) == 0: - return pd.Series( - dtype=pd.CategoricalDtype( - [] if not dtype.categories else list(dtype.categories.keys()) - ) - ) - if dtype.inverted_categories is None: - return pd.Series([None] * len(values), dtype=pd.CategoricalDtype()) - return pd.Series( - [dtype.inverted_categories.get(code) for code in values], - dtype=pd.CategoricalDtype(), - ) - if dtypes.is_bool_dtype(dtype): - pandas_dtype = "boolean" - elif 
dtypes.is_int_dtype(dtype): - pandas_dtype = "Int64" - elif dtypes.is_float_dtype(dtype): - pandas_dtype = "float" - elif dtypes.is_str_dtype(dtype): - pandas_dtype = "string" - elif dtypes.is_datetime_dtype(dtype): - pandas_dtype = "datetime64[ns]" - else: - pandas_dtype = "object" - return pd.Series([] if values is None else values, dtype=pandas_dtype) - - -def try_literal_eval(x: str) -> Any: - """ - Try to evaluate a literal expression, otherwise return value as is. - """ - with suppress(Exception): - return ast.literal_eval(x) - return x - - -def stringify_columns(df: pd.DataFrame) -> List[str]: - """ - Convert `pandas.DataFrame`'s column names to strings, no matter which index - is used. - """ - return [str(column_name) for column_name in df.columns] - - -def infer_dtype(column: pd.Series) -> dtypes.DType: - """ - Get an equivalent Spotlight data type for a `pandas` column, if possible. - - At the moment, only scalar data types can be inferred. - - Nullable boolean and integer `pandas` dtypes have no equivalent Spotlight - data type and will be read as strings. - - Float, string, and category data types are allowed to have `NaN`s. - - Args: - column: A `pandas` column to infer dtype from. - - Returns: - Inferred dtype. - - Raises: - ValueError: If dtype cannot be inferred automatically. 
- """ - - if pd.api.types.is_bool_dtype(column): - return dtypes.bool_dtype - if pd.api.types.is_categorical_dtype(column): - return dtypes.CategoryDType( - {category: code for code, category in enumerate(column.cat.categories)} - ) - if pd.api.types.is_integer_dtype(column): - return dtypes.int_dtype - if pd.api.types.is_float_dtype(column): - return dtypes.float_dtype - if pd.api.types.is_datetime64_any_dtype(column): - return dtypes.datetime_dtype - - column = column.copy() - str_mask = is_string_mask(column) - column[str_mask] = column[str_mask].replace("", None) - - column = column[~column.isna()] - if len(column) == 0: - return dtypes.str_dtype - - column_head = column.iloc[:10] - head_dtypes = column_head.apply(infer_value_dtype).to_list() # type: ignore - dtype_mode = statistics.mode(head_dtypes) - - if dtype_mode is None: - return dtypes.str_dtype - if dtype_mode in [dtypes.window_dtype, dtypes.embedding_dtype]: - column = column.astype(object) - str_mask = is_string_mask(column) - x = column[str_mask].apply(try_literal_eval) - column[str_mask] = x - dict_mask = column.map(type) == dict - column[dict_mask] = column[dict_mask].apply(prepare_hugging_face_dict) - try: - np.asarray(column.to_list(), dtype=float) - except (TypeError, ValueError): - return dtypes.sequence_1d_dtype - return dtype_mode - return dtype_mode - - -def infer_value_dtype(value: Any) -> Optional[dtypes.DType]: - """ - Infer dtype for value - """ - if isinstance(value, Embedding): - return dtypes.embedding_dtype - if isinstance(value, Sequence1D): - return dtypes.sequence_1d_dtype - if isinstance(value, Image): - return dtypes.image_dtype - if isinstance(value, Audio): - return dtypes.audio_dtype - if isinstance(value, Video): - return dtypes.video_dtype - if isinstance(value, Mesh): - return dtypes.mesh_dtype - if isinstance(value, PIL.Image.Image): - return dtypes.image_dtype - if isinstance(value, trimesh.Trimesh): - return dtypes.mesh_dtype - if isinstance(value, np.ndarray): - return 
infer_array_dtype(value) - - # When `pandas` reads a csv, arrays and lists are read as literal strings, - # try to interpret them. - value = try_literal_eval(value) - if isinstance(value, dict): - value = prepare_hugging_face_dict(value) - if isinstance(value, bytes) or (is_pathtype(value) and os.path.isfile(value)): - kind = filetype.guess(value) - if kind is not None: - mime_group = kind.mime.split("/")[0] - if mime_group == "image": - return dtypes.image_dtype - if mime_group == "audio": - return dtypes.audio_dtype - if mime_group == "video": - return dtypes.video_dtype - return None - if is_iterable(value): - try: - value = np.asarray(value, dtype=float) - except (TypeError, ValueError): - pass - else: - return infer_array_dtype(value) - return None - - -def infer_array_dtype(value: np.ndarray) -> dtypes.DType: - """ - Infer dtype of a numpy array - """ - if value.ndim == 3: - if value.shape[-1] in (1, 3, 4): - return dtypes.image_dtype - elif value.ndim == 2: - if value.shape[0] == 2 or value.shape[1] == 2: - return dtypes.sequence_1d_dtype - elif value.ndim == 1: - if len(value) == 2: - return dtypes.window_dtype - return dtypes.embedding_dtype - return dtypes.array_dtype - - -def infer_dtypes(df: pd.DataFrame, dtype: Optional[dtypes.DTypeMap]) -> dtypes.DTypeMap: - """ - Check column types from the given `dtype` and complete it with auto inferred - column types for the given `pandas.DataFrame`. - """ - inferred_dtype = dtype or {} - for column_index in df: - if column_index not in inferred_dtype: - try: - column_type = infer_dtype(df[column_index]) - except UnsupportedDType: - column_type = dtypes.str_dtype - inferred_dtype[str(column_index)] = column_type - return inferred_dtype - - -def is_string_mask(column: pd.Series) -> pd.Series: - """ - Return mask of column's elements of type string. 
- """ - if len(column) == 0: - return pd.Series([], dtype=bool) - return column.map(type) == str - - -def to_categorical(column: pd.Series, str_categories: bool = False) -> pd.Series: - """ - Convert a `pandas` column to categorical dtype. - - Args: - column: A `pandas` column. - str_categories: Replace all categories with their string representations. - - Returns: - categorical `pandas` column. - """ - column = column.mask(column.isna(), None).astype("category") # type: ignore - if str_categories: - return column.cat.rename_categories(column.cat.categories.astype(str)) - return column - - -def prepare_hugging_face_dict(x: Dict) -> Any: - """ - Prepare HuggingFace format for files to be used in Spotlight. - """ - if x.keys() != {"bytes", "path"}: - return x - blob = x["bytes"] - if blob is not None: - return blob - return x["path"] - - -def prepare_column(column: pd.Series, dtype: dtypes.DType) -> pd.Series: - """ - Convert a `pandas` column to the desired `dtype` and prepare some values, - but still as `pandas` column. - - Args: - column: A `pandas` column to prepare. - dtype: Target data type. - - Returns: - Prepared `pandas` column. - - Raises: - TypeError: If `dtype` is not a Spotlight data type. - """ - column = column.copy() - - if dtypes.is_category_dtype(dtype): - # We only support string/`NA` categories, but `pandas` can more, so - # force categories to be strings (does not affect `NA`s). - return to_categorical(column, str_categories=True) - - if dtypes.is_datetime_dtype(dtype): - # `errors="coerce"` will produce `NaT`s instead of fail. - return pd.to_datetime(column, errors="coerce") - - if dtypes.is_str_dtype(dtype): - # Allow `NA`s, convert all other elements to strings. 
- return column.astype(str).mask(column.isna(), None) # type: ignore - - if dtypes.is_bool_dtype(dtype): - return column.astype(bool) - - if dtypes.is_int_dtype(dtype): - return column.astype(int) - - if dtypes.is_float_dtype(dtype): - return column.astype(float) - - # We explicitely don't want to change the original `DataFrame`. - with pd.option_context("mode.chained_assignment", None): - # We consider empty strings as `NA`s. - str_mask = is_string_mask(column) - column[str_mask] = column[str_mask].replace("", None) - na_mask = column.isna() - - # When `pandas` reads a csv, arrays and lists are read as literal strings, - # try to interpret them. - str_mask = is_string_mask(column) - column[str_mask] = column[str_mask].apply(try_literal_eval) - - if dtypes.is_filebased_dtype(dtype): - dict_mask = column.map(type) == dict - column[dict_mask] = column[dict_mask].apply(prepare_hugging_face_dict) - - return column.mask(na_mask, None) # type: ignore diff --git a/renumics/spotlight_plugins/core/pandas_data_source.py b/renumics/spotlight_plugins/core/pandas_data_source.py index 3eee6ebe..4404a14d 100644 --- a/renumics/spotlight_plugins/core/pandas_data_source.py +++ b/renumics/spotlight_plugins/core/pandas_data_source.py @@ -9,11 +9,7 @@ import datasets from renumics.spotlight import dtypes -from renumics.spotlight.io.pandas import ( - prepare_hugging_face_dict, - stringify_columns, - try_literal_eval, -) +from renumics.spotlight.io import prepare_hugging_face_dict, try_literal_eval from renumics.spotlight.data_source import ( datasource, ColumnMetadata, @@ -107,7 +103,7 @@ def __init__(self, source: Union[Path, pd.DataFrame]): @property def column_names(self) -> List[str]: - return stringify_columns(self._df) + return [str(column) for column in self._df.columns] @property def df(self) -> pd.DataFrame: diff --git a/tests/integration/dataset/test_dataset.py b/tests/integration/dataset/test_dataset.py index 10f7ed75..fe28c8fc 100644 --- 
a/tests/integration/dataset/test_dataset.py +++ b/tests/integration/dataset/test_dataset.py @@ -26,7 +26,7 @@ from renumics.spotlight.dataset import escape_dataset_name, unescape_dataset_name from renumics.spotlight import dtypes from renumics.spotlight.dataset.typing import OutputType -from renumics.spotlight.io.pandas import infer_dtype +from renumics.spotlight.dataset.pandas import infer_dtype from .conftest import ColumnData from .helpers import get_append_column_fn_name from ..helpers import approx From a4d95a2b7dcce013c93bb287f1807f7023c169ca Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Mon, 23 Oct 2023 13:41:16 +0200 Subject: [PATCH 6/8] Track forgotten files --- renumics/spotlight/dataset/pandas.py | 275 +++++++++++++++++++++++++++ renumics/spotlight/io/huggingface.py | 16 ++ 2 files changed, 291 insertions(+) create mode 100644 renumics/spotlight/dataset/pandas.py create mode 100644 renumics/spotlight/io/huggingface.py diff --git a/renumics/spotlight/dataset/pandas.py b/renumics/spotlight/dataset/pandas.py new file mode 100644 index 00000000..dfa133e3 --- /dev/null +++ b/renumics/spotlight/dataset/pandas.py @@ -0,0 +1,275 @@ +import os.path +import statistics +from typing import Any, Optional, Sequence, Union + +import PIL.Image +import filetype +import numpy as np +import pandas as pd +import trimesh + +from renumics.spotlight import dtypes +from renumics.spotlight.io import prepare_hugging_face_dict, try_literal_eval +from renumics.spotlight.media import Audio, Embedding, Image, Mesh, Sequence1D, Video +from renumics.spotlight.typing import is_iterable, is_pathtype +from .exceptions import InvalidDTypeError + + +def create_typed_series( + dtype: dtypes.DType, values: Optional[Union[Sequence, np.ndarray]] = None +) -> pd.Series: + if dtypes.is_category_dtype(dtype): + if values is None or len(values) == 0: + return pd.Series( + dtype=pd.CategoricalDtype( + [] if not dtype.categories else list(dtype.categories.keys()) + ) + ) + if 
dtype.inverted_categories is None: + return pd.Series([None] * len(values), dtype=pd.CategoricalDtype()) + return pd.Series( + [dtype.inverted_categories.get(code) for code in values], + dtype=pd.CategoricalDtype(), + ) + if dtypes.is_bool_dtype(dtype): + pandas_dtype = "boolean" + elif dtypes.is_int_dtype(dtype): + pandas_dtype = "Int64" + elif dtypes.is_float_dtype(dtype): + pandas_dtype = "float" + elif dtypes.is_str_dtype(dtype): + pandas_dtype = "string" + elif dtypes.is_datetime_dtype(dtype): + pandas_dtype = "datetime64[ns]" + else: + pandas_dtype = "object" + return pd.Series([] if values is None else values, dtype=pandas_dtype) + + +def prepare_column(column: pd.Series, dtype: dtypes.DType) -> pd.Series: + """ + Convert a `pandas` column to the desired `dtype` and prepare some values, + but still as `pandas` column. + + Args: + column: A `pandas` column to prepare. + dtype: Target data type. + + Returns: + Prepared `pandas` column. + + Raises: + TypeError: If `dtype` is not a Spotlight data type. + """ + column = column.copy() + + if dtypes.is_category_dtype(dtype): + # We only support string/`NA` categories, but `pandas` can more, so + # force categories to be strings (does not affect `NA`s). + return to_categorical(column, str_categories=True) + + if dtypes.is_datetime_dtype(dtype): + # `errors="coerce"` will produce `NaT`s instead of fail. + return pd.to_datetime(column, errors="coerce") + + if dtypes.is_str_dtype(dtype): + # Allow `NA`s, convert all other elements to strings. + return column.astype(str).mask(column.isna(), None) # type: ignore + + if dtypes.is_bool_dtype(dtype): + return column.astype(bool) + + if dtypes.is_int_dtype(dtype): + return column.astype(int) + + if dtypes.is_float_dtype(dtype): + return column.astype(float) + + # We explicitely don't want to change the original `DataFrame`. + with pd.option_context("mode.chained_assignment", None): + # We consider empty strings as `NA`s. 
+ str_mask = is_string_mask(column) + column[str_mask] = column[str_mask].replace("", None) + na_mask = column.isna() + + # When `pandas` reads a csv, arrays and lists are read as literal strings, + # try to interpret them. + str_mask = is_string_mask(column) + column[str_mask] = column[str_mask].apply(try_literal_eval) + + if dtypes.is_filebased_dtype(dtype): + dict_mask = column.map(type) == dict + column[dict_mask] = column[dict_mask].apply(prepare_hugging_face_dict) + + return column.mask(na_mask, None) # type: ignore + + +def infer_dtype(column: pd.Series) -> dtypes.DType: + """ + Get an equivalent Spotlight data type for a `pandas` column, if possible. + + At the moment, only scalar data types can be inferred. + + Nullable boolean and integer `pandas` dtypes have no equivalent Spotlight + data type and will be read as strings. + + Float, string, and category data types are allowed to have `NaN`s. + + Args: + column: A `pandas` column to infer dtype from. + + Returns: + Inferred dtype. + + Raises: + ValueError: If dtype cannot be inferred automatically. 
+ """ + + if pd.api.types.is_bool_dtype(column): + return dtypes.bool_dtype + if pd.api.types.is_categorical_dtype(column): + return dtypes.CategoryDType( + {category: code for code, category in enumerate(column.cat.categories)} + ) + if pd.api.types.is_integer_dtype(column): + return dtypes.int_dtype + if pd.api.types.is_float_dtype(column): + return dtypes.float_dtype + if pd.api.types.is_datetime64_any_dtype(column): + return dtypes.datetime_dtype + + column = column.copy() + str_mask = is_string_mask(column) + column[str_mask] = column[str_mask].replace("", None) + + column = column[~column.isna()] + if len(column) == 0: + return dtypes.str_dtype + + column_head = column.iloc[:10] + head_dtypes = column_head.apply(infer_value_dtype).to_list() # type: ignore + dtype_mode = statistics.mode(head_dtypes) + + if dtype_mode is None: + return dtypes.str_dtype + if dtype_mode in [dtypes.window_dtype, dtypes.embedding_dtype]: + column = column.astype(object) + str_mask = is_string_mask(column) + x = column[str_mask].apply(try_literal_eval) + column[str_mask] = x + dict_mask = column.map(type) == dict + column[dict_mask] = column[dict_mask].apply(prepare_hugging_face_dict) + try: + np.asarray(column.to_list(), dtype=float) + except (TypeError, ValueError): + return dtypes.sequence_1d_dtype + return dtype_mode + return dtype_mode + + +def infer_value_dtype(value: Any) -> Optional[dtypes.DType]: + """ + Infer dtype for value + """ + if isinstance(value, Embedding): + return dtypes.embedding_dtype + if isinstance(value, Sequence1D): + return dtypes.sequence_1d_dtype + if isinstance(value, Image): + return dtypes.image_dtype + if isinstance(value, Audio): + return dtypes.audio_dtype + if isinstance(value, Video): + return dtypes.video_dtype + if isinstance(value, Mesh): + return dtypes.mesh_dtype + if isinstance(value, PIL.Image.Image): + return dtypes.image_dtype + if isinstance(value, trimesh.Trimesh): + return dtypes.mesh_dtype + if isinstance(value, np.ndarray): + return 
infer_array_dtype(value) + + # When `pandas` reads a csv, arrays and lists are read as literal strings, + # try to interpret them. + value = try_literal_eval(value) + if isinstance(value, dict): + value = prepare_hugging_face_dict(value) + if isinstance(value, bytes) or (is_pathtype(value) and os.path.isfile(value)): + kind = filetype.guess(value) + if kind is not None: + mime_group = kind.mime.split("/")[0] + if mime_group == "image": + return dtypes.image_dtype + if mime_group == "audio": + return dtypes.audio_dtype + if mime_group == "video": + return dtypes.video_dtype + return None + if is_iterable(value): + try: + value = np.asarray(value, dtype=float) + except (TypeError, ValueError): + pass + else: + return infer_array_dtype(value) + return None + + +def infer_array_dtype(value: np.ndarray) -> dtypes.DType: + """ + Infer dtype of a numpy array + """ + if value.ndim == 3: + if value.shape[-1] in (1, 3, 4): + return dtypes.image_dtype + elif value.ndim == 2: + if value.shape[0] == 2 or value.shape[1] == 2: + return dtypes.sequence_1d_dtype + elif value.ndim == 1: + if len(value) == 2: + return dtypes.window_dtype + return dtypes.embedding_dtype + return dtypes.array_dtype + + +def infer_dtypes(df: pd.DataFrame, dtype: Optional[dtypes.DTypeMap]) -> dtypes.DTypeMap: + """ + Check column types from the given `dtype` and complete it with auto inferred + column types for the given `pandas.DataFrame`. + """ + inferred_dtype = dtype or {} + for column_index in df: + if column_index not in inferred_dtype: + try: + column_type = infer_dtype(df[column_index]) + except InvalidDTypeError: + column_type = dtypes.str_dtype + inferred_dtype[str(column_index)] = column_type + return inferred_dtype + + +def is_string_mask(column: pd.Series) -> pd.Series: + """ + Return mask of column's elements of type string. 
+ """ + if len(column) == 0: + return pd.Series([], dtype=bool) + return column.map(type) == str + + +def to_categorical(column: pd.Series, str_categories: bool = False) -> pd.Series: + """ + Convert a `pandas` column to categorical dtype. + + Args: + column: A `pandas` column. + str_categories: Replace all categories with their string representations. + + Returns: + categorical `pandas` column. + """ + column = column.mask(column.isna(), None).astype("category") # type: ignore + if str_categories: + return column.cat.rename_categories(column.cat.categories.astype(str)) + return column diff --git a/renumics/spotlight/io/huggingface.py b/renumics/spotlight/io/huggingface.py new file mode 100644 index 00000000..06c0d441 --- /dev/null +++ b/renumics/spotlight/io/huggingface.py @@ -0,0 +1,16 @@ +""" +Helpers for HuggingFace formats. +""" +from typing import Any, Dict + + +def prepare_hugging_face_dict(x: Dict) -> Any: + """ + Prepare HuggingFace format for files to be used in Spotlight. + """ + if x.keys() != {"bytes", "path"}: + return x + blob = x["bytes"] + if blob is not None: + return blob + return x["path"] From d172873800a820e32092316198efa4d2da73a44c Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Mon, 23 Oct 2023 13:57:08 +0200 Subject: [PATCH 7/8] Add docstring --- renumics/spotlight/dataset/pandas.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/renumics/spotlight/dataset/pandas.py b/renumics/spotlight/dataset/pandas.py index dfa133e3..75ccf00f 100644 --- a/renumics/spotlight/dataset/pandas.py +++ b/renumics/spotlight/dataset/pandas.py @@ -1,3 +1,7 @@ +""" +Helper for conversion between H5 dataset and `pandas.DataFrame`. 
+"""
+
 import os.path
 import statistics
 from typing import Any, Optional, Sequence, Union
From 0ef701f654d2c0de14f76d9658620d6f45597624 Mon Sep 17 00:00:00 2001
From: Alexander Druz
Date: Tue, 24 Oct 2023 07:54:04 +0200
Subject: [PATCH 8/8] Format md files

---
 CONTRIBUTING.md |  8 ++++----
 README.md       | 15 +++++++--------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d6628c4f..e71d164e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -11,10 +11,10 @@ Technical details on how to contribute can be found in our [documentation](https
 
 There are several ways you can contribute to Spotlight:
 
-* Fix outstanding issues.
-* Implement new features.
-* Submit issues related to bugs or desired new features.
-* Share your use case
+- Fix outstanding issues.
+- Implement new features.
+- Submit issues related to bugs or desired new features.
+- Share your use case
 
 If you don't know where to start, you might want to have a look at [hacktoberfest issues](https://github.com/Renumics/spotlight/issues?q=is%3Aissue+is%3Aopen+label%3Ahacktoberfest) and our guide on how to create a [new Lens](https://renumics.com/docs/development/lenses).
 
diff --git a/README.md b/README.md
index 6c68d669..320f9e13 100644
--- a/README.md
+++ b/README.md
@@ -17,9 +17,10 @@

-Spotlight helps you to **understand unstructured datasets** fast. You can quickly create **interactive visualizations** and leverage data enrichments (e.g. embeddings, prediction, uncertainties) to **identify critical clusters** in your data. +Spotlight helps you to **understand unstructured datasets** fast. You can quickly create **interactive visualizations** and leverage data enrichments (e.g. embeddings, prediction, uncertainties) to **identify critical clusters** in your data. Spotlight supports most unstructured data types including **images, audio, text, videos, time-series and geometric data**. You can start from your existing dataframe: +

And start Spotlight with just a few lines of code: @@ -49,7 +50,7 @@ Machine learning and engineering teams use Spotlight to understand and communica [Classification] Find Issues in Any Image Classification Dataset 👨‍đŸ’ģ 📝 🕹ī¸ - + Find data issues in the CIFAR-100 image dataset 🕹ī¸ @@ -91,7 +92,6 @@ Machine learning and engineering teams use Spotlight to understand and communica - ## ⏱ī¸ Quickstart Get started by installing Spotlight and loading your first dataset. @@ -132,12 +132,11 @@ ds = datasets.load_dataset('renumics/emodb-enriched', split='all') layout= spotlight.layouts.debug_classification(label='gender', prediction='m1_gender_prediction', embedding='m1_embedding', features=['age', 'emotion']) spotlight.show(ds, layout=layout) ``` + Here, the data types are discovered automatically from the dataset and we use a pre-defined layout for model debugging. Custom layouts can be built programmatically or via the UI. > The `datasets[audio]` package can be installed via pip. - - #### Usage Tracking We have added crash report and performance collection. We do NOT collect user data other than an anonymized Machine Id obtained by py-machineid, and only log our own actions. We do NOT collect folder names, dataset names, or row data of any kind only aggregate performance statistics like total time of a table_load, crash data, etc. Collecting Spotlight crashes will help us improve stability. To opt out of the crash report collection define an environment variable called `SPOTLIGHT_OPT_OUT` and set it to true. e.G.`export SPOTLIGHT_OPT_OUT=true` @@ -150,9 +149,9 @@ We have added crash report and performance collection. 
We do NOT collect user da ## Learn more about unstructured data workflows -- 🤗 [Huggingface](https://huggingface.co/renumics) example spaces and datasets -- 🏀 [Playbook](https://renumics.com/docs/playbook/) for data-centric AI workflows -- 🍰 [Sliceguard](https://github.com/Renumics/sliceguard) library for automatic slice detection +- 🤗 [Huggingface](https://huggingface.co/renumics) example spaces and datasets +- 🏀 [Playbook](https://renumics.com/docs/playbook/) for data-centric AI workflows +- 🍰 [Sliceguard](https://github.com/Renumics/sliceguard) library for automatic slice detection ## Contribute