diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d6628c4f..e71d164e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,10 +11,10 @@ Technical details on how to contribute can be found in our [documentation](https There are several ways you can contribute to Spotlight: -* Fix outstanding issues. -* Implement new features. -* Submit issues related to bugs or desired new features. -* Share your use case +- Fix outstanding issues. +- Implement new features. +- Submit issues related to bugs or desired new features. +- Share your use case If you don't know where to start, you might want to have a look at [hacktoberfest issues](https://github.com/Renumics/spotlight/issues?q=is%3Aissue+is%3Aopen+label%3Ahacktoberfest) and our guide on how to create a [new Lens](https://renumics.com/docs/development/lenses). diff --git a/README.md b/README.md index 6c68d669..320f9e13 100644 --- a/README.md +++ b/README.md @@ -17,9 +17,10 @@

-Spotlight helps you to **understand unstructured datasets** fast. You can quickly create **interactive visualizations** and leverage data enrichments (e.g. embeddings, prediction, uncertainties) to **identify critical clusters** in your data. +Spotlight helps you to **understand unstructured datasets** fast. You can quickly create **interactive visualizations** and leverage data enrichments (e.g. embeddings, prediction, uncertainties) to **identify critical clusters** in your data. Spotlight supports most unstructured data types including **images, audio, text, videos, time-series and geometric data**. You can start from your existing dataframe: +

And start Spotlight with just a few lines of code: @@ -49,7 +50,7 @@ Machine learning and engineering teams use Spotlight to understand and communica [Classification] Find Issues in Any Image Classification Dataset 👨‍💻 📝 🕹️ - + Find data issues in the CIFAR-100 image dataset 🕹️ @@ -91,7 +92,6 @@ Machine learning and engineering teams use Spotlight to understand and communica - ## ⏱️ Quickstart Get started by installing Spotlight and loading your first dataset. @@ -132,12 +132,11 @@ ds = datasets.load_dataset('renumics/emodb-enriched', split='all') layout= spotlight.layouts.debug_classification(label='gender', prediction='m1_gender_prediction', embedding='m1_embedding', features=['age', 'emotion']) spotlight.show(ds, layout=layout) ``` + Here, the data types are discovered automatically from the dataset and we use a pre-defined layout for model debugging. Custom layouts can be built programmatically or via the UI. > The `datasets[audio]` package can be installed via pip. - - #### Usage Tracking We have added crash report and performance collection. We do NOT collect user data other than an anonymized Machine Id obtained by py-machineid, and only log our own actions. We do NOT collect folder names, dataset names, or row data of any kind only aggregate performance statistics like total time of a table_load, crash data, etc. Collecting Spotlight crashes will help us improve stability. To opt out of the crash report collection define an environment variable called `SPOTLIGHT_OPT_OUT` and set it to true. e.G.`export SPOTLIGHT_OPT_OUT=true` @@ -150,9 +149,9 @@ We have added crash report and performance collection. 
We do NOT collect user da ## Learn more about unstructured data workflows -- 🤗 [Huggingface](https://huggingface.co/renumics) example spaces and datasets -- 🏀 [Playbook](https://renumics.com/docs/playbook/) for data-centric AI workflows -- 🍰 [Sliceguard](https://github.com/Renumics/sliceguard) library for automatic slice detection +- 🤗 [Huggingface](https://huggingface.co/renumics) example spaces and datasets +- 🏀 [Playbook](https://renumics.com/docs/playbook/) for data-centric AI workflows +- 🍰 [Sliceguard](https://github.com/Renumics/sliceguard) library for automatic slice detection ## Contribute diff --git a/renumics/spotlight/data_source/data_source.py b/renumics/spotlight/data_source/data_source.py index d06133f3..695acb5d 100644 --- a/renumics/spotlight/data_source/data_source.py +++ b/renumics/spotlight/data_source/data_source.py @@ -6,7 +6,6 @@ import pandas as pd import numpy as np -from pydantic.dataclasses import dataclass from renumics.spotlight.dataset.exceptions import ( ColumnExistsError, @@ -30,17 +29,6 @@ class ColumnMetadata: tags: List[str] = dataclasses.field(default_factory=list) -@dataclass -class CellsUpdate: - """ - A dataset's cell update. - """ - - value: Any - author: str - edited_at: str - - class DataSource(ABC): """abstract base class for different data sources""" @@ -61,7 +49,7 @@ def column_names(self) -> List[str]: @abstractmethod def intermediate_dtypes(self) -> DTypeMap: """ - The dtypes of intermediate values + The dtypes of intermediate values. Values for all columns must be filled. """ @property @@ -94,7 +82,7 @@ def check_generation_id(self, generation_id: int) -> None: @abstractmethod def semantic_dtypes(self) -> DTypeMap: """ - Semantic dtypes for viewer. + Semantic dtypes for viewer. Some values may be not present. 
""" @abstractmethod diff --git a/renumics/spotlight/data_store.py b/renumics/spotlight/data_store.py index 2eb0108b..ba940254 100644 --- a/renumics/spotlight/data_store.py +++ b/renumics/spotlight/data_store.py @@ -21,13 +21,16 @@ DType, DTypeMap, EmbeddingDType, + array_dtype, is_array_dtype, is_audio_dtype, is_category_dtype, + is_embedding_dtype, is_file_dtype, is_str_dtype, is_mixed_dtype, is_bytes_dtype, + is_window_dtype, str_dtype, audio_dtype, image_dtype, @@ -173,33 +176,32 @@ def _guess_dtype(self, col: str) -> DType: return semantic_dtype sample_values = self._data_source.get_column_values(col, slice(10)) - sample_dtypes = [_guess_value_dtype(value) for value in sample_values] - - try: - mode_dtype = statistics.mode(sample_dtypes) - except statistics.StatisticsError: + sample_dtypes: List[DType] = [] + for value in sample_values: + guessed_dtype = _guess_value_dtype(value) + if guessed_dtype is not None: + sample_dtypes.append(guessed_dtype) + if not sample_dtypes: return semantic_dtype - return mode_dtype or semantic_dtype + mode_dtype = statistics.mode(sample_dtypes) + # For windows and embeddings, at least sample values must be aligned. 
+ if is_window_dtype(mode_dtype) and any( + not is_window_dtype(dtype) for dtype in sample_dtypes + ): + return array_dtype + if is_embedding_dtype(mode_dtype) and any( + (not is_embedding_dtype(dtype)) or dtype.length != mode_dtype.length + for dtype in sample_dtypes + ): + return array_dtype + + return mode_dtype def _intermediate_to_semantic_dtype(intermediate_dtype: DType) -> DType: if is_array_dtype(intermediate_dtype): - if intermediate_dtype.shape is None: - return intermediate_dtype - if intermediate_dtype.shape == (2,): - return window_dtype - if intermediate_dtype.ndim == 1 and intermediate_dtype.shape[0] is not None: - return EmbeddingDType(intermediate_dtype.shape[0]) - if intermediate_dtype.ndim == 1 and intermediate_dtype.shape[0] is None: - return sequence_1d_dtype - if intermediate_dtype.ndim == 2 and ( - intermediate_dtype.shape[0] == 2 or intermediate_dtype.shape[1] == 2 - ): - return sequence_1d_dtype - if intermediate_dtype.ndim == 3 and intermediate_dtype.shape[-1] in (1, 3, 4): - return image_dtype - return intermediate_dtype + return _guess_array_dtype(intermediate_dtype) if is_file_dtype(intermediate_dtype): return str_dtype if is_mixed_dtype(intermediate_dtype): @@ -262,5 +264,21 @@ def _guess_value_dtype(value: Any) -> Optional[DType]: except (TypeError, ValueError): pass else: - return ArrayDType(value.shape) + return _guess_array_dtype(ArrayDType(value.shape)) return None + + +def _guess_array_dtype(dtype: ArrayDType) -> DType: + if dtype.shape is None: + return dtype + if dtype.shape == (2,): + return window_dtype + if dtype.ndim == 1 and dtype.shape[0] is not None: + return EmbeddingDType(dtype.shape[0]) + if dtype.ndim == 1 and dtype.shape[0] is None: + return sequence_1d_dtype + if dtype.ndim == 2 and (dtype.shape[0] == 2 or dtype.shape[1] == 2): + return sequence_1d_dtype + if dtype.ndim == 3 and dtype.shape[-1] in (1, 3, 4): + return image_dtype + return dtype diff --git a/renumics/spotlight/dataset/__init__.py 
b/renumics/spotlight/dataset/__init__.py index 99828ddf..e586499d 100644 --- a/renumics/spotlight/dataset/__init__.py +++ b/renumics/spotlight/dataset/__init__.py @@ -32,12 +32,7 @@ from typing_extensions import TypeGuard from renumics.spotlight.__version__ import __version__ -from renumics.spotlight.io.pandas import ( - infer_dtypes, - prepare_column, - is_string_mask, - stringify_columns, -) +from .pandas import create_typed_series, infer_dtypes, is_string_mask, prepare_column from renumics.spotlight.typing import ( BoolType, IndexType, @@ -47,7 +42,6 @@ is_integer, is_iterable, ) -from renumics.spotlight.io.pandas import create_typed_series from renumics.spotlight.dtypes.conversion import prepare_path_or_url from renumics.spotlight import dtypes as spotlight_dtypes @@ -738,7 +732,7 @@ def from_pandas( df = df.reset_index(level=df.index.names) # type: ignore else: df = df.copy() - df.columns = pd.Index(stringify_columns(df)) + df.columns = pd.Index([str(column) for column in df.columns]) if dtypes is None: dtypes = {} diff --git a/renumics/spotlight/io/pandas.py b/renumics/spotlight/dataset/pandas.py similarity index 86% rename from renumics/spotlight/io/pandas.py rename to renumics/spotlight/dataset/pandas.py index 4cf84f9e..75ccf00f 100644 --- a/renumics/spotlight/io/pandas.py +++ b/renumics/spotlight/dataset/pandas.py @@ -1,30 +1,22 @@ """ -This module contains helpers for importing `pandas.DataFrame`s. +Helper for conversion between H5 dataset and `pandas.DataFrame`. 
""" -import ast import os.path import statistics -from contextlib import suppress -from typing import Any, Dict, List, Optional, Sequence, Union +from typing import Any, Optional, Sequence, Union import PIL.Image import filetype -import trimesh import numpy as np import pandas as pd +import trimesh -from renumics.spotlight.dtypes import ( - Audio, - Embedding, - Image, - Mesh, - Sequence1D, - Video, -) -from renumics.spotlight.media.exceptions import UnsupportedDType -from renumics.spotlight.typing import is_iterable, is_pathtype from renumics.spotlight import dtypes +from renumics.spotlight.io import prepare_hugging_face_dict, try_literal_eval +from renumics.spotlight.media import Audio, Embedding, Image, Mesh, Sequence1D, Video +from renumics.spotlight.typing import is_iterable, is_pathtype +from .exceptions import InvalidDTypeError def create_typed_series( @@ -58,32 +50,62 @@ def create_typed_series( return pd.Series([] if values is None else values, dtype=pandas_dtype) -def is_empty(value: Any) -> bool: - """ - Check if value is `NA` or an empty string. +def prepare_column(column: pd.Series, dtype: dtypes.DType) -> pd.Series: """ - if is_iterable(value): - # `pd.isna` with an iterable argument returns an iterable result. But - # an iterable cannot be NA or empty string by default. - return False - return pd.isna(value) or value == "" + Convert a `pandas` column to the desired `dtype` and prepare some values, + but still as `pandas` column. + + Args: + column: A `pandas` column to prepare. + dtype: Target data type. + Returns: + Prepared `pandas` column. -def try_literal_eval(x: str) -> Any: - """ - Try to evaluate a literal expression, otherwise return value as is. + Raises: + TypeError: If `dtype` is not a Spotlight data type. 
""" - with suppress(Exception): - return ast.literal_eval(x) - return x + column = column.copy() + if dtypes.is_category_dtype(dtype): + # We only support string/`NA` categories, but `pandas` can more, so + # force categories to be strings (does not affect `NA`s). + return to_categorical(column, str_categories=True) -def stringify_columns(df: pd.DataFrame) -> List[str]: - """ - Convert `pandas.DataFrame`'s column names to strings, no matter which index - is used. - """ - return [str(column_name) for column_name in df.columns] + if dtypes.is_datetime_dtype(dtype): + # `errors="coerce"` will produce `NaT`s instead of fail. + return pd.to_datetime(column, errors="coerce") + + if dtypes.is_str_dtype(dtype): + # Allow `NA`s, convert all other elements to strings. + return column.astype(str).mask(column.isna(), None) # type: ignore + + if dtypes.is_bool_dtype(dtype): + return column.astype(bool) + + if dtypes.is_int_dtype(dtype): + return column.astype(int) + + if dtypes.is_float_dtype(dtype): + return column.astype(float) + + # We explicitely don't want to change the original `DataFrame`. + with pd.option_context("mode.chained_assignment", None): + # We consider empty strings as `NA`s. + str_mask = is_string_mask(column) + column[str_mask] = column[str_mask].replace("", None) + na_mask = column.isna() + + # When `pandas` reads a csv, arrays and lists are read as literal strings, + # try to interpret them. 
+ str_mask = is_string_mask(column) + column[str_mask] = column[str_mask].apply(try_literal_eval) + + if dtypes.is_filebased_dtype(dtype): + dict_mask = column.map(type) == dict + column[dict_mask] = column[dict_mask].apply(prepare_hugging_face_dict) + + return column.mask(na_mask, None) # type: ignore def infer_dtype(column: pd.Series) -> dtypes.DType: @@ -225,7 +247,7 @@ def infer_dtypes(df: pd.DataFrame, dtype: Optional[dtypes.DTypeMap]) -> dtypes.D if column_index not in inferred_dtype: try: column_type = infer_dtype(df[column_index]) - except UnsupportedDType: + except InvalidDTypeError: column_type = dtypes.str_dtype inferred_dtype[str(column_index)] = column_type return inferred_dtype @@ -255,73 +277,3 @@ def to_categorical(column: pd.Series, str_categories: bool = False) -> pd.Series if str_categories: return column.cat.rename_categories(column.cat.categories.astype(str)) return column - - -def prepare_hugging_face_dict(x: Dict) -> Any: - """ - Prepare HuggingFace format for files to be used in Spotlight. - """ - if x.keys() != {"bytes", "path"}: - return x - blob = x["bytes"] - if blob is not None: - return blob - return x["path"] - - -def prepare_column(column: pd.Series, dtype: dtypes.DType) -> pd.Series: - """ - Convert a `pandas` column to the desired `dtype` and prepare some values, - but still as `pandas` column. - - Args: - column: A `pandas` column to prepare. - dtype: Target data type. - - Returns: - Prepared `pandas` column. - - Raises: - TypeError: If `dtype` is not a Spotlight data type. - """ - column = column.copy() - - if dtypes.is_category_dtype(dtype): - # We only support string/`NA` categories, but `pandas` can more, so - # force categories to be strings (does not affect `NA`s). - return to_categorical(column, str_categories=True) - - if dtypes.is_datetime_dtype(dtype): - # `errors="coerce"` will produce `NaT`s instead of fail. 
- return pd.to_datetime(column, errors="coerce") - - if dtypes.is_str_dtype(dtype): - # Allow `NA`s, convert all other elements to strings. - return column.astype(str).mask(column.isna(), None) # type: ignore - - if dtypes.is_bool_dtype(dtype): - return column.astype(bool) - - if dtypes.is_int_dtype(dtype): - return column.astype(int) - - if dtypes.is_float_dtype(dtype): - return column.astype(float) - - # We explicitely don't want to change the original `DataFrame`. - with pd.option_context("mode.chained_assignment", None): - # We consider empty strings as `NA`s. - str_mask = is_string_mask(column) - column[str_mask] = column[str_mask].replace("", None) - na_mask = column.isna() - - # When `pandas` reads a csv, arrays and lists are read as literal strings, - # try to interpret them. - str_mask = is_string_mask(column) - column[str_mask] = column[str_mask].apply(try_literal_eval) - - if dtypes.is_filebased_dtype(dtype): - dict_mask = column.map(type) == dict - column[dict_mask] = column[dict_mask].apply(prepare_hugging_face_dict) - - return column.mask(na_mask, None) # type: ignore diff --git a/renumics/spotlight/dtypes/__init__.py b/renumics/spotlight/dtypes/__init__.py index 0e24ea10..63910215 100644 --- a/renumics/spotlight/dtypes/__init__.py +++ b/renumics/spotlight/dtypes/__init__.py @@ -9,6 +9,8 @@ __all__ = [ "CategoryDType", + "ArrayDType", + "EmbeddingDType", "Sequence1DDType", "bool_dtype", "int_dtype", @@ -36,6 +38,14 @@ def __init__(self, name: str): def __str__(self) -> str: return self.name + def __eq__(self, other: Any) -> bool: + if isinstance(other, DType): + return other._name == self._name + return False + + def __hash__(self) -> int: + return hash(self._name) + @property def name(self) -> str: return self._name @@ -53,8 +63,10 @@ def __init__( self, categories: Optional[Union[Iterable[str], Dict[str, int]]] = None ): super().__init__("Category") - if isinstance(categories, dict) or categories is None: - self._categories = categories + if 
isinstance(categories, dict): + self._categories = dict(sorted(categories.items(), key=lambda x: x[1])) + elif categories is None: + self._categories = None else: self._categories = { category: code for code, category in enumerate(categories) @@ -71,6 +83,20 @@ def __init__( category: code for code, category in self._inverted_categories.items() } + def __eq__(self, other: Any) -> bool: + if isinstance(other, CategoryDType): + return other._categories == self._categories + return False + + def __hash__(self) -> int: + if self._categories is None: + return hash(self._name) ^ hash(None) + return ( + hash(self._name) + ^ hash(tuple(self._categories.keys())) + ^ hash(tuple(self._categories.values())) + ) + @property def categories(self) -> Optional[Dict[str, int]]: return self._categories @@ -91,6 +117,14 @@ def __init__(self, shape: Optional[Tuple[Optional[int], ...]] = None): super().__init__("array") self.shape = shape + def __eq__(self, other: Any) -> bool: + if isinstance(other, ArrayDType): + return other.shape == self.shape + return False + + def __hash__(self) -> int: + return hash(self._name) ^ hash(self.shape) + @property def ndim(self) -> int: if self.shape is None: @@ -111,6 +145,14 @@ def __init__(self, length: Optional[int] = None): raise ValueError(f"Length must be non-negative, but {length} received.") self.length = length + def __eq__(self, other: Any) -> bool: + if isinstance(other, EmbeddingDType): + return other.length == self.length + return False + + def __hash__(self) -> int: + return hash(self._name) ^ hash(self.length) + class Sequence1DDType(DType): """ @@ -125,6 +167,14 @@ def __init__(self, x_label: str = "x", y_label: str = "y"): self.x_label = x_label self.y_label = y_label + def __eq__(self, other: Any) -> bool: + if isinstance(other, Sequence1DDType): + return other.x_label == self.x_label and other.y_label == self.y_label + return False + + def __hash__(self) -> int: + return hash(self._name) ^ hash(self.x_label) ^ hash(self.y_label) + 
ALIASES: Dict[Any, DType] = {} diff --git a/renumics/spotlight/io/__init__.py b/renumics/spotlight/io/__init__.py index 2d2a6d26..8162a843 100644 --- a/renumics/spotlight/io/__init__.py +++ b/renumics/spotlight/io/__init__.py @@ -1,6 +1,9 @@ """ Reading and writing of different data formats. """ +import ast +from contextlib import suppress +from typing import Any from .audio import ( get_format_codec, @@ -19,6 +22,8 @@ decode_gltf_arrays, encode_gltf_array, ) +from .huggingface import prepare_hugging_face_dict + __all__ = [ "get_format_codec", @@ -34,4 +39,15 @@ "check_gltf", "decode_gltf_arrays", "encode_gltf_array", + "prepare_hugging_face_dict", + "try_literal_eval", ] + + +def try_literal_eval(x: str) -> Any: + """ + Try to evaluate a literal expression, otherwise return value as is. + """ + with suppress(Exception): + return ast.literal_eval(x) + return x diff --git a/renumics/spotlight/io/huggingface.py b/renumics/spotlight/io/huggingface.py new file mode 100644 index 00000000..06c0d441 --- /dev/null +++ b/renumics/spotlight/io/huggingface.py @@ -0,0 +1,16 @@ +""" +Helpers for HuggingFace formats. +""" +from typing import Any, Dict + + +def prepare_hugging_face_dict(x: Dict) -> Any: + """ + Prepare HuggingFace format for files to be used in Spotlight. 
+ """ + if x.keys() != {"bytes", "path"}: + return x + blob = x["bytes"] + if blob is not None: + return blob + return x["path"] diff --git a/renumics/spotlight_plugins/core/pandas_data_source.py b/renumics/spotlight_plugins/core/pandas_data_source.py index 430a69ca..4404a14d 100644 --- a/renumics/spotlight_plugins/core/pandas_data_source.py +++ b/renumics/spotlight_plugins/core/pandas_data_source.py @@ -7,14 +7,9 @@ import numpy as np import pandas as pd import datasets -from renumics.spotlight import dtypes -from renumics.spotlight.io.pandas import ( - infer_dtype, - prepare_hugging_face_dict, - stringify_columns, - try_literal_eval, -) +from renumics.spotlight import dtypes +from renumics.spotlight.io import prepare_hugging_face_dict, try_literal_eval from renumics.spotlight.data_source import ( datasource, ColumnMetadata, @@ -23,7 +18,6 @@ from renumics.spotlight.backend.exceptions import DatasetColumnsNotUnique from renumics.spotlight.dataset.exceptions import ColumnNotExistsError from renumics.spotlight.data_source.exceptions import InvalidDataSource -from renumics.spotlight.dtypes import DTypeMap @datasource(pd.DataFrame) @@ -41,6 +35,7 @@ class PandasDataSource(DataSource): _uid: str _df: pd.DataFrame _name: str + _intermediate_dtypes: dtypes.DTypeMap def __init__(self, source: Union[Path, pd.DataFrame]): if isinstance(source, Path): @@ -108,7 +103,7 @@ def __init__(self, source: Union[Path, pd.DataFrame]): @property def column_names(self) -> List[str]: - return stringify_columns(self._df) + return [str(column) for column in self._df.columns] @property def df(self) -> pd.DataFrame: @@ -118,18 +113,15 @@ def df(self) -> pd.DataFrame: return self._df.copy() @property - def intermediate_dtypes(self) -> DTypeMap: + def intermediate_dtypes(self) -> dtypes.DTypeMap: return self._intermediate_dtypes def __len__(self) -> int: return len(self._df) @property - def semantic_dtypes(self) -> DTypeMap: - return { - str(column_name): infer_dtype(self.df[column_name]) - 
for column_name in self.df - } + def semantic_dtypes(self) -> dtypes.DTypeMap: + return {} def get_generation_id(self) -> int: return self._generation_id @@ -167,12 +159,14 @@ def get_column_values( if pd.api.types.is_categorical_dtype(column): return column.cat.codes if pd.api.types.is_string_dtype(column): - values = column.to_numpy() - na_mask = column.isna() - values[na_mask] = None - return values + column = column.astype(object).mask(column.isna(), None) + str_mask = column.map(type) == str + column[str_mask] = column[str_mask].apply(try_literal_eval) + dict_mask = column.map(type) == dict + column[dict_mask] = column[dict_mask].apply(prepare_hugging_face_dict) + return column.to_numpy() if pd.api.types.is_object_dtype(column): - column = column.mask(column.isna(), None) + column = column.astype(object).mask(column.isna(), None) str_mask = column.map(type) == str column[str_mask] = column[str_mask].apply(try_literal_eval) dict_mask = column.map(type) == dict @@ -222,5 +216,4 @@ def _determine_intermediate_dtype(column: pd.Series) -> dtypes.DType: return dtypes.datetime_dtype if pd.api.types.is_string_dtype(column): return dtypes.str_dtype - else: - return dtypes.mixed_dtype + return dtypes.mixed_dtype diff --git a/tests/integration/dataset/test_dataset.py b/tests/integration/dataset/test_dataset.py index 10f7ed75..fe28c8fc 100644 --- a/tests/integration/dataset/test_dataset.py +++ b/tests/integration/dataset/test_dataset.py @@ -26,7 +26,7 @@ from renumics.spotlight.dataset import escape_dataset_name, unescape_dataset_name from renumics.spotlight import dtypes from renumics.spotlight.dataset.typing import OutputType -from renumics.spotlight.io.pandas import infer_dtype +from renumics.spotlight.dataset.pandas import infer_dtype from .conftest import ColumnData from .helpers import get_append_column_fn_name from ..helpers import approx