Merge branch 'main' into feature/errors-over-ws

Renumics · Oct 25, 2023 · a3cad35 · a3cad35
2 parents 480c900 + 0bb6ce5
commit a3cad35
Show file tree

Hide file tree

Showing 13 changed files with 237 additions and 201 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -11,10 +11,10 @@ Technical details on how to contribute can be found in our [documentation](https
 
 There are several ways you can contribute to Spotlight:
 
-* Fix outstanding issues.
-* Implement new features.
-* Submit issues related to bugs or desired new features.
-* Share your use case
+-   Fix outstanding issues.
+-   Implement new features.
+-   Submit issues related to bugs or desired new features.
+-   Share your use case
 
 If you don't know where to start, you might want to have a look at [hacktoberfest issues](https://github.com/Renumics/spotlight/issues?q=is%3Aissue+is%3Aopen+label%3Ahacktoberfest)
 and our guide on how to create a [new Lens](https://renumics.com/docs/development/lenses).
diff --git a/README.md b/README.md
@@ -17,9 +17,10 @@
 
 <p align="center"><a href="https://spotlight.renumics.com"><img src="static/img/spotlight_video.gif" width="100%"/></a></p>
 
-Spotlight helps you to **understand unstructured datasets** fast. You can quickly create **interactive visualizations** and leverage data enrichments (e.g. embeddings, prediction, uncertainties) to **identify critical clusters** in your data. 
+Spotlight helps you to **understand unstructured datasets** fast. You can quickly create **interactive visualizations** and leverage data enrichments (e.g. embeddings, prediction, uncertainties) to **identify critical clusters** in your data.
 
 Spotlight supports most unstructured data types including **images, audio, text, videos, time-series and geometric data**. You can start from your existing dataframe:
+
 <p align="left"><img src="static/img/dataframe_head_sample.png" width="100%"/></p>
 
 And start Spotlight with just a few lines of code:
@@ -49,7 +50,7 @@ Machine learning and engineering teams use Spotlight to understand and communica
 	    <td rowspan="3">[Classification]</td>
             <td>Find Issues in Any Image Classification Dataset</td>
             <td><a href="https://www.renumics.com/next/docs/use-cases/image-classification">👨‍💻</a> <a href="https://medium.com/@daniel-klitzke/finding-problematic-data-slices-in-unstructured-data-aeec0a3b9a2a">📝</a> <a href="https://huggingface.co/spaces/renumics/sliceguard-unstructured-data">🕹️</a></td>
-        </tr>	
+        </tr>
         <tr>
             <td>Find data issues in the CIFAR-100 image dataset</td>
             <td><a href="https://huggingface.co/spaces/renumics/navigate-data-issues">🕹️</a></td>
@@ -91,7 +92,6 @@ Machine learning and engineering teams use Spotlight to understand and communica
     </tbody>
 </table>
 
-
 ## ⏱️ Quickstart
 
 Get started by installing Spotlight and loading your first dataset.
@@ -132,12 +132,11 @@ ds = datasets.load_dataset('renumics/emodb-enriched', split='all')
 layout= spotlight.layouts.debug_classification(label='gender', prediction='m1_gender_prediction', embedding='m1_embedding', features=['age', 'emotion'])
 spotlight.show(ds, layout=layout)
 ```
+
 Here, the data types are discovered automatically from the dataset and we use a pre-defined layout for model debugging. Custom layouts can be built programmatically or via the UI.
 
 > The `datasets[audio]` package can be installed via pip.
 
-
-
 #### Usage Tracking
 
 We have added crash report and performance collection. We do NOT collect user data other than an anonymized Machine Id obtained by py-machineid, and only log our own actions. We do NOT collect folder names, dataset names, or row data of any kind, only aggregate performance statistics like total time of a table_load, crash data, etc. Collecting Spotlight crashes will help us improve stability. To opt out of the crash report collection, define an environment variable called `SPOTLIGHT_OPT_OUT` and set it to true, e.g. `export SPOTLIGHT_OPT_OUT=true`
@@ -150,9 +149,9 @@ We have added crash report and performance collection. We do NOT collect user da
 
 ## Learn more about unstructured data workflows
 
-- 🤗 [Huggingface](https://huggingface.co/renumics) example spaces and datasets 
-- 🏀 [Playbook](https://renumics.com/docs/playbook/) for data-centric AI workflows
-- 🍰 [Sliceguard](https://github.com/Renumics/sliceguard) library for automatic slice detection
+-   🤗 [Huggingface](https://huggingface.co/renumics) example spaces and datasets
+-   🏀 [Playbook](https://renumics.com/docs/playbook/) for data-centric AI workflows
+-   🍰 [Sliceguard](https://github.com/Renumics/sliceguard) library for automatic slice detection
 
 ## Contribute
 

diff --git a/renumics/spotlight/backend/tasks/reduction.py b/renumics/spotlight/backend/tasks/reduction.py
@@ -6,11 +6,9 @@
 
 import numpy as np
 import pandas as pd
-from sklearn import preprocessing
 
-from renumics.spotlight.dataset.exceptions import ColumnNotExistsError
 from renumics.spotlight.data_store import DataStore
-from renumics.spotlight.dtypes import is_category_dtype, is_embedding_dtype
+from renumics.spotlight import dtypes
 
 SEED = 42
 
@@ -27,6 +25,7 @@ def align_data(
     """
     Align data from table's columns, remove `NaN`'s.
     """
+    from sklearn import preprocessing
 
     if not column_names or not indices:
         return np.empty(0, np.float64), []
@@ -35,7 +34,7 @@ def align_data(
     for column_name in column_names:
         dtype = data_store.dtypes[column_name]
         column_values = data_store.get_converted_values(column_name, indices)
-        if is_embedding_dtype(dtype):
+        if dtypes.is_embedding_dtype(dtype):
             embedding_length = max(
                 0 if x is None else len(cast(np.ndarray, x)) for x in column_values
             )
@@ -49,17 +48,19 @@ def align_data(
                         ]
                     )
                 )
-        elif is_category_dtype(dtype):
+        elif dtypes.is_category_dtype(dtype):
             na_mask = np.array(column_values) == -1
             one_hot_values = preprocessing.label_binarize(
                 column_values, classes=sorted(set(column_values).difference({-1}))  # type: ignore
             ).astype(float)
             one_hot_values[na_mask] = np.nan
             aligned_values.append(one_hot_values)
-        elif dtype in (int, bool, float):
+        elif dtypes.is_scalar_dtype(dtype):
             aligned_values.append(np.array(column_values, dtype=float))
         else:
-            raise ColumnNotEmbeddable
+            raise ColumnNotEmbeddable(
+                f"Column '{column_name}' of type {dtype} is not embeddable."
+            )
 
     data = np.hstack([col.reshape((len(indices), -1)) for col in aligned_values])
     mask = ~pd.isna(data).any(axis=1)
@@ -78,10 +79,8 @@ def compute_umap(
     Prepare data from table and compute U-Map on them.
     """
 
-    try:
-        data, indices = align_data(data_store, column_names, indices)
-    except (ColumnNotExistsError, ColumnNotEmbeddable):
-        return np.empty(0, np.float64), []
+    data, indices = align_data(data_store, column_names, indices)
+
     if data.size == 0:
         return np.empty(0, np.float64), []
 
@@ -114,17 +113,20 @@ def compute_pca(
     Prepare data from table and compute PCA on them.
     """
 
-    from sklearn import preprocessing, decomposition
-
     data, indices = align_data(data_store, column_names, indices)
+
     if data.size == 0:
         return np.empty(0, np.float64), []
+
+    from sklearn import preprocessing, decomposition
+
     if data.shape[1] == 1:
         return np.hstack((data, np.zeros_like(data))), indices
     if normalization == "standardize":
         data = preprocessing.StandardScaler(copy=False).fit_transform(data)
     elif normalization == "robust standardize":
         data = preprocessing.RobustScaler(copy=False).fit_transform(data)
     reducer = decomposition.PCA(n_components=2, copy=False, random_state=SEED)
-    embeddings = reducer.fit_transform(data)
+    # `fit_transform` returns Fortran-ordered array.
+    embeddings = np.ascontiguousarray(reducer.fit_transform(data))
     return embeddings, indices
diff --git a/renumics/spotlight/data_source/data_source.py b/renumics/spotlight/data_source/data_source.py
@@ -6,7 +6,6 @@
 
 import pandas as pd
 import numpy as np
-from pydantic.dataclasses import dataclass
 
 from renumics.spotlight.dataset.exceptions import (
     ColumnExistsError,
@@ -30,17 +29,6 @@ class ColumnMetadata:
     tags: List[str] = dataclasses.field(default_factory=list)
 
 
-@dataclass
-class CellsUpdate:
-    """
-    A dataset's cell update.
-    """
-
-    value: Any
-    author: str
-    edited_at: str
-
-
 class DataSource(ABC):
     """abstract base class for different data sources"""
 
@@ -61,7 +49,7 @@ def column_names(self) -> List[str]:
     @abstractmethod
     def intermediate_dtypes(self) -> DTypeMap:
         """
-        The dtypes of intermediate values
+        The dtypes of intermediate values. Values for all columns must be filled.
         """
 
     @property
@@ -94,7 +82,7 @@ def check_generation_id(self, generation_id: int) -> None:
     @abstractmethod
     def semantic_dtypes(self) -> DTypeMap:
         """
-        Semantic dtypes for viewer.
+        Semantic dtypes for viewer. Some values may not be present.
         """
 
     @abstractmethod

diff --git a/renumics/spotlight/data_store.py b/renumics/spotlight/data_store.py
@@ -21,13 +21,16 @@
     DType,
     DTypeMap,
     EmbeddingDType,
+    array_dtype,
     is_array_dtype,
     is_audio_dtype,
     is_category_dtype,
+    is_embedding_dtype,
     is_file_dtype,
     is_str_dtype,
     is_mixed_dtype,
     is_bytes_dtype,
+    is_window_dtype,
     str_dtype,
     audio_dtype,
     image_dtype,
@@ -173,33 +176,32 @@ def _guess_dtype(self, col: str) -> DType:
             return semantic_dtype
 
         sample_values = self._data_source.get_column_values(col, slice(10))
-        sample_dtypes = [_guess_value_dtype(value) for value in sample_values]
-
-        try:
-            mode_dtype = statistics.mode(sample_dtypes)
-        except statistics.StatisticsError:
+        sample_dtypes: List[DType] = []
+        for value in sample_values:
+            guessed_dtype = _guess_value_dtype(value)
+            if guessed_dtype is not None:
+                sample_dtypes.append(guessed_dtype)
+        if not sample_dtypes:
             return semantic_dtype
 
-        return mode_dtype or semantic_dtype
+        mode_dtype = statistics.mode(sample_dtypes)
+        # For windows and embeddings, at least sample values must be aligned.
+        if is_window_dtype(mode_dtype) and any(
+            not is_window_dtype(dtype) for dtype in sample_dtypes
+        ):
+            return array_dtype
+        if is_embedding_dtype(mode_dtype) and any(
+            (not is_embedding_dtype(dtype)) or dtype.length != mode_dtype.length
+            for dtype in sample_dtypes
+        ):
+            return array_dtype
+
+        return mode_dtype
 
 
 def _intermediate_to_semantic_dtype(intermediate_dtype: DType) -> DType:
     if is_array_dtype(intermediate_dtype):
-        if intermediate_dtype.shape is None:
-            return intermediate_dtype
-        if intermediate_dtype.shape == (2,):
-            return window_dtype
-        if intermediate_dtype.ndim == 1 and intermediate_dtype.shape[0] is not None:
-            return EmbeddingDType(intermediate_dtype.shape[0])
-        if intermediate_dtype.ndim == 1 and intermediate_dtype.shape[0] is None:
-            return sequence_1d_dtype
-        if intermediate_dtype.ndim == 2 and (
-            intermediate_dtype.shape[0] == 2 or intermediate_dtype.shape[1] == 2
-        ):
-            return sequence_1d_dtype
-        if intermediate_dtype.ndim == 3 and intermediate_dtype.shape[-1] in (1, 3, 4):
-            return image_dtype
-        return intermediate_dtype
+        return _guess_array_dtype(intermediate_dtype)
     if is_file_dtype(intermediate_dtype):
         return str_dtype
     if is_mixed_dtype(intermediate_dtype):
@@ -262,5 +264,21 @@ def _guess_value_dtype(value: Any) -> Optional[DType]:
         except (TypeError, ValueError):
             pass
         else:
-            return ArrayDType(value.shape)
+            return _guess_array_dtype(ArrayDType(value.shape))
     return None
+
+
+def _guess_array_dtype(dtype: ArrayDType) -> DType:
+    if dtype.shape is None:
+        return dtype
+    if dtype.shape == (2,):
+        return window_dtype
+    if dtype.ndim == 1 and dtype.shape[0] is not None:
+        return EmbeddingDType(dtype.shape[0])
+    if dtype.ndim == 1 and dtype.shape[0] is None:
+        return sequence_1d_dtype
+    if dtype.ndim == 2 and (dtype.shape[0] == 2 or dtype.shape[1] == 2):
+        return sequence_1d_dtype
+    if dtype.ndim == 3 and dtype.shape[-1] in (1, 3, 4):
+        return image_dtype
+    return dtype
diff --git a/renumics/spotlight/dataset/__init__.py b/renumics/spotlight/dataset/__init__.py
@@ -32,12 +32,7 @@
 from typing_extensions import TypeGuard
 
 from renumics.spotlight.__version__ import __version__
-from renumics.spotlight.io.pandas import (
-    infer_dtypes,
-    prepare_column,
-    is_string_mask,
-    stringify_columns,
-)
+from .pandas import create_typed_series, infer_dtypes, is_string_mask, prepare_column
 from renumics.spotlight.typing import (
     BoolType,
     IndexType,
@@ -47,7 +42,6 @@
     is_integer,
     is_iterable,
 )
-from renumics.spotlight.io.pandas import create_typed_series
 from renumics.spotlight.dtypes.conversion import prepare_path_or_url
 from renumics.spotlight import dtypes as spotlight_dtypes
 
@@ -738,7 +732,7 @@ def from_pandas(
             df = df.reset_index(level=df.index.names)  # type: ignore
         else:
             df = df.copy()
-        df.columns = pd.Index(stringify_columns(df))
+        df.columns = pd.Index([str(column) for column in df.columns])
 
         if dtypes is None:
             dtypes = {}

diff --git a/renumics/spotlight/dataset/descriptors/__init__.py b/renumics/spotlight/dataset/descriptors/__init__.py
@@ -1,5 +1,6 @@
 """make descriptor methods more available
 """
+import warnings
 from typing import Optional, Tuple
 
 import numpy as np
@@ -11,6 +12,13 @@
 from renumics.spotlight.dataset.exceptions import ColumnExistsError, InvalidDTypeError
 from .data_alignment import align_column_data
 
+warnings.warn(
+    "`renumics.spotlight.dataset.descriptors` module is deprecated and will "
+    "be removed in future versions.",
+    DeprecationWarning,
+    stacklevel=2,
+)
+
 
 def pca(
     dataset: Dataset,