Merge branch 'Renumics:main' into main
vishnukaushik authored Oct 26, 2023
2 parents 584000e + 3bb12ed commit c17b998
Showing 30 changed files with 780 additions and 678 deletions.
8 changes: 4 additions & 4 deletions CONTRIBUTING.md
@@ -11,10 +11,10 @@ Technical details on how to contribute can be found in our [documentation](https

There are several ways you can contribute to Spotlight:

* Fix outstanding issues.
* Implement new features.
* Submit issues related to bugs or desired new features.
* Share your use case
- Fix outstanding issues.
- Implement new features.
- Submit issues related to bugs or desired new features.
- Share your use case

If you don't know where to start, you might want to have a look at [hacktoberfest issues](https://github.com/Renumics/spotlight/issues?q=is%3Aissue+is%3Aopen+label%3Ahacktoberfest)
and our guide on how to create a [new Lens](https://renumics.com/docs/development/lenses).
15 changes: 7 additions & 8 deletions README.md
@@ -17,9 +17,10 @@

<p align="center"><a href="https://spotlight.renumics.com"><img src="static/img/spotlight_video.gif" width="100%"/></a></p>

Spotlight helps you to **understand unstructured datasets** fast. You can quickly create **interactive visualizations** and leverage data enrichments (e.g. embeddings, predictions, uncertainties) to **identify critical clusters** in your data.

Spotlight supports most unstructured data types including **images, audio, text, videos, time-series and geometric data**. You can start from your existing dataframe:

<p align="left"><img src="static/img/dataframe_head_sample.png" width="100%"/></a></p>

And start Spotlight with just a few lines of code:
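The snippet itself is collapsed in this view; a minimal sketch of the documented entry point (assuming `pip install renumics-spotlight` and a hypothetical `dataset.csv`):

```python
import pandas as pd

from renumics import spotlight

# Hypothetical dataframe with image paths, labels, embeddings, etc.
df = pd.read_csv("dataset.csv")
# Opens the interactive Spotlight viewer in the browser.
spotlight.show(df)
```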
@@ -49,7 +50,7 @@ Machine learning and engineering teams use Spotlight to understand and communica
<td rowspan="3">[Classification]</td>
<td>Find Issues in Any Image Classification Dataset</td>
<td><a href="https://www.renumics.com/next/docs/use-cases/image-classification">👨‍💻</a> <a href="https://medium.com/@daniel-klitzke/finding-problematic-data-slices-in-unstructured-data-aeec0a3b9a2a">📝</a> <a href="https://huggingface.co/spaces/renumics/sliceguard-unstructured-data">🕹️</a></td>
</tr>
<tr>
<td>Find data issues in the CIFAR-100 image dataset</td>
<td><a href="https://huggingface.co/spaces/renumics/navigate-data-issues">🕹️</a></td>
@@ -91,7 +92,6 @@ Machine learning and engineering teams use Spotlight to understand and communica
</tbody>
</table>


## ⏱️ Quickstart

Get started by installing Spotlight and loading your first dataset.
@@ -132,12 +132,11 @@ ds = datasets.load_dataset('renumics/emodb-enriched', split='all')
layout = spotlight.layouts.debug_classification(label='gender', prediction='m1_gender_prediction', embedding='m1_embedding', features=['age', 'emotion'])
spotlight.show(ds, layout=layout)
```

Here, the data types are discovered automatically from the dataset and we use a pre-defined layout for model debugging. Custom layouts can be built programmatically or via the UI.

> The `datasets[audio]` package can be installed via pip.


#### Usage Tracking

We have added crash reporting and performance data collection. We do NOT collect user data other than an anonymized machine ID obtained by py-machineid, and we only log our own actions. We do NOT collect folder names, dataset names, or row data of any kind, only aggregate performance statistics such as the total time of a `table_load`, crash data, etc. Collecting Spotlight crashes will help us improve stability. To opt out of crash report collection, define an environment variable called `SPOTLIGHT_OPT_OUT` and set it to true, e.g. `export SPOTLIGHT_OPT_OUT=true`
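The same opt-out can be set from Python before Spotlight starts (a sketch, assuming the variable is read at startup as described above):

```python
import os

# Opt out of crash reporting; must be set before Spotlight starts.
os.environ["SPOTLIGHT_OPT_OUT"] = "true"

from renumics import spotlight  # imported after setting the variable
```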
@@ -150,9 +149,9 @@ We have added crash report and performance collection. We do NOT collect user da

## Learn more about unstructured data workflows

- 🤗 [Huggingface](https://huggingface.co/renumics) example spaces and datasets
- 🏀 [Playbook](https://renumics.com/docs/playbook/) for data-centric AI workflows
- 🍰 [Sliceguard](https://github.com/Renumics/sliceguard) library for automatic slice detection

## Contribute

97 changes: 57 additions & 40 deletions renumics/spotlight/app.py
@@ -15,6 +15,7 @@

from typing_extensions import Annotated
from fastapi import Cookie, FastAPI, Request, status
from fastapi.datastructures import Headers
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response
from fastapi.staticfiles import StaticFiles
@@ -56,10 +57,23 @@

from renumics.spotlight.dtypes import DTypeMap


CURRENT_LAYOUT_KEY = "layout.current"


class UncachedStaticFiles(StaticFiles):
"""
FastAPI StaticFiles but without caching
"""

def is_not_modified(
self, response_headers: Headers, request_headers: Headers
) -> bool:
"""
Never Cache
"""
return False
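
`is_not_modified` is the Starlette hook that decides whether a static-file request can be answered with `304 Not Modified`; always returning `False` forces clients to re-download assets. A standalone usage sketch (hypothetical wiring with a local `static/` directory; the actual mount via `packages=[...]` appears further down in this diff):

```python
from fastapi import FastAPI

app = FastAPI()
# Assets are always served fresh, so the browser never keeps a stale
# frontend build after Spotlight is updated.
app.mount("/static", UncachedStaticFiles(directory="static"), name="assets")
```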


class IssuesUpdatedMessage(Message):
"""
Notify about updated issues.
@@ -75,7 +89,6 @@ class SpotlightApp(FastAPI):
"""

# lifecycle
_startup_complete: bool
_loop: asyncio.AbstractEventLoop

# connection
@@ -106,7 +119,6 @@ class SpotlightApp(FastAPI):

def __init__(self) -> None:
super().__init__()
self._startup_complete = False
self.task_manager = TaskManager()
self.websocket_manager = None
self.config = Config()
@@ -207,9 +219,13 @@ async def _(_: Request, problem: Problem) -> JSONResponse:
plugin.activate(self)

try:
# Mount frontend files as uncached,
# so that we always get the new frontend after updating spotlight.
# NOTE: we might not need this if we added a version hash
# to our built js files
self.mount(
"/static",
StaticFiles(packages=["renumics.spotlight.backend"]),
UncachedStaticFiles(packages=["renumics.spotlight.backend"]),
name="assets",
)
except AssertionError:
@@ -295,44 +311,45 @@ def update(self, config: AppConfig) -> None:
"""
Update application config.
"""
if config.project_root is not None:
self.project_root = config.project_root
if config.dtypes is not None:
self._user_dtypes = config.dtypes
if config.analyze is not None:
self.analyze_columns = config.analyze
if config.custom_issues is not None:
self.custom_issues = config.custom_issues
if config.dataset is not None:
self._dataset = config.dataset
self._data_source = create_datasource(self._dataset)
if config.layout is not None:
self._layout = config.layout or layouts.default()
if config.filebrowsing_allowed is not None:
self.filebrowsing_allowed = config.filebrowsing_allowed

if config.dtypes is not None or config.dataset is not None:
data_source = self._data_source
assert data_source is not None
self._data_store = DataStore(data_source, self._user_dtypes)
self._broadcast(RefreshMessage())
self._update_issues()
if config.layout is not None:
if self._data_store is not None:
dataset_uid = self._data_store.uid
future = asyncio.run_coroutine_threadsafe(
self.config.remove_all(CURRENT_LAYOUT_KEY, dataset=dataset_uid),
self._loop,
)
future.result()
self._broadcast(ResetLayoutMessage())
try:
if config.project_root is not None:
self.project_root = config.project_root
if config.dtypes is not None:
self._user_dtypes = config.dtypes
if config.analyze is not None:
self.analyze_columns = config.analyze
if config.custom_issues is not None:
self.custom_issues = config.custom_issues
if config.dataset is not None:
self._dataset = config.dataset
self._data_source = create_datasource(self._dataset)
if config.layout is not None:
self._layout = config.layout or layouts.default()
if config.filebrowsing_allowed is not None:
self.filebrowsing_allowed = config.filebrowsing_allowed

if config.dtypes is not None or config.dataset is not None:
data_source = self._data_source
assert data_source is not None
self._data_store = DataStore(data_source, self._user_dtypes)
self._broadcast(RefreshMessage())
self._update_issues()
if config.layout is not None:
if self._data_store is not None:
dataset_uid = self._data_store.uid
future = asyncio.run_coroutine_threadsafe(
self.config.remove_all(CURRENT_LAYOUT_KEY, dataset=dataset_uid),
self._loop,
)
future.result()
self._broadcast(ResetLayoutMessage())

for plugin in load_plugins():
plugin.update(self, config)
for plugin in load_plugins():
plugin.update(self, config)
except Exception as e:
self._connection.send({"kind": "update_complete", "error": e})

if not self._startup_complete:
self._startup_complete = True
self._connection.send({"kind": "startup_complete"})
self._connection.send({"kind": "update_complete"})

def _handle_message(self, message: Any) -> None:
kind = message.get("kind")
33 changes: 16 additions & 17 deletions renumics/spotlight/backend/tasks/reduction.py
@@ -6,11 +6,9 @@

import numpy as np
import pandas as pd
from sklearn import preprocessing

from renumics.spotlight.dataset.exceptions import ColumnNotExistsError
from renumics.spotlight.data_store import DataStore
from renumics.spotlight.dtypes import is_category_dtype, is_embedding_dtype
from renumics.spotlight import dtypes

SEED = 42

@@ -27,6 +25,7 @@ def align_data(
"""
Align data from table's columns, remove `NaN`'s.
"""
from sklearn import preprocessing

if not column_names or not indices:
return np.empty(0, np.float64), []
@@ -35,7 +34,7 @@
for column_name in column_names:
dtype = data_store.dtypes[column_name]
column_values = data_store.get_converted_values(column_name, indices)
if is_embedding_dtype(dtype):
if dtypes.is_embedding_dtype(dtype):
embedding_length = max(
0 if x is None else len(cast(np.ndarray, x)) for x in column_values
)
@@ -49,17 +48,19 @@
]
)
)
elif is_category_dtype(dtype):
elif dtypes.is_category_dtype(dtype):
na_mask = np.array(column_values) == -1
one_hot_values = preprocessing.label_binarize(
column_values, classes=sorted(set(column_values).difference({-1})) # type: ignore
).astype(float)
one_hot_values[na_mask] = np.nan
aligned_values.append(one_hot_values)
elif dtype in (int, bool, float):
elif dtypes.is_scalar_dtype(dtype):
aligned_values.append(np.array(column_values, dtype=float))
else:
raise ColumnNotEmbeddable
raise ColumnNotEmbeddable(
f"Column '{column_name}' of type {dtype} is not embeddable."
)

data = np.hstack([col.reshape((len(indices), -1)) for col in aligned_values])
mask = ~pd.isna(data).any(axis=1)
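
To make the categorical branch above concrete, here is a small standalone sketch of how `label_binarize` plus the `-1` mask behaves (made-up values):

```python
import numpy as np
from sklearn import preprocessing

column_values = [0, 2, -1, 1]  # -1 marks a missing category
na_mask = np.array(column_values) == -1
one_hot = preprocessing.label_binarize(
    column_values, classes=sorted(set(column_values).difference({-1}))
).astype(float)
one_hot[na_mask] = np.nan  # masked rows are later dropped by the NaN filter
```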
@@ -78,10 +79,8 @@ def compute_umap(
Prepare data from table and compute U-Map on them.
"""

try:
data, indices = align_data(data_store, column_names, indices)
except (ColumnNotExistsError, ColumnNotEmbeddable):
return np.empty(0, np.float64), []
data, indices = align_data(data_store, column_names, indices)

if data.size == 0:
return np.empty(0, np.float64), []

@@ -114,20 +113,20 @@
Prepare data from table and compute PCA on them.
"""

from sklearn import preprocessing, decomposition
data, indices = align_data(data_store, column_names, indices)

try:
data, indices = align_data(data_store, column_names, indices)
except (ColumnNotExistsError, ValueError):
return np.empty(0, np.float64), []
if data.size == 0:
return np.empty(0, np.float64), []

from sklearn import preprocessing, decomposition

if data.shape[1] == 1:
return np.hstack((data, np.zeros_like(data))), indices
if normalization == "standardize":
data = preprocessing.StandardScaler(copy=False).fit_transform(data)
elif normalization == "robust standardize":
data = preprocessing.RobustScaler(copy=False).fit_transform(data)
reducer = decomposition.PCA(n_components=2, copy=False, random_state=SEED)
embeddings = reducer.fit_transform(data)
# `fit_transform` returns Fortran-ordered array.
embeddings = np.ascontiguousarray(reducer.fit_transform(data))
return embeddings, indices
11 changes: 8 additions & 3 deletions renumics/spotlight/backend/tasks/task_manager.py
@@ -6,7 +6,7 @@
import multiprocessing
from concurrent.futures import Future, ProcessPoolExecutor
from concurrent.futures.process import BrokenProcessPool
from typing import Any, Callable, List, Optional, Sequence, TypeVar, Union
from typing import Any, Callable, Dict, List, Optional, Sequence, TypeVar, Union

from .exceptions import TaskCancelled
from .task import Task
@@ -30,16 +30,20 @@ def create_task(
self,
func: Callable,
args: Sequence[Any],
kwargs: Optional[Dict[str, Any]] = None,
name: Optional[str] = None,
tag: Optional[Union[str, int]] = None,
) -> Task:
"""
create and launch a new task
"""
if kwargs is None:
kwargs = {}

# cancel running task with same name
self.cancel(name=name)

future = self.pool.submit(func, *args)
future = self.pool.submit(func, *args, **kwargs)

task = Task(name, tag, future)
self.tasks.append(task)
@@ -59,14 +63,15 @@ async def run_async(
self,
func: Callable[..., T],
args: Sequence[Any],
kwargs: Optional[Dict[str, Any]] = None,
name: Optional[str] = None,
tag: Optional[Union[str, int]] = None,
) -> T:
"""
Launch a new task. Await and return result.
"""

task = self.create_task(func, args, name, tag)
task = self.create_task(func, args=args, kwargs=kwargs, name=name, tag=tag)
try:
return await asyncio.wrap_future(task.future)
except BrokenProcessPool as e:
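
A usage sketch of the extended signature (hypothetical task function; `run_async` now forwards `kwargs` through `create_task` to the worker pool):

```python
import asyncio


def resize_image(path: str, width: int = 256) -> str:
    # Hypothetical, picklable task function executed in a worker process.
    return f"{path}@{width}"


async def main() -> None:
    manager = TaskManager()
    result = await manager.run_async(
        resize_image, args=("img.png",), kwargs={"width": 512}, name="resize"
    )
    print(result)  # img.png@512


asyncio.run(main())
```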