diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d6628c4f..e71d164e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -11,10 +11,10 @@ Technical details on how to contribute can be found in our [documentation](https
There are several ways you can contribute to Spotlight:
-* Fix outstanding issues.
-* Implement new features.
-* Submit issues related to bugs or desired new features.
-* Share your use case
+- Fix outstanding issues.
+- Implement new features.
+- Submit issues related to bugs or desired new features.
+- Share your use case
If you don't know where to start, you might want to have a look at [hacktoberfest issues](https://github.com/Renumics/spotlight/issues?q=is%3Aissue+is%3Aopen+label%3Ahacktoberfest)
and our guide on how to create a [new Lens](https://renumics.com/docs/development/lenses).
diff --git a/README.md b/README.md
index 6c68d669..320f9e13 100644
--- a/README.md
+++ b/README.md
@@ -17,9 +17,10 @@
-Spotlight helps you to **understand unstructured datasets** fast. You can quickly create **interactive visualizations** and leverage data enrichments (e.g. embeddings, prediction, uncertainties) to **identify critical clusters** in your data.
+Spotlight helps you to **understand unstructured datasets** fast. You can quickly create **interactive visualizations** and leverage data enrichments (e.g. embeddings, prediction, uncertainties) to **identify critical clusters** in your data.
Spotlight supports most unstructured data types including **images, audio, text, videos, time-series and geometric data**. You can start from your existing dataframe:
+
And start Spotlight with just a few lines of code:
@@ -49,7 +50,7 @@ Machine learning and engineering teams use Spotlight to understand and communica
[Classification] |
Find Issues in Any Image Classification Dataset |
👨‍💻 📝 🕹️ |
-
+
Find data issues in the CIFAR-100 image dataset |
🕹️ |
@@ -91,7 +92,6 @@ Machine learning and engineering teams use Spotlight to understand and communica
-
## ⏱️ Quickstart
Get started by installing Spotlight and loading your first dataset.
@@ -132,12 +132,11 @@ ds = datasets.load_dataset('renumics/emodb-enriched', split='all')
layout= spotlight.layouts.debug_classification(label='gender', prediction='m1_gender_prediction', embedding='m1_embedding', features=['age', 'emotion'])
spotlight.show(ds, layout=layout)
```
+
Here, the data types are discovered automatically from the dataset and we use a pre-defined layout for model debugging. Custom layouts can be built programmatically or via the UI.
> The `datasets[audio]` package can be installed via pip.
-
-
#### Usage Tracking
We have added crash report and performance collection. We do NOT collect user data other than an anonymized Machine Id obtained by py-machineid, and only log our own actions. We do NOT collect folder names, dataset names, or row data of any kind only aggregate performance statistics like total time of a table_load, crash data, etc. Collecting Spotlight crashes will help us improve stability. To opt out of the crash report collection define an environment variable called `SPOTLIGHT_OPT_OUT` and set it to true. e.G.`export SPOTLIGHT_OPT_OUT=true`
@@ -150,9 +149,9 @@ We have added crash report and performance collection. We do NOT collect user da
## Learn more about unstructured data workflows
-- 🤗 [Huggingface](https://huggingface.co/renumics) example spaces and datasets
-- 🏀 [Playbook](https://renumics.com/docs/playbook/) for data-centric AI workflows
-- 🍰 [Sliceguard](https://github.com/Renumics/sliceguard) library for automatic slice detection
+- 🤗 [Huggingface](https://huggingface.co/renumics) example spaces and datasets
+- 🏀 [Playbook](https://renumics.com/docs/playbook/) for data-centric AI workflows
+- 🍰 [Sliceguard](https://github.com/Renumics/sliceguard) library for automatic slice detection
## Contribute
diff --git a/renumics/spotlight/data_source/data_source.py b/renumics/spotlight/data_source/data_source.py
index d06133f3..695acb5d 100644
--- a/renumics/spotlight/data_source/data_source.py
+++ b/renumics/spotlight/data_source/data_source.py
@@ -6,7 +6,6 @@
import pandas as pd
import numpy as np
-from pydantic.dataclasses import dataclass
from renumics.spotlight.dataset.exceptions import (
ColumnExistsError,
@@ -30,17 +29,6 @@ class ColumnMetadata:
tags: List[str] = dataclasses.field(default_factory=list)
-@dataclass
-class CellsUpdate:
- """
- A dataset's cell update.
- """
-
- value: Any
- author: str
- edited_at: str
-
-
class DataSource(ABC):
"""abstract base class for different data sources"""
@@ -61,7 +49,7 @@ def column_names(self) -> List[str]:
@abstractmethod
def intermediate_dtypes(self) -> DTypeMap:
"""
- The dtypes of intermediate values
+ The dtypes of intermediate values. Values for all columns must be filled.
"""
@property
@@ -94,7 +82,7 @@ def check_generation_id(self, generation_id: int) -> None:
@abstractmethod
def semantic_dtypes(self) -> DTypeMap:
"""
- Semantic dtypes for viewer.
+ Semantic dtypes for viewer. Entries for some columns may be missing.
"""
@abstractmethod
diff --git a/renumics/spotlight/data_store.py b/renumics/spotlight/data_store.py
index 2eb0108b..ba940254 100644
--- a/renumics/spotlight/data_store.py
+++ b/renumics/spotlight/data_store.py
@@ -21,13 +21,16 @@
DType,
DTypeMap,
EmbeddingDType,
+ array_dtype,
is_array_dtype,
is_audio_dtype,
is_category_dtype,
+ is_embedding_dtype,
is_file_dtype,
is_str_dtype,
is_mixed_dtype,
is_bytes_dtype,
+ is_window_dtype,
str_dtype,
audio_dtype,
image_dtype,
@@ -173,33 +176,32 @@ def _guess_dtype(self, col: str) -> DType:
return semantic_dtype
sample_values = self._data_source.get_column_values(col, slice(10))
- sample_dtypes = [_guess_value_dtype(value) for value in sample_values]
-
- try:
- mode_dtype = statistics.mode(sample_dtypes)
- except statistics.StatisticsError:
+ sample_dtypes: List[DType] = []
+ for value in sample_values:
+ guessed_dtype = _guess_value_dtype(value)
+ if guessed_dtype is not None:
+ sample_dtypes.append(guessed_dtype)
+ if not sample_dtypes:
return semantic_dtype
- return mode_dtype or semantic_dtype
+ mode_dtype = statistics.mode(sample_dtypes)
+ # Keep a window/embedding dtype only if all sampled values agree with it.
+ if is_window_dtype(mode_dtype) and any(
+ not is_window_dtype(dtype) for dtype in sample_dtypes
+ ):
+ return array_dtype
+ if is_embedding_dtype(mode_dtype) and any(
+ (not is_embedding_dtype(dtype)) or dtype.length != mode_dtype.length
+ for dtype in sample_dtypes
+ ):
+ return array_dtype
+
+ return mode_dtype
def _intermediate_to_semantic_dtype(intermediate_dtype: DType) -> DType:
if is_array_dtype(intermediate_dtype):
- if intermediate_dtype.shape is None:
- return intermediate_dtype
- if intermediate_dtype.shape == (2,):
- return window_dtype
- if intermediate_dtype.ndim == 1 and intermediate_dtype.shape[0] is not None:
- return EmbeddingDType(intermediate_dtype.shape[0])
- if intermediate_dtype.ndim == 1 and intermediate_dtype.shape[0] is None:
- return sequence_1d_dtype
- if intermediate_dtype.ndim == 2 and (
- intermediate_dtype.shape[0] == 2 or intermediate_dtype.shape[1] == 2
- ):
- return sequence_1d_dtype
- if intermediate_dtype.ndim == 3 and intermediate_dtype.shape[-1] in (1, 3, 4):
- return image_dtype
- return intermediate_dtype
+ return _guess_array_dtype(intermediate_dtype)
if is_file_dtype(intermediate_dtype):
return str_dtype
if is_mixed_dtype(intermediate_dtype):
@@ -262,5 +264,21 @@ def _guess_value_dtype(value: Any) -> Optional[DType]:
except (TypeError, ValueError):
pass
else:
- return ArrayDType(value.shape)
+ return _guess_array_dtype(ArrayDType(value.shape))
return None
+
+
+def _guess_array_dtype(dtype: ArrayDType) -> DType:
+    """Narrow a generic array dtype to a semantic dtype based on its shape."""
+    shape = dtype.shape
+    if shape is None:
+        return dtype
+    if shape == (2,):
+        return window_dtype
+    if dtype.ndim == 1:
+        return sequence_1d_dtype if shape[0] is None else EmbeddingDType(shape[0])
+    if dtype.ndim == 2 and 2 in (shape[0], shape[1]):
+        return sequence_1d_dtype
+    if dtype.ndim == 3 and shape[-1] in (1, 3, 4):
+        return image_dtype
+    return dtype
diff --git a/renumics/spotlight/dataset/__init__.py b/renumics/spotlight/dataset/__init__.py
index 99828ddf..e586499d 100644
--- a/renumics/spotlight/dataset/__init__.py
+++ b/renumics/spotlight/dataset/__init__.py
@@ -32,12 +32,7 @@
from typing_extensions import TypeGuard
from renumics.spotlight.__version__ import __version__
-from renumics.spotlight.io.pandas import (
- infer_dtypes,
- prepare_column,
- is_string_mask,
- stringify_columns,
-)
+from .pandas import create_typed_series, infer_dtypes, is_string_mask, prepare_column
from renumics.spotlight.typing import (
BoolType,
IndexType,
@@ -47,7 +42,6 @@
is_integer,
is_iterable,
)
-from renumics.spotlight.io.pandas import create_typed_series
from renumics.spotlight.dtypes.conversion import prepare_path_or_url
from renumics.spotlight import dtypes as spotlight_dtypes
@@ -738,7 +732,7 @@ def from_pandas(
df = df.reset_index(level=df.index.names) # type: ignore
else:
df = df.copy()
- df.columns = pd.Index(stringify_columns(df))
+ df.columns = pd.Index([str(column) for column in df.columns])
if dtypes is None:
dtypes = {}
diff --git a/renumics/spotlight/io/pandas.py b/renumics/spotlight/dataset/pandas.py
similarity index 86%
rename from renumics/spotlight/io/pandas.py
rename to renumics/spotlight/dataset/pandas.py
index 4cf84f9e..75ccf00f 100644
--- a/renumics/spotlight/io/pandas.py
+++ b/renumics/spotlight/dataset/pandas.py
@@ -1,30 +1,22 @@
"""
-This module contains helpers for importing `pandas.DataFrame`s.
+Helpers for converting between an H5 dataset and a `pandas.DataFrame`.
"""
-import ast
import os.path
import statistics
-from contextlib import suppress
-from typing import Any, Dict, List, Optional, Sequence, Union
+from typing import Any, Optional, Sequence, Union
import PIL.Image
import filetype
-import trimesh
import numpy as np
import pandas as pd
+import trimesh
-from renumics.spotlight.dtypes import (
- Audio,
- Embedding,
- Image,
- Mesh,
- Sequence1D,
- Video,
-)
-from renumics.spotlight.media.exceptions import UnsupportedDType
-from renumics.spotlight.typing import is_iterable, is_pathtype
from renumics.spotlight import dtypes
+from renumics.spotlight.io import prepare_hugging_face_dict, try_literal_eval
+from renumics.spotlight.media import Audio, Embedding, Image, Mesh, Sequence1D, Video
+from renumics.spotlight.typing import is_iterable, is_pathtype
+from .exceptions import InvalidDTypeError
def create_typed_series(
@@ -58,32 +50,62 @@ def create_typed_series(
return pd.Series([] if values is None else values, dtype=pandas_dtype)
-def is_empty(value: Any) -> bool:
- """
- Check if value is `NA` or an empty string.
+def prepare_column(column: pd.Series, dtype: dtypes.DType) -> pd.Series:
"""
- if is_iterable(value):
- # `pd.isna` with an iterable argument returns an iterable result. But
- # an iterable cannot be NA or empty string by default.
- return False
- return pd.isna(value) or value == ""
+ Convert a `pandas` column to the desired `dtype` and prepare some values,
+ but still as `pandas` column.
+
+ Args:
+ column: A `pandas` column to prepare.
+ dtype: Target data type.
+ Returns:
+ Prepared `pandas` column.
-def try_literal_eval(x: str) -> Any:
- """
- Try to evaluate a literal expression, otherwise return value as is.
+ Raises:
+ TypeError: If `dtype` is not a Spotlight data type.
"""
- with suppress(Exception):
- return ast.literal_eval(x)
- return x
+ column = column.copy()
+ if dtypes.is_category_dtype(dtype):
+ # We only support string/`NA` categories, but `pandas` allows more, so
+ # force categories to be strings (does not affect `NA`s).
+ return to_categorical(column, str_categories=True)
-def stringify_columns(df: pd.DataFrame) -> List[str]:
- """
- Convert `pandas.DataFrame`'s column names to strings, no matter which index
- is used.
- """
- return [str(column_name) for column_name in df.columns]
+ if dtypes.is_datetime_dtype(dtype):
+ # `errors="coerce"` will produce `NaT`s instead of failing.
+ return pd.to_datetime(column, errors="coerce")
+
+ if dtypes.is_str_dtype(dtype):
+ # Allow `NA`s, convert all other elements to strings.
+ return column.astype(str).mask(column.isna(), None) # type: ignore
+
+ if dtypes.is_bool_dtype(dtype):
+ return column.astype(bool)
+
+ if dtypes.is_int_dtype(dtype):
+ return column.astype(int)
+
+ if dtypes.is_float_dtype(dtype):
+ return column.astype(float)
+
+ # We explicitly don't want to change the original `DataFrame`.
+ with pd.option_context("mode.chained_assignment", None):
+ # We consider empty strings as `NA`s.
+ str_mask = is_string_mask(column)
+ column[str_mask] = column[str_mask].replace("", None)
+ na_mask = column.isna()
+
+ # When `pandas` reads a csv, arrays and lists are read as literal strings,
+ # try to interpret them.
+ str_mask = is_string_mask(column)
+ column[str_mask] = column[str_mask].apply(try_literal_eval)
+
+ if dtypes.is_filebased_dtype(dtype):
+ dict_mask = column.map(type) == dict
+ column[dict_mask] = column[dict_mask].apply(prepare_hugging_face_dict)
+
+ return column.mask(na_mask, None) # type: ignore
def infer_dtype(column: pd.Series) -> dtypes.DType:
@@ -225,7 +247,7 @@ def infer_dtypes(df: pd.DataFrame, dtype: Optional[dtypes.DTypeMap]) -> dtypes.D
if column_index not in inferred_dtype:
try:
column_type = infer_dtype(df[column_index])
- except UnsupportedDType:
+ except InvalidDTypeError:
column_type = dtypes.str_dtype
inferred_dtype[str(column_index)] = column_type
return inferred_dtype
@@ -255,73 +277,3 @@ def to_categorical(column: pd.Series, str_categories: bool = False) -> pd.Series
if str_categories:
return column.cat.rename_categories(column.cat.categories.astype(str))
return column
-
-
-def prepare_hugging_face_dict(x: Dict) -> Any:
- """
- Prepare HuggingFace format for files to be used in Spotlight.
- """
- if x.keys() != {"bytes", "path"}:
- return x
- blob = x["bytes"]
- if blob is not None:
- return blob
- return x["path"]
-
-
-def prepare_column(column: pd.Series, dtype: dtypes.DType) -> pd.Series:
- """
- Convert a `pandas` column to the desired `dtype` and prepare some values,
- but still as `pandas` column.
-
- Args:
- column: A `pandas` column to prepare.
- dtype: Target data type.
-
- Returns:
- Prepared `pandas` column.
-
- Raises:
- TypeError: If `dtype` is not a Spotlight data type.
- """
- column = column.copy()
-
- if dtypes.is_category_dtype(dtype):
- # We only support string/`NA` categories, but `pandas` can more, so
- # force categories to be strings (does not affect `NA`s).
- return to_categorical(column, str_categories=True)
-
- if dtypes.is_datetime_dtype(dtype):
- # `errors="coerce"` will produce `NaT`s instead of fail.
- return pd.to_datetime(column, errors="coerce")
-
- if dtypes.is_str_dtype(dtype):
- # Allow `NA`s, convert all other elements to strings.
- return column.astype(str).mask(column.isna(), None) # type: ignore
-
- if dtypes.is_bool_dtype(dtype):
- return column.astype(bool)
-
- if dtypes.is_int_dtype(dtype):
- return column.astype(int)
-
- if dtypes.is_float_dtype(dtype):
- return column.astype(float)
-
- # We explicitely don't want to change the original `DataFrame`.
- with pd.option_context("mode.chained_assignment", None):
- # We consider empty strings as `NA`s.
- str_mask = is_string_mask(column)
- column[str_mask] = column[str_mask].replace("", None)
- na_mask = column.isna()
-
- # When `pandas` reads a csv, arrays and lists are read as literal strings,
- # try to interpret them.
- str_mask = is_string_mask(column)
- column[str_mask] = column[str_mask].apply(try_literal_eval)
-
- if dtypes.is_filebased_dtype(dtype):
- dict_mask = column.map(type) == dict
- column[dict_mask] = column[dict_mask].apply(prepare_hugging_face_dict)
-
- return column.mask(na_mask, None) # type: ignore
diff --git a/renumics/spotlight/dtypes/__init__.py b/renumics/spotlight/dtypes/__init__.py
index 0e24ea10..63910215 100644
--- a/renumics/spotlight/dtypes/__init__.py
+++ b/renumics/spotlight/dtypes/__init__.py
@@ -9,6 +9,8 @@
__all__ = [
"CategoryDType",
+ "ArrayDType",
+ "EmbeddingDType",
"Sequence1DDType",
"bool_dtype",
"int_dtype",
@@ -36,6 +38,14 @@ def __init__(self, name: str):
def __str__(self) -> str:
return self.name
+ def __eq__(self, other: Any) -> bool:
+ if isinstance(other, DType):
+ return other._name == self._name
+ return False
+
+ def __hash__(self) -> int:
+ return hash(self._name)
+
@property
def name(self) -> str:
return self._name
@@ -53,8 +63,10 @@ def __init__(
self, categories: Optional[Union[Iterable[str], Dict[str, int]]] = None
):
super().__init__("Category")
- if isinstance(categories, dict) or categories is None:
- self._categories = categories
+ if isinstance(categories, dict):
+ self._categories = dict(sorted(categories.items(), key=lambda x: x[1]))
+ elif categories is None:
+ self._categories = None
else:
self._categories = {
category: code for code, category in enumerate(categories)
@@ -71,6 +83,20 @@ def __init__(
category: code for code, category in self._inverted_categories.items()
}
+ def __eq__(self, other: Any) -> bool:
+ if isinstance(other, CategoryDType):
+ return other._categories == self._categories
+ return False
+
+ def __hash__(self) -> int:
+ if self._categories is None:
+ return hash(self._name) ^ hash(None)
+ return (
+ hash(self._name)
+ ^ hash(tuple(self._categories.keys()))
+ ^ hash(tuple(self._categories.values()))
+ )
+
@property
def categories(self) -> Optional[Dict[str, int]]:
return self._categories
@@ -91,6 +117,14 @@ def __init__(self, shape: Optional[Tuple[Optional[int], ...]] = None):
super().__init__("array")
self.shape = shape
+ def __eq__(self, other: Any) -> bool:
+ if isinstance(other, ArrayDType):
+ return other.shape == self.shape
+ return False
+
+ def __hash__(self) -> int:
+ return hash(self._name) ^ hash(self.shape)
+
@property
def ndim(self) -> int:
if self.shape is None:
@@ -111,6 +145,14 @@ def __init__(self, length: Optional[int] = None):
raise ValueError(f"Length must be non-negative, but {length} received.")
self.length = length
+ def __eq__(self, other: Any) -> bool:
+ if isinstance(other, EmbeddingDType):
+ return other.length == self.length
+ return False
+
+ def __hash__(self) -> int:
+ return hash(self._name) ^ hash(self.length)
+
class Sequence1DDType(DType):
"""
@@ -125,6 +167,14 @@ def __init__(self, x_label: str = "x", y_label: str = "y"):
self.x_label = x_label
self.y_label = y_label
+ def __eq__(self, other: Any) -> bool:
+ if isinstance(other, Sequence1DDType):
+ return other.x_label == self.x_label and other.y_label == self.y_label
+ return False
+
+ def __hash__(self) -> int:
+ return hash(self._name) ^ hash(self.x_label) ^ hash(self.y_label)
+
ALIASES: Dict[Any, DType] = {}
diff --git a/renumics/spotlight/io/__init__.py b/renumics/spotlight/io/__init__.py
index 2d2a6d26..8162a843 100644
--- a/renumics/spotlight/io/__init__.py
+++ b/renumics/spotlight/io/__init__.py
@@ -1,6 +1,9 @@
"""
Reading and writing of different data formats.
"""
+import ast
+from contextlib import suppress
+from typing import Any
from .audio import (
get_format_codec,
@@ -19,6 +22,8 @@
decode_gltf_arrays,
encode_gltf_array,
)
+from .huggingface import prepare_hugging_face_dict
+
__all__ = [
"get_format_codec",
@@ -34,4 +39,15 @@
"check_gltf",
"decode_gltf_arrays",
"encode_gltf_array",
+ "prepare_hugging_face_dict",
+ "try_literal_eval",
]
+
+
+def try_literal_eval(x: str) -> Any:
+    """Parse `x` as a Python literal; return `x` unchanged if that fails."""
+    result: Any = x
+    with suppress(Exception):
+        # Anything that is not a valid literal deliberately falls through.
+        result = ast.literal_eval(x)
+    return result
diff --git a/renumics/spotlight/io/huggingface.py b/renumics/spotlight/io/huggingface.py
new file mode 100644
index 00000000..06c0d441
--- /dev/null
+++ b/renumics/spotlight/io/huggingface.py
@@ -0,0 +1,16 @@
+"""
+Helpers for HuggingFace formats.
+"""
+from typing import Any, Dict
+
+
+def prepare_hugging_face_dict(x: Dict) -> Any:
+    """
+    Unwrap a HuggingFace `{"bytes", "path"}` file dict for use in Spotlight.
+    """
+    if x.keys() == {"bytes", "path"}:
+        # Prefer the embedded blob and fall back to the path.
+        payload = x["bytes"]
+        return x["path"] if payload is None else payload
+    # Any other dict is passed through untouched.
+    return x
diff --git a/renumics/spotlight_plugins/core/pandas_data_source.py b/renumics/spotlight_plugins/core/pandas_data_source.py
index 430a69ca..4404a14d 100644
--- a/renumics/spotlight_plugins/core/pandas_data_source.py
+++ b/renumics/spotlight_plugins/core/pandas_data_source.py
@@ -7,14 +7,9 @@
import numpy as np
import pandas as pd
import datasets
-from renumics.spotlight import dtypes
-from renumics.spotlight.io.pandas import (
- infer_dtype,
- prepare_hugging_face_dict,
- stringify_columns,
- try_literal_eval,
-)
+from renumics.spotlight import dtypes
+from renumics.spotlight.io import prepare_hugging_face_dict, try_literal_eval
from renumics.spotlight.data_source import (
datasource,
ColumnMetadata,
@@ -23,7 +18,6 @@
from renumics.spotlight.backend.exceptions import DatasetColumnsNotUnique
from renumics.spotlight.dataset.exceptions import ColumnNotExistsError
from renumics.spotlight.data_source.exceptions import InvalidDataSource
-from renumics.spotlight.dtypes import DTypeMap
@datasource(pd.DataFrame)
@@ -41,6 +35,7 @@ class PandasDataSource(DataSource):
_uid: str
_df: pd.DataFrame
_name: str
+ _intermediate_dtypes: dtypes.DTypeMap
def __init__(self, source: Union[Path, pd.DataFrame]):
if isinstance(source, Path):
@@ -108,7 +103,7 @@ def __init__(self, source: Union[Path, pd.DataFrame]):
@property
def column_names(self) -> List[str]:
- return stringify_columns(self._df)
+ return [str(column) for column in self._df.columns]
@property
def df(self) -> pd.DataFrame:
@@ -118,18 +113,15 @@ def df(self) -> pd.DataFrame:
return self._df.copy()
@property
- def intermediate_dtypes(self) -> DTypeMap:
+ def intermediate_dtypes(self) -> dtypes.DTypeMap:
return self._intermediate_dtypes
def __len__(self) -> int:
return len(self._df)
@property
- def semantic_dtypes(self) -> DTypeMap:
- return {
- str(column_name): infer_dtype(self.df[column_name])
- for column_name in self.df
- }
+ def semantic_dtypes(self) -> dtypes.DTypeMap:
+ return {}
def get_generation_id(self) -> int:
return self._generation_id
@@ -167,12 +159,14 @@ def get_column_values(
if pd.api.types.is_categorical_dtype(column):
return column.cat.codes
if pd.api.types.is_string_dtype(column):
- values = column.to_numpy()
- na_mask = column.isna()
- values[na_mask] = None
- return values
+ column = column.astype(object).mask(column.isna(), None)
+ str_mask = column.map(type) == str
+ column[str_mask] = column[str_mask].apply(try_literal_eval)
+ dict_mask = column.map(type) == dict
+ column[dict_mask] = column[dict_mask].apply(prepare_hugging_face_dict)
+ return column.to_numpy()
if pd.api.types.is_object_dtype(column):
- column = column.mask(column.isna(), None)
+ column = column.astype(object).mask(column.isna(), None)
str_mask = column.map(type) == str
column[str_mask] = column[str_mask].apply(try_literal_eval)
dict_mask = column.map(type) == dict
@@ -222,5 +216,4 @@ def _determine_intermediate_dtype(column: pd.Series) -> dtypes.DType:
return dtypes.datetime_dtype
if pd.api.types.is_string_dtype(column):
return dtypes.str_dtype
- else:
- return dtypes.mixed_dtype
+ return dtypes.mixed_dtype
diff --git a/tests/integration/dataset/test_dataset.py b/tests/integration/dataset/test_dataset.py
index 10f7ed75..fe28c8fc 100644
--- a/tests/integration/dataset/test_dataset.py
+++ b/tests/integration/dataset/test_dataset.py
@@ -26,7 +26,7 @@
from renumics.spotlight.dataset import escape_dataset_name, unescape_dataset_name
from renumics.spotlight import dtypes
from renumics.spotlight.dataset.typing import OutputType
-from renumics.spotlight.io.pandas import infer_dtype
+from renumics.spotlight.dataset.pandas import infer_dtype
from .conftest import ColumnData
from .helpers import get_append_column_fn_name
from ..helpers import approx