From 472937840913ef27a11a0a1a66544e8d52477b1b Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Mon, 23 Oct 2023 14:02:26 +0200 Subject: [PATCH 1/2] Fix checking scalar dtypes for data alignment before embedding --- renumics/spotlight/backend/tasks/reduction.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/renumics/spotlight/backend/tasks/reduction.py b/renumics/spotlight/backend/tasks/reduction.py index 00207c38..9a8d71c1 100644 --- a/renumics/spotlight/backend/tasks/reduction.py +++ b/renumics/spotlight/backend/tasks/reduction.py @@ -10,7 +10,7 @@ from renumics.spotlight.dataset.exceptions import ColumnNotExistsError from renumics.spotlight.data_store import DataStore -from renumics.spotlight.dtypes import is_category_dtype, is_embedding_dtype +from renumics.spotlight import dtypes SEED = 42 @@ -35,7 +35,7 @@ def align_data( for column_name in column_names: dtype = data_store.dtypes[column_name] column_values = data_store.get_converted_values(column_name, indices) - if is_embedding_dtype(dtype): + if dtypes.is_embedding_dtype(dtype): embedding_length = max( 0 if x is None else len(cast(np.ndarray, x)) for x in column_values ) @@ -49,17 +49,19 @@ def align_data( ] ) ) - elif is_category_dtype(dtype): + elif dtypes.is_category_dtype(dtype): na_mask = np.array(column_values) == -1 one_hot_values = preprocessing.label_binarize( column_values, classes=sorted(set(column_values).difference({-1})) # type: ignore ).astype(float) one_hot_values[na_mask] = np.nan aligned_values.append(one_hot_values) - elif dtype in (int, bool, float): + elif dtypes.is_scalar_dtype(dtype): aligned_values.append(np.array(column_values, dtype=float)) else: - raise ColumnNotEmbeddable + raise ColumnNotEmbeddable( + "Column '{column_name}' of type {dtype} is not embeddable." 
+ ) data = np.hstack([col.reshape((len(indices), -1)) for col in aligned_values]) mask = ~pd.isna(data).any(axis=1) From 9225698eb0c57eb4254c2a19a8c2e1fb5005354c Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Tue, 24 Oct 2023 13:27:21 +0200 Subject: [PATCH 2/2] Convert embeddings to C layout after PCA --- renumics/spotlight/backend/tasks/reduction.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/renumics/spotlight/backend/tasks/reduction.py b/renumics/spotlight/backend/tasks/reduction.py index 9a8d71c1..7de7158f 100644 --- a/renumics/spotlight/backend/tasks/reduction.py +++ b/renumics/spotlight/backend/tasks/reduction.py @@ -6,7 +6,6 @@ import numpy as np import pandas as pd -from sklearn import preprocessing from renumics.spotlight.dataset.exceptions import ColumnNotExistsError from renumics.spotlight.data_store import DataStore @@ -27,6 +26,7 @@ def align_data( """ Align data from table's columns, remove `NaN`'s. """ + from sklearn import preprocessing if not column_names or not indices: return np.empty(0, np.float64), [] @@ -60,7 +60,7 @@ def align_data( aligned_values.append(np.array(column_values, dtype=float)) else: raise ColumnNotEmbeddable( - "Column '{column_name}' of type {dtype} is not embeddable." + f"Column '{column_name}' of type {dtype} is not embeddable." 
) data = np.hstack([col.reshape((len(indices), -1)) for col in aligned_values]) @@ -120,7 +120,7 @@ def compute_pca( try: data, indices = align_data(data_store, column_names, indices) - except (ColumnNotExistsError, ValueError): + except (ColumnNotExistsError, ColumnNotEmbeddable): return np.empty(0, np.float64), [] if data.size == 0: return np.empty(0, np.float64), [] @@ -131,5 +131,6 @@ def compute_pca( elif normalization == "robust standardize": data = preprocessing.RobustScaler(copy=False).fit_transform(data) reducer = decomposition.PCA(n_components=2, copy=False, random_state=SEED) - embeddings = reducer.fit_transform(data) + # `fit_transform` returns Fortran-ordered array. + embeddings = np.ascontiguousarray(reducer.fit_transform(data)) return embeddings, indices