Skip to content

Commit

Permalink
BUG Better in-out support with different arrays type (#681)
Browse files Browse the repository at this point in the history
  • Loading branch information
chkoar authored Feb 12, 2020
1 parent 3ede269 commit 4ba2803
Show file tree
Hide file tree
Showing 8 changed files with 146 additions and 103 deletions.
49 changes: 7 additions & 42 deletions imblearn/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from sklearn.utils.multiclass import check_classification_targets

from .utils import check_sampling_strategy, check_target_type
from .utils._validation import ArraysTransformer


class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
Expand Down Expand Up @@ -72,6 +73,7 @@ def fit_resample(self, X, y):
The corresponding label of `X_resampled`.
"""
check_classification_targets(y)
arrays_transformer = ArraysTransformer(X, y)
X, y, binarize_y = self._check_X_y(X, y)

self.sampling_strategy_ = check_sampling_strategy(
Expand All @@ -80,21 +82,10 @@ def fit_resample(self, X, y):

output = self._fit_resample(X, y)

if self._X_columns is not None or self._y_name is not None:
import pandas as pd

if self._X_columns is not None:
X_ = pd.DataFrame(output[0], columns=self._X_columns)
X_ = X_.astype(self._X_dtypes)
else:
X_ = output[0]

y_ = (label_binarize(output[1], np.unique(y))
if binarize_y else output[1])

if self._y_name is not None:
y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name)

X_, y_ = arrays_transformer.transform(output[0], y_)
return (X_, y_) if len(output) == 2 else (X_, y_, output[2])

# define an alias for back-compatibility
Expand Down Expand Up @@ -137,22 +128,6 @@ def __init__(self, sampling_strategy="auto"):
self.sampling_strategy = sampling_strategy

def _check_X_y(self, X, y, accept_sparse=None):
if hasattr(X, "loc"):
# store information to build dataframe
self._X_columns = X.columns
self._X_dtypes = X.dtypes
else:
self._X_columns = None
self._X_dtypes = None

if hasattr(y, "loc"):
# store information to build a series
self._y_name = y.name
self._y_dtype = y.dtype
else:
self._y_name = None
self._y_dtype = None

if accept_sparse is None:
accept_sparse = ["csr", "csc"]
y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
Expand Down Expand Up @@ -265,8 +240,8 @@ def fit_resample(self, X, y):
y_resampled : array-like of shape (n_samples_new,)
The corresponding label of `X_resampled`.
"""
# store the columns name to reconstruct a dataframe
self._columns = X.columns if hasattr(X, "loc") else None
arrays_transformer = ArraysTransformer(X, y)

if self.validate:
check_classification_targets(y)
X, y, binarize_y = self._check_X_y(
Expand All @@ -280,22 +255,12 @@ def fit_resample(self, X, y):
output = self._fit_resample(X, y)

if self.validate:
if self._X_columns is not None or self._y_name is not None:
import pandas as pd

if self._X_columns is not None:
X_ = pd.DataFrame(output[0], columns=self._X_columns)
X_ = X_.astype(self._X_dtypes)
else:
X_ = output[0]

y_ = (label_binarize(output[1], np.unique(y))
if binarize_y else output[1])

if self._y_name is not None:
y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name)

X_, y_ = arrays_transformer.transform(output[0], y_)
return (X_, y_) if len(output) == 2 else (X_, y_, output[2])

return output

def _fit_resample(self, X, y):
Expand Down
17 changes: 0 additions & 17 deletions imblearn/over_sampling/_random_over_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from ..utils import Substitution
from ..utils._docstring import _random_state_docstring


@Substitution(
sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
random_state=_random_state_docstring,
Expand Down Expand Up @@ -75,22 +74,6 @@ def __init__(self, sampling_strategy="auto", random_state=None):
self.random_state = random_state

def _check_X_y(self, X, y):
if hasattr(X, "loc"):
# store information to build dataframe
self._X_columns = X.columns
self._X_dtypes = X.dtypes
else:
self._X_columns = None
self._X_dtypes = None

if hasattr(y, "loc"):
# store information to build a series
self._y_name = y.name
self._y_dtype = y.dtype
else:
self._y_name = None
self._y_dtype = None

y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
force_all_finite=False)
Expand Down
16 changes: 0 additions & 16 deletions imblearn/over_sampling/_smote.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,22 +891,6 @@ def _check_X_y(self, X, y):
"""Overwrite the checking to let pass some string for categorical
features.
"""
if hasattr(X, "loc"):
# store information to build dataframe
self._X_columns = X.columns
self._X_dtypes = X.dtypes
else:
self._X_columns = None
self._X_dtypes = None

if hasattr(y, "loc"):
# store information to build a series
self._y_name = y.name
self._y_dtype = y.dtype
else:
self._y_name = None
self._y_dtype = None

y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None)
return X, y, binarize_y
Expand Down
6 changes: 4 additions & 2 deletions imblearn/under_sampling/_prototype_selection/_nearmiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,8 +247,10 @@ def _fit_resample(self, X, y):
_safe_indexing(X, minority_class_indices)
)
idx_vec_farthest = np.unique(idx_vec.reshape(-1))
X_class_selected = _safe_indexing(X_class, idx_vec_farthest)
y_class_selected = _safe_indexing(y_class, idx_vec_farthest)
X_class_selected = _safe_indexing(
X_class, idx_vec_farthest)
y_class_selected = _safe_indexing(
y_class, idx_vec_farthest)

dist_vec, idx_vec = self.nn_.kneighbors(
X_class_selected, n_neighbors=self.nn_.n_neighbors
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,22 +81,6 @@ def __init__(
self.replacement = replacement

def _check_X_y(self, X, y):
if hasattr(X, "loc"):
# store information to build dataframe
self._X_columns = X.columns
self._X_dtypes = X.dtypes
else:
self._X_columns = None
self._X_dtypes = None

if hasattr(y, "loc"):
# store information to build a series
self._y_name = y.name
self._y_dtype = y.dtype
else:
self._y_name = None
self._y_dtype = None

y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
force_all_finite=False)
Expand Down
36 changes: 36 additions & 0 deletions imblearn/utils/_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,42 @@
TARGET_KIND = ("binary", "multiclass", "multilabel-indicator")


class ArraysTransformer:
    """Convert sampler output arrays back to their original types.

    The constructor records, for both `X` and `y`, the class name of the
    input together with its `columns`, `name` and `dtypes` attributes (each
    is `None` when the input does not expose it). `transform` uses these
    records to rebuild a list, a pandas DataFrame or a pandas Series; for
    any other input type the output is returned unchanged.

    Parameters
    ----------
    X : object
        The feature container originally passed by the caller.

    y : object
        The target container originally passed by the caller.
    """

    def __init__(self, X, y):
        self.x_props = self._gets_props(X)
        self.y_props = self._gets_props(y)

    def transform(self, X, y):
        """Convert `X` and `y` to the types recorded at construction.

        Parameters
        ----------
        X : array-like
            Resampled features.

        y : array-like
            Resampled targets.

        Returns
        -------
        X, y : tuple
            Each converted according to the properties of the matching
            constructor argument.
        """
        X = self._transfrom_one(X, self.x_props)
        y = self._transfrom_one(y, self.y_props)
        return X, y

    def _gets_props(self, array):
        """Collect the properties needed to rebuild `array`'s type later."""
        return {
            # The type is tracked by class name so pandas never has to be
            # imported just to inspect the input.
            "type": array.__class__.__name__,
            "columns": getattr(array, "columns", None),
            "name": getattr(array, "name", None),
            "dtypes": getattr(array, "dtypes", None),
        }

    def _transfrom_one(self, array, props):
        """Convert a single array according to the recorded `props`."""
        type_ = props["type"].lower()

        if type_ == "list":
            # `tolist` exists on numpy arrays; fall back to `list()` for any
            # other iterable so the conversion does not raise AttributeError.
            to_list = getattr(array, "tolist", None)
            return to_list() if to_list is not None else list(array)

        if type_ not in ("dataframe", "series"):
            return array

        # pandas is imported lazily: it is only needed for pandas inputs.
        import pandas as pd

        if type_ == "dataframe":
            ret = pd.DataFrame(array, columns=props["columns"])
            return ret.astype(props["dtypes"])

        return pd.Series(array, dtype=props["dtypes"], name=props["name"])


def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
"""Check the objects is consistent to be a NN.
Expand Down
58 changes: 48 additions & 10 deletions imblearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def _yield_sampler_checks(name, Estimator):
yield check_samplers_sampling_strategy_fit_resample
yield check_samplers_sparse
yield check_samplers_pandas
yield check_samplers_list
yield check_samplers_multiclass_ova
yield check_samplers_preserve_dtype
yield check_samplers_sample_indices
Expand Down Expand Up @@ -242,8 +243,9 @@ def check_samplers_pandas(name, Sampler):
weights=[0.2, 0.3, 0.5],
random_state=0,
)
X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
y_pd = pd.Series(y, name="class")
X_df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
y_df = pd.DataFrame(y)
y_s = pd.Series(y, name="class")
sampler = Sampler()
if isinstance(Sampler(), NearMiss):
samplers = [Sampler(version=version) for version in (1, 2, 3)]
Expand All @@ -253,16 +255,52 @@ def check_samplers_pandas(name, Sampler):

for sampler in samplers:
set_random_state(sampler)
X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd)
X_res_df, y_res_s = sampler.fit_resample(X_df, y_s)
X_res_df, y_res_df = sampler.fit_resample(X_df, y_df)
X_res, y_res = sampler.fit_resample(X, y)

# check that we return a pandas dataframe if a dataframe was given in
assert isinstance(X_res_pd, pd.DataFrame)
assert isinstance(y_res_pd, pd.Series)
assert X_pd.columns.to_list() == X_res_pd.columns.to_list()
assert y_pd.name == y_res_pd.name
assert_allclose(X_res_pd.to_numpy(), X_res)
assert_allclose(y_res_pd.to_numpy(), y_res)
# check that we return the same type for dataframes or series types
assert isinstance(X_res_df, pd.DataFrame)
assert isinstance(y_res_df, pd.DataFrame)
assert isinstance(y_res_s, pd.Series)

assert X_df.columns.to_list() == X_res_df.columns.to_list()
assert y_df.columns.to_list() == y_res_df.columns.to_list()
assert y_s.name == y_res_s.name

assert_allclose(X_res_df.to_numpy(), X_res)
assert_allclose(y_res_df.to_numpy().ravel(), y_res)
assert_allclose(y_res_s.to_numpy(), y_res)


def check_samplers_list(name, Sampler):
    """Check that the samplers can handle plain lists.

    Resampling list inputs must return lists whose content matches the
    result obtained from the equivalent numpy arrays.
    """
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    X_list, y_list = X.tolist(), y.tolist()

    default_sampler = Sampler()
    # NearMiss is exercised once per `version` value (1, 2 and 3).
    candidates = (
        [Sampler(version=version) for version in (1, 2, 3)]
        if isinstance(default_sampler, NearMiss)
        else [default_sampler]
    )

    for candidate in candidates:
        set_random_state(candidate)
        X_res, y_res = candidate.fit_resample(X, y)
        X_res_list, y_res_list = candidate.fit_resample(X_list, y_list)

        for resampled in (X_res_list, y_res_list):
            assert isinstance(resampled, list)

        assert_allclose(X_res, X_res_list)
        assert_allclose(y_res, y_res_list)


def check_samplers_multiclass_ova(name, Sampler):
Expand Down
51 changes: 51 additions & 0 deletions imblearn/utils/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from imblearn.utils import check_neighbors_object
from imblearn.utils import check_sampling_strategy
from imblearn.utils import check_target_type
from imblearn.utils._validation import ArraysTransformer

multiclass_target = np.array([1] * 50 + [2] * 100 + [3] * 25)
binary_target = np.array([1] * 25 + [0] * 100)
Expand Down Expand Up @@ -315,3 +316,53 @@ def test_sampling_strategy_check_order(
sampling_strategy, y, sampling_type
)
assert sampling_strategy_ == expected_result


def test_arrays_transformer_plain_list():
    """Inputs given as plain lists must come back as lists."""
    X = np.array([[0, 0], [1, 1]])
    y = np.array([[0, 0], [1, 1]])

    transformer = ArraysTransformer(X.tolist(), y.tolist())
    X_out, y_out = transformer.transform(X, y)

    for converted in (X_out, y_out):
        assert isinstance(converted, list)


def test_arrays_transformer_numpy():
    """Inputs given as numpy arrays must come back as numpy arrays."""
    X = np.array([[0, 0], [1, 1]])
    y = np.array([[0, 0], [1, 1]])

    transformer = ArraysTransformer(X, y)
    X_out, y_out = transformer.transform(X, y)

    for converted in (X_out, y_out):
        assert isinstance(converted, np.ndarray)


def test_arrays_transformer_pandas():
    """DataFrame and Series inputs must be rebuilt with their metadata."""
    pd = pytest.importorskip("pandas")

    X = np.array([[0, 0], [1, 1]])
    y = np.array([0, 1])

    X_df = pd.DataFrame(X, columns=["a", "b"]).astype(int)
    y_df = pd.DataFrame(y, columns=["target", ]).astype(int)
    y_s = pd.Series(y, name="target", dtype=int)

    # X and y both given as DataFrames
    X_res, y_res = ArraysTransformer(X_df, y_df).transform(X, y)
    for restored, reference in ((X_res, X_df), (y_res, y_df)):
        assert isinstance(restored, pd.DataFrame)
        assert_array_equal(restored.columns, reference.columns)
        assert_array_equal(restored.dtypes, reference.dtypes)

    # X given as a DataFrame, y given as a Series
    _, y_res = ArraysTransformer(X_df, y_s).transform(X, y)
    assert isinstance(y_res, pd.Series)
    assert_array_equal(y_res.name, y_s.name)
    assert_array_equal(y_res.dtype, y_s.dtype)

0 comments on commit 4ba2803

Please sign in to comment.