Skip to content

Commit

Permalink
Merge branch 'sigmamaster'
Browse files Browse the repository at this point in the history
# Conflicts:
#	dist/lightautoml-0.3.8b1-py3-none-any.whl
  • Loading branch information
Тихомиров Дмитрий Алексеевич committed Oct 5, 2023
2 parents d8e3796 + 197a952 commit 7b8e49e
Show file tree
Hide file tree
Showing 8 changed files with 2,432 additions and 30 deletions.
1,592 changes: 1,592 additions & 0 deletions Tutorial_13_ABtesting.ipynb

Large diffs are not rendered by default.

Empty file.
572 changes: 572 additions & 0 deletions lightautoml/addons/hypex/ABTesting/ab_tester.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions lightautoml/addons/hypex/algorithms/faiss_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,8 +491,8 @@ def matching_quality(self, df_matched) -> Dict[str, Union[Dict[str, float], floa
else:
logger.info("Estimating quality of matching")

psi_columns = self.columns_match
psi_columns.remove(self.treatment)
psi_columns = set(self.columns_match)
psi_columns = list(psi_columns - set([self.treatment] + self.outcomes))
psi_data, ks_data, smd_data = matching_quality(
df_matched, self.treatment, sorted(self.features_quality), sorted(psi_columns), self.silent
)
Expand Down
20 changes: 11 additions & 9 deletions lightautoml/addons/hypex/matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def _preprocessing_data(self):
"""Converts categorical features into dummy variables."""
info_col = self.info_col if self.info_col is not None else []
group_col = [self.group_col] if self.group_col is not None else []
columns_to_drop = info_col + group_col + self.outcomes
columns_to_drop = info_col + group_col + self.outcomes + [self.treatment]
if self.base_filtration:
filtered_features = nan_filtration(self.input_data.drop(columns=columns_to_drop))
self.dropped_features = [f for f in self.input_data.columns if f not in filtered_features + columns_to_drop]
Expand Down Expand Up @@ -314,22 +314,24 @@ def match_no_rep(self, threshold: float = 0.1) -> pd.DataFrame:
X = X.drop(columns=self.info_col)

index_matched = MatcherNoReplacement(X, a, self.weights).match()
index_matched = np.concatenate(index_matched.loc[1].iloc[self.input_data[a == 1].index].matches.values)
filtred_matches = index_matched.loc[1].iloc[self.input_data[a == 1].index].matches[index_matched.loc[1].iloc[self.input_data[a == 1].index].matches.apply(lambda x: x != [])]

if self.weights is not None:
weighted_features = [f for f in self.weights.keys()]
index_dict = dict()
for w in weighted_features:
source = self.input_data.loc[index_matched][w].values
target = self.input_data[a == 1][w].values
source = self.input_data.loc[np.concatenate(filtred_matches.values)][w].values
target = self.input_data.loc[filtred_matches.index.to_list()][w].values
index = abs(source - target) <= abs(source) * threshold
index_dict.update({w: index})
index_filtered = sum(index_dict.values()) == len(self.weights)
matched_data = pd.concat(
[self.input_data[a == 1].iloc[index_filtered], self.input_data.loc[index_matched].iloc[index_filtered]]
[self.input_data.loc[filtred_matches.index.to_list()].iloc[index_filtered],
self.input_data.loc[np.concatenate(filtred_matches.values)].iloc[index_filtered]]
)
else:
matched_data = pd.concat([self.input_data[a == 1], self.input_data.loc[index_matched]])
matched_data = pd.concat([self.input_data.loc[filtred_matches.index.to_list()],
self.input_data.loc[np.concatenate(filtred_matches.values)]])
return matched_data

def lama_feature_select(self) -> pd.DataFrame:
Expand Down Expand Up @@ -440,11 +442,11 @@ def validate_result(
Validates estimated effect:
1) by replacing real treatment with random placebo treatment.
Estimated effect must be droped to zero, p-val < 0.05;
Estimated effect must be droped to zero, p-val > 0.05;
2) by adding random feature (`random_feature`). Estimated effect shouldn't change
significantly, p-val > 0.05;
significantly, p-val < 0.05;
3) estimates effect on subset of data (default fraction is 0.8). Estimated effect
shouldn't change significantly, p-val > 0.05.
shouldn't change significantly, p-val < 0.05.
Args:
refuter:
Expand Down
78 changes: 78 additions & 0 deletions lightautoml/addons/hypex/tests/test_aa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import pandas as pd
from lightautoml.addons.hypex.ABTesting.ab_tester import AATest
from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data


def test_aa_simple():
    """Smoke-test AATest with an info column: metrics shape, info-col removal, column preservation."""
    data = create_test_data(rs=52)
    info_col = "user_id"
    iterations = 20

    model = AATest(
        data=data,
        target_fields=["pre_spends", "post_spends"],
        info_cols=info_col
    )
    res, datas_dict = model.search_dist_uniform_sampling(iterations=iterations)

    assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
    assert res.shape[0] == iterations, "Metrics dataframe contains more or less rows with random states " \
                                       "(#rows should be equal #of experiments"
    assert info_col not in model.data, "Info_col is take part in experiment, it should be deleted in preprocess"
    assert isinstance(datas_dict, dict), "Result is not dict"
    assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
    # BUG FIX: the original `all(data.columns) == all(...)` reduces each side to a boolean
    # (truthiness of every column name), so it always passes. Compare the actual column sets.
    assert sorted(data.columns) == sorted(datas_dict[0].drop(columns=['group']).columns), \
        "Columns in the result are not the same as columns in initial data "


def test_aa_group():
    """Smoke-test AATest with info and group columns: metrics shape, info-col removal, column preservation."""
    data = create_test_data(rs=52)
    info_col = "user_id"
    group_cols = 'industry'
    iterations = 20

    model = AATest(
        data=data,
        target_fields=["pre_spends", "post_spends"],
        info_cols=info_col,
        group_cols=group_cols
    )
    res, datas_dict = model.search_dist_uniform_sampling(iterations=iterations)

    assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
    assert res.shape[0] == iterations, "Metrics dataframe contains more or less rows with random states " \
                                       "(#rows should be equal #of experiments"
    assert info_col not in model.data, "Info_col is take part in experiment, it should be deleted in preprocess"
    assert isinstance(datas_dict, dict), "Result is not dict"
    assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
    # BUG FIX: the original `all(data.columns) == all(...)` reduces each side to a boolean
    # (truthiness of every column name), so it always passes. Compare the actual column sets.
    assert sorted(data.columns) == sorted(datas_dict[0].drop(columns=['group']).columns), \
        "Columns in the result are not the same as columns in initial data "


def test_aa_quantfields():
    """Smoke-test AATest with info, group and quant columns: metrics shape, info-col removal, column preservation."""
    data = create_test_data(rs=52)
    info_col = "user_id"
    group_cols = 'industry'
    quant_field = 'gender'
    iterations = 20

    model = AATest(
        data=data,
        target_fields=["pre_spends", "post_spends"],
        info_cols=info_col,
        group_cols=group_cols,
        quant_field=quant_field
    )
    res, datas_dict = model.search_dist_uniform_sampling(iterations=iterations)

    assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
    assert res.shape[0] == iterations, "Metrics dataframe contains more or less rows with random states " \
                                       "(#rows should be equal #of experiments"
    assert info_col not in model.data, "Info_col is take part in experiment, it should be deleted in preprocess"
    assert isinstance(datas_dict, dict), "Result is not dict"
    assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
    # BUG FIX: the original `all(data.columns) == all(...)` reduces each side to a boolean
    # (truthiness of every column name), so it always passes. Compare the actual column sets.
    assert sorted(data.columns) == sorted(datas_dict[0].drop(columns=['group']).columns), \
        "Columns in the result are not the same as columns in initial data "

69 changes: 69 additions & 0 deletions lightautoml/addons/hypex/tests/test_ab.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from lightautoml.addons.hypex.ABTesting.ab_tester import ABTest
from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data


# def test_split_ab():
# data = create_test_data()
# half_data = int(data.shape[0] / 2)
# data['group'] = ['test'] * half_data + ['control'] * half_data
#
# group_field = 'group'
#
# model = ABTest()
# splitted_data = model.split_ab(data, group_field)
#
# assert isinstance(splitted_data, dict), "result of split_ab is not dict"
# assert len(splitted_data) == 2, "split_ab contains not of 2 values"
# assert list(splitted_data.keys()) == ['test', 'control'], "changed keys in result of split_ab"
#
#
# def test_calc_difference():
# data = create_test_data()
# half_data = int(data.shape[0] / 2)
# data['group'] = ['test'] * half_data + ['control'] * half_data
#
# group_field = 'group'
# target_field = 'post_spends'
#
# model = ABTest()
# splitted_data = model.split_ab(data, group_field)
# differences = model.calc_difference(splitted_data, target_field)
#
# assert isinstance(differences, dict), "result of calc_difference is not dict"


def test_calc_p_value():
    """Check that ABTest.calc_p_value returns a dict for an even test/control split."""
    data = create_test_data()
    half = data.shape[0] // 2
    data['group'] = ['test'] * half + ['control'] * half

    model = ABTest()
    groups = model.split_ab(data, 'group')
    pvalues = model.calc_p_value(groups, 'post_spends')

    assert isinstance(pvalues, dict), "result of calc_p_value is not dict"


def test_execute():
    """Check that ABTest.execute returns the expected three-key result dict."""
    data = create_test_data()
    half = data.shape[0] // 2
    data['group'] = ['test'] * half + ['control'] * half

    model = ABTest()
    result = model.execute(
        data=data,
        target_field='post_spends',
        target_field_before='pre_spends',
        group_field='group'
    )

    assert isinstance(result, dict), "result of func execution is not dict"
    assert len(result) == 3, "result of execution is changed, len of dict was 3"
    assert list(result.keys()) == ['size', 'difference', 'p_value']
127 changes: 108 additions & 19 deletions lightautoml/addons/hypex/utils/tutorial_data_creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,104 @@
import pandas as pd
import sys
from pathlib import Path
from typing import Iterable, Union

ROOT = Path('.').absolute().parents[0]
sys.path.append(str(ROOT))


def create_test_data(num_users: int = 10000, file_name: str = None):
def set_nans(
    data: pd.DataFrame,
    na_step: Union[Iterable[int], int] = None,
    nan_cols: Union[Iterable[str], str] = None
):
    """Fill some values with NaN (mutates ``data`` in place and returns it).

    Args:
        data: input dataframe
        na_step:
            num or list of nums of period to make NaN (step of range)
            If list - iterates accordingly order of columns
        nan_cols:
            name of one or several columns to fill with NaN
            If list - iterates accordingly order of na_step

    Returns:
        data: dataframe with some NaNs
    """
    if (nan_cols is not None) or (na_step is not None):
        # normalize na_step to a list of ints
        if na_step is None:
            na_step = [10]
            print(f'No na_step specified: set to {na_step}')
        elif not isinstance(na_step, Iterable):
            na_step = [na_step]
        else:
            na_step = list(na_step)

        # normalize nan_cols to a list of column names.
        # BUG FIX: a plain str is Iterable, so the original left it unwrapped and later
        # indexed it character-by-character; wrap strings explicitly first.
        if nan_cols is None:
            nan_cols = list(data.columns)
            print('No nan_cols specified. Setting NaNs applied to all columns')
        elif isinstance(nan_cols, str):
            nan_cols = [nan_cols]
        elif not isinstance(nan_cols, Iterable):
            nan_cols = [nan_cols]
        else:
            nan_cols = list(nan_cols)

        # align lengths of the two lists
        if len(na_step) > len(nan_cols):
            na_step = na_step[:len(nan_cols)]
            print('Length of na_step is bigger than length of columns. Used only first values')
        elif len(na_step) < len(nan_cols):
            na_step = na_step + [na_step[-1]] * (len(nan_cols) - len(na_step))
            print('Length of na_step is less than length of columns. Used last value several times')

        # column i gets NaN in every `period`-th row, starting at offset i
        nans_indexes = [list(range(i, len(data), period)) for i, period in enumerate(na_step)]

        for i in range(len(nan_cols)):
            try:
                data.loc[nans_indexes[i], nan_cols[i]] = np.nan
            except KeyError:
                print(f'There is no column {nan_cols[i]} in data. No nans in this column will be added.')
    else:
        print('No NaN added')

    return data


def create_test_data(
num_users: int = 10000,
na_step: Union[Iterable[int], int] = None,
nan_cols: Union[Iterable[str], str] = None,
file_name: str = None,
rs = None
):
"""Creates data for tutorial.
Args:
num_users: num of strings
na_step:
num or list of nums of period to make NaN (step of range)
If list - iterates accordingly order of columns
nan_cols:
name of one or several columns to fill with NaN
If list - iterates accordingly order of na_step
file_name: name of file to save; doesn't save file if None
Returns:
data: dataframe with
"""
if rs is not None:
np.random.seed(rs)

if (nan_cols is not None) and isinstance(nan_cols, str):
nan_cols = [nan_cols]
# Simulating dataset with known effect size
num_months = 12

# signup_months == 0 means customer did not sign up
signup_months = np.random.choice(np.arange(1, num_months), num_users) * np.random.randint(0, 2, size=num_users)

df = pd.DataFrame(
data = pd.DataFrame(
{
"user_id": np.repeat(np.arange(num_users), num_months),
"signup_month": np.repeat(signup_months, num_months), # signup month == 0 means customer did not sign up
Expand All @@ -23,19 +109,19 @@ def create_test_data(num_users: int = 10000, file_name: str = None):
)

# A customer is in the treatment group if and only if they signed up
df["treat"] = df["signup_month"] > 0
data["treat"] = data["signup_month"] > 0

# Simulating an effect of month (monotonically decreasing--customers buy less later in the year)
df["spend"] = df["spend"] - df["month"] * 10
data["spend"] = data["spend"] - data["month"] * 10

# Simulating a simple treatment effect of 100
after_signup = (df["signup_month"] < df["month"]) & (df["treat"])
df.loc[after_signup, "spend"] = df[after_signup]["spend"] + 100
after_signup = (data["signup_month"] < data["month"]) & (data["treat"])
data.loc[after_signup, "spend"] = data[after_signup]["spend"] + 100

# Setting the signup month (for ease of analysis)
i = 3
df_i_signupmonth = (
df[df.signup_month.isin([0, i])]
data = (
data[data.signup_month.isin([0, i])]
.groupby(["user_id", "signup_month", "treat"])
.apply(
lambda x: pd.Series(
Expand All @@ -46,25 +132,28 @@ def create_test_data(num_users: int = 10000, file_name: str = None):
)

# Additional category features
gender_i = np.random.choice(a=[0, 1], size=df_i_signupmonth.user_id.nunique())
gender_i = np.random.choice(a=[0, 1], size=data.user_id.nunique())
gender = [["M", "F"][i] for i in gender_i]

age = np.random.choice(a=range(18, 70), size=df_i_signupmonth.user_id.nunique())
age = np.random.choice(a=range(18, 70), size=data.user_id.nunique())

industry_i = np.random.choice(a=range(1, 3), size=df_i_signupmonth.user_id.nunique())
industry_i = np.random.choice(a=range(1, 3), size=data.user_id.nunique())
industry_names = ["Finance", "E-commerce", "Logistics"]
industry = [industry_names[i] for i in industry_i]

df_i_signupmonth["age"] = age
df_i_signupmonth["gender"] = gender
df_i_signupmonth["industry"] = industry
df_i_signupmonth["industry"] = df_i_signupmonth["industry"].astype("str")
df_i_signupmonth["treat"] = df_i_signupmonth["treat"].astype(int)
data["age"] = age
data["gender"] = gender
data["industry"] = industry
data["industry"] = data["industry"].astype("str")
data["treat"] = data["treat"].astype(int)

# input nans in data if needed
data = set_nans(data, na_step, nan_cols)

if file_name is not None:
df_i_signupmonth.to_csv(ROOT / f"{file_name}.csv", index=False)
data.to_csv(ROOT / f"{file_name}.csv", index=False)

return df_i_signupmonth
return data


create_test_data(num_users=10_000, file_name="Tutorial_data")
# create_test_data(num_users=10_000, file_name="Tutorial_data")

0 comments on commit 7b8e49e

Please sign in to comment.