Skip to content

Commit

Permalink
Merge branch 'sigmamaster'
Browse files Browse the repository at this point in the history
# Conflicts:
#	dist/lightautoml-0.3.8b1-py3-none-any.whl
  • Loading branch information
Тихомиров Дмитрий Алексеевич committed Nov 2, 2023
2 parents 43b1733 + 1b8fe03 commit dffed94
Show file tree
Hide file tree
Showing 8 changed files with 2,053 additions and 440 deletions.
768 changes: 648 additions & 120 deletions Tutorial_13_ABtesting.ipynb

Large diffs are not rendered by default.

1,195 changes: 1,129 additions & 66 deletions examples/tutorials/Tutorial_12_Matching.ipynb

Large diffs are not rendered by default.

267 changes: 136 additions & 131 deletions lightautoml/addons/hypex/ABTesting/ab_tester.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lightautoml/addons/hypex/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
from .matcher import Matcher


__all__ = ["Matcher"]
__all__ = ["Matcher"]
2 changes: 1 addition & 1 deletion lightautoml/addons/hypex/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from ..matcher import Matcher

__all__ = ["Matcher"]
__all__ = ["Matcher"]
98 changes: 46 additions & 52 deletions lightautoml/addons/hypex/tests/test_aa.py
Original file line number Diff line number Diff line change
@@ -1,78 +1,72 @@
import pandas as pd
import pytest

from lightautoml.addons.hypex.ABTesting.ab_tester import AATest
from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data


def test_aa_simple():
data = create_test_data(rs=52)
info_col = "user_id"
iterations = 20
@pytest.fixture
def data():
return create_test_data(rs=52)

model = AATest(
data=data,
target_fields=["pre_spends", "post_spends"],
info_cols=info_col
)
res, datas_dict = model.search_dist_uniform_sampling(iterations=iterations)

@pytest.fixture
def iterations():
return 20


@pytest.fixture
def info_col():
return "user_id"


def test_aa_simple(data, iterations, info_col):
model = AATest(target_fields=["pre_spends", "post_spends"], info_cols=info_col)
res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)

assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
assert res.shape[0] == iterations, "Metrics dataframe contains more or less rows with random states " \
"(#rows should be equal #of experiments"
assert info_col not in model.data, "Info_col is take part in experiment, it should be deleted in preprocess"
assert res.shape[0] == iterations, (
"Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
)
assert isinstance(datas_dict, dict), "Result is not dict"
assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), \
"Columns in the result are not the same as columns in initial data "
assert all(data.columns) == all(
datas_dict[0].drop(columns=["group"]).columns
), "Columns in the result are not the same as columns in initial data "


def test_aa_group():
data = create_test_data(rs=52)
info_col = "user_id"
group_cols = 'industry'
iterations = 20
def test_aa_group(data, iterations, info_col):
group_cols = "industry"

model = AATest(
data=data,
target_fields=["pre_spends", "post_spends"],
info_cols=info_col,
group_cols=group_cols
)
res, datas_dict = model.search_dist_uniform_sampling(iterations=iterations)
model = AATest(target_fields=["pre_spends", "post_spends"], info_cols=info_col, group_cols=group_cols)
res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)

assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
assert res.shape[0] == iterations, "Metrics dataframe contains more or less rows with random states " \
"(#rows should be equal #of experiments"
assert info_col not in model.data, "Info_col is take part in experiment, it should be deleted in preprocess"
assert res.shape[0] == iterations, (
"Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
)
assert isinstance(datas_dict, dict), "Result is not dict"
assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), "Columns in the result are not " \
"the same as columns in initial " \
"data "
assert all(data.columns) == all(datas_dict[0].drop(columns=["group"]).columns), (
"Columns in the result are not " "the same as columns in initial " "data "
)


def test_aa_quantfields():
data = create_test_data(rs=52)
info_col = "user_id"
group_cols = 'industry'
quant_field = 'gender'
iterations = 20
def test_aa_quantfields(data, iterations, info_col):
group_cols = "industry"
quant_field = "gender"

model = AATest(
data=data,
target_fields=["pre_spends", "post_spends"],
info_cols=info_col,
group_cols=group_cols,
quant_field=quant_field
target_fields=["pre_spends", "post_spends"], info_cols=info_col, group_cols=group_cols, quant_field=quant_field
)
res, datas_dict = model.search_dist_uniform_sampling(iterations=iterations)
res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)

assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
assert res.shape[0] == iterations, "Metrics dataframe contains more or less rows with random states " \
"(#rows should be equal #of experiments"
assert info_col not in model.data, "Info_col is take part in experiment, it should be deleted in preprocess"
assert res.shape[0] == iterations, (
"Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
)
assert isinstance(datas_dict, dict), "Result is not dict"
assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), "Columns in the result are not " \
"the same as columns in initial " \
"data "

assert all(data.columns) == all(datas_dict[0].drop(columns=["group"]).columns), (
"Columns in the result are not " "the same as columns in initial " "data "
)
155 changes: 89 additions & 66 deletions lightautoml/addons/hypex/tests/test_ab.py
Original file line number Diff line number Diff line change
@@ -1,69 +1,92 @@
from lightautoml.addons.hypex.ABTesting.ab_tester import ABTest
from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data


# def test_split_ab():
# data = create_test_data()
# half_data = int(data.shape[0] / 2)
# data['group'] = ['test'] * half_data + ['control'] * half_data
#
# group_field = 'group'
#
# model = ABTest()
# splitted_data = model.split_ab(data, group_field)
#
# assert isinstance(splitted_data, dict), "result of split_ab is not dict"
# assert len(splitted_data) == 2, "split_ab contains not of 2 values"
# assert list(splitted_data.keys()) == ['test', 'control'], "changed keys in result of split_ab"
#
#
# def test_calc_difference():
# data = create_test_data()
# half_data = int(data.shape[0] / 2)
# data['group'] = ['test'] * half_data + ['control'] * half_data
#
# group_field = 'group'
# target_field = 'post_spends'
#
# model = ABTest()
# splitted_data = model.split_ab(data, group_field)
# differences = model.calc_difference(splitted_data, target_field)
#
# assert isinstance(differences, dict), "result of calc_difference is not dict"


def test_calc_p_value():
data = create_test_data()
half_data = int(data.shape[0] / 2)
data['group'] = ['test'] * half_data + ['control'] * half_data

group_field = 'group'
target_field = 'post_spends'

model = ABTest()
splitted_data = model.split_ab(data, group_field)
pvalues = model.calc_p_value(splitted_data, target_field)

assert isinstance(pvalues, dict), "result of calc_p_value is not dict"


def test_execute():
data = create_test_data()
half_data = int(data.shape[0] / 2)
data['group'] = ['test'] * half_data + ['control'] * half_data

target_field = 'post_spends'
target_field_before = 'pre_spends'
group_field = 'group'

model = ABTest()
result = model.execute(
data=data,
target_field=target_field,
target_field_before=target_field_before,
group_field=group_field

import pytest
import pandas as pd
import numpy as np

DATA_SIZE = 100


@pytest.fixture
def ab_test():
return ABTest()


@pytest.fixture
def data():
# Generate synthetic data for group A
group_a_data = np.random.normal(loc=10, scale=2, size=DATA_SIZE)
# Generate synthetic data for group B
group_b_data = np.random.normal(loc=12, scale=2, size=DATA_SIZE)
group_bp_data = np.random.normal(loc=10, scale=2, size=DATA_SIZE * 2)
return pd.DataFrame(
{
"group": ["control"] * len(group_a_data) + ["test"] * len(group_b_data),
"value": list(group_a_data) + list(group_b_data),
"previous_value": group_bp_data,
}
)

assert isinstance(result, dict), "result of func execution is not dict"
assert len(result) == 3, "result of execution is changed, len of dict was 3"
assert list(result.keys()) == ['size', 'difference', 'p_value']

@pytest.fixture
def target_field():
return "value"


@pytest.fixture
def group_field():
return "group"


@pytest.fixture
def previous_value():
return "previous_value"


@pytest.fixture
def alpha():
return 0.05


def test_split_ab(ab_test, data, group_field):
result = ab_test.split_ab(data, group_field)
assert len(result["test"]) == DATA_SIZE
assert len(result["control"]) == DATA_SIZE


def test_calc_difference(ab_test, data, group_field, target_field, previous_value):
splitted_data = ab_test.split_ab(data, group_field)
result = ab_test.calc_difference(splitted_data, target_field, previous_value)
assert 1 < result["ate"] < 3
assert 1 < result["cuped"] < 3
assert 1 < result["diff_in_diff"] < 3


def test_calc_difference_with_previous_value(ab_test, data, group_field, target_field, previous_value):
ab_test.calc_difference_method = "ate"
splitted_data = ab_test.split_ab(data, group_field)
result = ab_test.calc_difference(splitted_data, previous_value)
assert -1 < result["ate"] < 1


def test_calc_p_value(ab_test, data, group_field, target_field, previous_value, alpha):
splitted_data = ab_test.split_ab(data, group_field)
result = ab_test.calc_p_value(splitted_data, target_field)
assert result["t_test"] < alpha
assert result["mann_whitney"] < alpha

result = ab_test.calc_p_value(splitted_data, previous_value)
assert result["t_test"] > alpha
assert result["mann_whitney"] > alpha


def test_execute(ab_test, data, group_field, target_field, previous_value, alpha):
result = ab_test.execute(data, target_field, group_field, previous_value)
print(result)
assert result["size"]["test"] == DATA_SIZE
assert result["size"]["control"] == DATA_SIZE
assert 1 < result["difference"]["ate"] < 3
assert 1 < result["difference"]["cuped"] < 3
assert 1 < result["difference"]["diff_in_diff"] < 3
assert result["p_value"]["t_test"] < alpha
assert result["p_value"]["mann_whitney"] < alpha
6 changes: 3 additions & 3 deletions lightautoml/addons/hypex/tests/test_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from lightautoml.addons.hypex import Matcher
from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data

ROOT = Path('.').absolute().parents[0]
ROOT = Path(".").absolute().parents[0]
sys.path.append(str(ROOT))


Expand Down Expand Up @@ -38,7 +38,7 @@ def test_matcher_pos():
"p-val",
"ci_lower",
"ci_upper",
"post_spends"
"post_spends",
], "format of results is changed: columns in report"
assert model.results["p-val"].values[0] <= 0.05, "p-value on ATE is greater than 0.1"
assert model.results["p-val"].values[1] <= 0.05, "p-value on ATC is greater than 0.1"
Expand Down Expand Up @@ -71,7 +71,7 @@ def test_matcher_group_pos():
"p-val",
"ci_lower",
"ci_upper",
"post_spends"
"post_spends",
], "format of results is changed: columns in report ['effect_size', 'std_err', 'p-val', 'ci_lower', 'ci_upper']"
assert model.results["p-val"].values[0] <= 0.05, "p-value on ATE is greater than 0.1"
assert model.results["p-val"].values[1] <= 0.05, "p-value on ATC is greater than 0.1"
Expand Down

0 comments on commit dffed94

Please sign in to comment.