Merge branch 'sigmamaster'

# Conflicts:
#	dist/lightautoml-0.3.8b1-py3-none-any.whl
sb-ai-lab · Nov 2, 2023 · dffed94
2 parents 43b1733 + 1b8fe03
commit dffed94
Show file tree

Hide file tree

Showing 8 changed files with 2,053 additions and 440 deletions.
diff --git a/Tutorial_13_ABtesting.ipynb b/Tutorial_13_ABtesting.ipynb
diff --git a/examples/tutorials/Tutorial_12_Matching.ipynb b/examples/tutorials/Tutorial_12_Matching.ipynb
diff --git a/lightautoml/addons/hypex/ABTesting/ab_tester.py b/lightautoml/addons/hypex/ABTesting/ab_tester.py
diff --git a/lightautoml/addons/hypex/__init__.py b/lightautoml/addons/hypex/__init__.py
@@ -2,4 +2,4 @@
 from .matcher import Matcher
 
 
-__all__ = ["Matcher"]
+__all__ = ["Matcher"]
diff --git a/lightautoml/addons/hypex/tests/__init__.py b/lightautoml/addons/hypex/tests/__init__.py
@@ -1,3 +1,3 @@
 from ..matcher import Matcher
 
-__all__ = ["Matcher"]
+__all__ = ["Matcher"]
diff --git a/lightautoml/addons/hypex/tests/test_aa.py b/lightautoml/addons/hypex/tests/test_aa.py
@@ -1,78 +1,72 @@
 import pandas as pd
+import pytest
+
 from lightautoml.addons.hypex.ABTesting.ab_tester import AATest
 from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data
 
 
-def test_aa_simple():
-    data = create_test_data(rs=52)
-    info_col = "user_id"
-    iterations = 20
+@pytest.fixture
+def data():
+    return create_test_data(rs=52)
 
-    model = AATest(
-        data=data,
-        target_fields=["pre_spends", "post_spends"],
-        info_cols=info_col
-    )
-    res, datas_dict = model.search_dist_uniform_sampling(iterations=iterations)
+
+@pytest.fixture
+def iterations():
+    return 20
+
+
+@pytest.fixture
+def info_col():
+    return "user_id"
+
+
+def test_aa_simple(data, iterations, info_col):
+    model = AATest(target_fields=["pre_spends", "post_spends"], info_cols=info_col)
+    res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)
 
     assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
-    assert res.shape[0] == iterations, "Metrics dataframe contains more or less rows with random states " \
-                                       "(#rows should be equal #of experiments"
-    assert info_col not in model.data, "Info_col is take part in experiment, it should be deleted in preprocess"
+    assert res.shape[0] == iterations, (
+        "Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
+    )
     assert isinstance(datas_dict, dict), "Result is not dict"
     assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
-    assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), \
-        "Columns in the result are not the same as columns in initial data "
+    assert all(data.columns) == all(
+        datas_dict[0].drop(columns=["group"]).columns
+    ), "Columns in the result are not the same as columns in initial data "
 
 
-def test_aa_group():
-    data = create_test_data(rs=52)
-    info_col = "user_id"
-    group_cols = 'industry'
-    iterations = 20
+def test_aa_group(data, iterations, info_col):
+    group_cols = "industry"
 
-    model = AATest(
-        data=data,
-        target_fields=["pre_spends", "post_spends"],
-        info_cols=info_col,
-        group_cols=group_cols
-    )
-    res, datas_dict = model.search_dist_uniform_sampling(iterations=iterations)
+    model = AATest(target_fields=["pre_spends", "post_spends"], info_cols=info_col, group_cols=group_cols)
+    res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)
 
     assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
-    assert res.shape[0] == iterations, "Metrics dataframe contains more or less rows with random states " \
-                                       "(#rows should be equal #of experiments"
-    assert info_col not in model.data, "Info_col is take part in experiment, it should be deleted in preprocess"
+    assert res.shape[0] == iterations, (
+        "Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
+    )
     assert isinstance(datas_dict, dict), "Result is not dict"
     assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
-    assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), "Columns in the result are not " \
-                                                                                    "the same as columns in initial " \
-                                                                                    "data "
+    assert all(data.columns) == all(datas_dict[0].drop(columns=["group"]).columns), (
+        "Columns in the result are not " "the same as columns in initial " "data "
+    )
 
 
-def test_aa_quantfields():
-    data = create_test_data(rs=52)
-    info_col = "user_id"
-    group_cols = 'industry'
-    quant_field = 'gender'
-    iterations = 20
+def test_aa_quantfields(data, iterations, info_col):
+    group_cols = "industry"
+    quant_field = "gender"
 
     model = AATest(
-        data=data,
-        target_fields=["pre_spends", "post_spends"],
-        info_cols=info_col,
-        group_cols=group_cols,
-        quant_field=quant_field
+        target_fields=["pre_spends", "post_spends"], info_cols=info_col, group_cols=group_cols, quant_field=quant_field
     )
-    res, datas_dict = model.search_dist_uniform_sampling(iterations=iterations)
+    res, datas_dict = model.search_dist_uniform_sampling(data, iterations=iterations)
 
     assert isinstance(res, pd.DataFrame), "Metrics are not dataframes"
-    assert res.shape[0] == iterations, "Metrics dataframe contains more or less rows with random states " \
-                                       "(#rows should be equal #of experiments"
-    assert info_col not in model.data, "Info_col is take part in experiment, it should be deleted in preprocess"
+    assert res.shape[0] == iterations, (
+        "Metrics dataframe contains more or less rows with random states " "(#rows should be equal #of experiments"
+    )
     assert isinstance(datas_dict, dict), "Result is not dict"
     assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations"
-    assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), "Columns in the result are not " \
-                                                                                    "the same as columns in initial " \
-                                                                                    "data "
-
+    assert all(data.columns) == all(datas_dict[0].drop(columns=["group"]).columns), (
+        "Columns in the result are not " "the same as columns in initial " "data "
+    )
diff --git a/lightautoml/addons/hypex/tests/test_ab.py b/lightautoml/addons/hypex/tests/test_ab.py
@@ -1,69 +1,92 @@
 from lightautoml.addons.hypex.ABTesting.ab_tester import ABTest
-from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data
-
-
-# def test_split_ab():
-#     data = create_test_data()
-#     half_data = int(data.shape[0] / 2)
-#     data['group'] = ['test'] * half_data + ['control'] * half_data
-#
-#     group_field = 'group'
-#
-#     model = ABTest()
-#     splitted_data = model.split_ab(data, group_field)
-#
-#     assert isinstance(splitted_data, dict), "result of split_ab is not dict"
-#     assert len(splitted_data) == 2, "split_ab contains not of 2 values"
-#     assert list(splitted_data.keys()) == ['test', 'control'], "changed keys in result of split_ab"
-#
-#
-# def test_calc_difference():
-#     data = create_test_data()
-#     half_data = int(data.shape[0] / 2)
-#     data['group'] = ['test'] * half_data + ['control'] * half_data
-#
-#     group_field = 'group'
-#     target_field = 'post_spends'
-#
-#     model = ABTest()
-#     splitted_data = model.split_ab(data, group_field)
-#     differences = model.calc_difference(splitted_data, target_field)
-#
-#     assert isinstance(differences, dict), "result of calc_difference is not dict"
-
-
-def test_calc_p_value():
-    data = create_test_data()
-    half_data = int(data.shape[0] / 2)
-    data['group'] = ['test'] * half_data + ['control'] * half_data
-
-    group_field = 'group'
-    target_field = 'post_spends'
-
-    model = ABTest()
-    splitted_data = model.split_ab(data, group_field)
-    pvalues = model.calc_p_value(splitted_data, target_field)
-
-    assert isinstance(pvalues, dict), "result of calc_p_value is not dict"
-
-
-def test_execute():
-    data = create_test_data()
-    half_data = int(data.shape[0] / 2)
-    data['group'] = ['test'] * half_data + ['control'] * half_data
-
-    target_field = 'post_spends'
-    target_field_before = 'pre_spends'
-    group_field = 'group'
-
-    model = ABTest()
-    result = model.execute(
-        data=data,
-        target_field=target_field,
-        target_field_before=target_field_before,
-        group_field=group_field
+
+import pytest
+import pandas as pd
+import numpy as np
+
+DATA_SIZE = 100
+
+
+@pytest.fixture
+def ab_test():
+    return ABTest()
+
+
+@pytest.fixture
+def data():
+    # Generate synthetic data for group A
+    group_a_data = np.random.normal(loc=10, scale=2, size=DATA_SIZE)
+    # Generate synthetic data for group B
+    group_b_data = np.random.normal(loc=12, scale=2, size=DATA_SIZE)
+    group_bp_data = np.random.normal(loc=10, scale=2, size=DATA_SIZE * 2)
+    return pd.DataFrame(
+        {
+            "group": ["control"] * len(group_a_data) + ["test"] * len(group_b_data),
+            "value": list(group_a_data) + list(group_b_data),
+            "previous_value": group_bp_data,
+        }
     )
 
-    assert isinstance(result, dict), "result of func execution is not dict"
-    assert len(result) == 3, "result of execution is changed, len of dict was 3"
-    assert list(result.keys()) == ['size', 'difference', 'p_value']
+
+@pytest.fixture
+def target_field():
+    return "value"
+
+
+@pytest.fixture
+def group_field():
+    return "group"
+
+
+@pytest.fixture
+def previous_value():
+    return "previous_value"
+
+
+@pytest.fixture
+def alpha():
+    return 0.05
+
+
+def test_split_ab(ab_test, data, group_field):
+    result = ab_test.split_ab(data, group_field)
+    assert len(result["test"]) == DATA_SIZE
+    assert len(result["control"]) == DATA_SIZE
+
+
+def test_calc_difference(ab_test, data, group_field, target_field, previous_value):
+    splitted_data = ab_test.split_ab(data, group_field)
+    result = ab_test.calc_difference(splitted_data, target_field, previous_value)
+    assert 1 < result["ate"] < 3
+    assert 1 < result["cuped"] < 3
+    assert 1 < result["diff_in_diff"] < 3
+
+
+def test_calc_difference_with_previous_value(ab_test, data, group_field, target_field, previous_value):
+    ab_test.calc_difference_method = "ate"
+    splitted_data = ab_test.split_ab(data, group_field)
+    result = ab_test.calc_difference(splitted_data, previous_value)
+    assert -1 < result["ate"] < 1
+
+
+def test_calc_p_value(ab_test, data, group_field, target_field, previous_value, alpha):
+    splitted_data = ab_test.split_ab(data, group_field)
+    result = ab_test.calc_p_value(splitted_data, target_field)
+    assert result["t_test"] < alpha
+    assert result["mann_whitney"] < alpha
+
+    result = ab_test.calc_p_value(splitted_data, previous_value)
+    assert result["t_test"] > alpha
+    assert result["mann_whitney"] > alpha
+
+
+def test_execute(ab_test, data, group_field, target_field, previous_value, alpha):
+    result = ab_test.execute(data, target_field, group_field, previous_value)
+    print(result)
+    assert result["size"]["test"] == DATA_SIZE
+    assert result["size"]["control"] == DATA_SIZE
+    assert 1 < result["difference"]["ate"] < 3
+    assert 1 < result["difference"]["cuped"] < 3
+    assert 1 < result["difference"]["diff_in_diff"] < 3
+    assert result["p_value"]["t_test"] < alpha
+    assert result["p_value"]["mann_whitney"] < alpha
diff --git a/lightautoml/addons/hypex/tests/test_matcher.py b/lightautoml/addons/hypex/tests/test_matcher.py
@@ -5,7 +5,7 @@
 from lightautoml.addons.hypex import Matcher
 from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data
 
-ROOT = Path('.').absolute().parents[0]
+ROOT = Path(".").absolute().parents[0]
 sys.path.append(str(ROOT))
 
 
@@ -38,7 +38,7 @@ def test_matcher_pos():
         "p-val",
         "ci_lower",
         "ci_upper",
-        "post_spends"
+        "post_spends",
     ], "format of results is changed: columns in report"
     assert model.results["p-val"].values[0] <= 0.05, "p-value on ATE is greater than 0.1"
     assert model.results["p-val"].values[1] <= 0.05, "p-value on ATC is greater than 0.1"
@@ -71,7 +71,7 @@ def test_matcher_group_pos():
         "p-val",
         "ci_lower",
         "ci_upper",
-        "post_spends"
+        "post_spends",
     ], "format of results is changed: columns in report ['effect_size', 'std_err', 'p-val', 'ci_lower', 'ci_upper']"
     assert model.results["p-val"].values[0] <= 0.05, "p-value on ATE is greater than 0.1"
     assert model.results["p-val"].values[1] <= 0.05, "p-value on ATC is greater than 0.1"