From 309e0942a0104b2174795de5826a43b23767447a Mon Sep 17 00:00:00 2001 From: 20730989 Date: Wed, 2 Aug 2023 16:42:03 +0300 Subject: [PATCH 01/20] ab_spliter init --- lightautoml/addons/hypex/ab_spliter.py | 410 +++++++++++++++++++++++++ 1 file changed, 410 insertions(+) create mode 100644 lightautoml/addons/hypex/ab_spliter.py diff --git a/lightautoml/addons/hypex/ab_spliter.py b/lightautoml/addons/hypex/ab_spliter.py new file mode 100644 index 00000000..13d94028 --- /dev/null +++ b/lightautoml/addons/hypex/ab_spliter.py @@ -0,0 +1,410 @@ +# v: 0.2.1a +import warnings +from abc import ABC, abstractmethod +from pathlib import Path +from sklearn.utils import shuffle +from typing import Iterable, Union, Optional, List, Dict + +from tqdm.auto import tqdm + +import pandas as pd +import numpy as np +import scipy.stats as stats +from scipy.stats import norm, ttest_ind, ks_2samp + + +def calc_mde( + test_group: pd.Series, + control_group: pd.Series, + reliability: float = 0.95, + power: float = 0.8, +) -> float: + """ + Minimum detectable effect + Минимальный эффект, который можно статистически обосновать при сравнении двух групп + Методика рачета описана в Campaign Perfomance Management – Методика A/B-тестирования v.2.0 (стр.29) + + :param test_group: целевая группа + :param control_group: контрольная группа + :param reliability: уровень статистической достоверности. 
Обычно равен 0.95 + :param power: мощность статистического критерия + :return: минимально детерминируемый эффект + """ + m = stats.norm.ppf((1 + reliability) / 2) + stats.norm.ppf(power) + + n_test, n_control = len(test_group), len(control_group) + proportion = n_test / (n_test + n_control) + p = np.sqrt(1 / (proportion * (1 - proportion))) + + var_test, var_control = test_group.var(ddof=1), control_group.var(ddof=1) + s = np.sqrt((var_test / n_test) + (var_control / n_control)) + + return m * p * s + + +def calc_sample_size( + test_group: pd.Series, + control_group: pd.Series, + mde, + significance: float = 0.05, + power: float = 0.8, +) -> float: + """ + Минимально требуемуе количество объектов тестирования в общей группе + Методика расчета описана в Campaign Perfomance Management – Методика A/B-тестирования v.2.0 (стр.14) + + :param test_group: целевая группа + :param control_group: контрольная группа + :param mde: минимальное детерменируемое значение + :param significance: уровень статистической значимости - вероятность ошибки первого рода (обычно 0.05) + :param power: мощность статистического критерия + :return: минимальный размер общей группы + """ + test_std = test_group.std() + control_std = control_group.std() + + test_proportion = len(test_group) / (len(test_group) + len(control_group)) + control_proportion = 1 - test_proportion + + d = ((norm.ppf(1 - significance / 2) + norm.ppf(power)) / mde) ** 2 + + s = test_std ** 2 / test_proportion + control_std ** 2 / control_proportion + + return d * s + + +# --------------------- Classes --------------------- +class ABSplitter: + """ + Класс разделителя на A|B группы + """ + + def __init__(self, mode="simple", by_group=None, quant_field=None): + self.mode = mode + self.by_group = by_group + self.quant_field = quant_field + + @staticmethod + def merge_groups( + test_group: Union[Iterable[pd.DataFrame], pd.DataFrame], + control_group: Union[Iterable[pd.DataFrame], pd.DataFrame], + ): + """ + Объединяет test и 
control в один df + + :return: объединенный датафрейм + """ + if not ( + isinstance(test_group, pd.DataFrame) + and isinstance(test_group, pd.DataFrame) + ): + test_group = pd.concat(test_group, ignore_index=True) + control_group = pd.concat(control_group, ignore_index=True) + + test_group.loc[:, "group"] = "test" + control_group.loc[:, "group"] = "control" + return pd.concat([test_group, control_group], ignore_index=True) + + def __simple_mode(self, data, random_state): + result = { + "test_indexes": [], + "control_indexes": [] + } + + if self.quant_field: + random_ids = shuffle(data[self.quant_field].unique(), random_state=random_state) + edge = len(random_ids) // 2 + result["test_indexes"] = list(data[data[self.quant_field].isin(random_ids[:edge])].index) + result["control_indexes"] = list(data[data[self.quant_field].isin(random_ids[edge:])].index) + + else: + addition_indexes = list(shuffle(data.index, random_state=random_state)) + edge = len(addition_indexes) // 2 + result["test_indexes"] = addition_indexes[:edge] + result["control_indexes"] = addition_indexes[edge:] + + return result + + def split_ab(self, data, random_state: int = None) -> Dict: + result = { + "test_indexes": [], + "control_indexes": [] + } + + if self.by_group: + groups = data.groupby() + for _, gd in groups: + if self.mode not in ("balanced", "simple"): + warnings.warn(f"Не предусмотрено режима '{self.mode}' для группового разделения. 
" + f"Был использован режим 'stratification'.") + self.mode = "simple" + + if self.mode == "simple": + t_result = self.__simple_mode(gd, random_state) + result["test_indexes"] += t_result["test_indexes"] + result["control_indexes"] += t_result["control_indexes"] + + elif self.mode == "balanced": + if self.quant_field: + random_ids = shuffle(gd[self.quant_field].unique(), random_state=random_state) + addition_indexes = list(gd[gd[self.quant_field].isin(random_ids)].index) + else: + addition_indexes = list(shuffle(gd.index, random_state=random_state)) + + if len(result["control_indexes"]) > len(result["test_indexes"]): + result["test_indexes"] += addition_indexes + else: + result["control_indexes"] += addition_indexes + + else: + if self.mode != "simple": + warnings.warn(f"Не предусмотрено режима '{self.mode}' для обычного разделения. " + f"Был использован режим 'simple'.") + + t_result = self.__simple_mode(data, random_state) + result["test_indexes"] = t_result["test_indexes"] + result["control_indexes"] = t_result["control_indexes"] + + result["test_indexes"] = list(set(result["test_indexes"])) + result["control_indexes"] = list(set(result["test_indexes"])) + return result + + def search_dist_uniform_sampling( + self, + data, + target_fields: Union[List[str], str], + n: int = None, + random_states: Iterable[int] = None, + alpha: float = 0.05, + file_name: Union[Path, str] = None, + write_mode: str = "full", + write_step: int = 10, + pbar: bool = True, + ) -> Optional[pd.DataFrame]: + """ + Подбирает random_state для поиска однородного распределения + + :param target_field: поле с целевым значением + :param n: количество итераций поиска + :param random_states: случайные состояния по которым проводится поиск (альтернатива n, если введено, то n игнорируется) + :param alpha: порог для проверки статистических гипотез + :param file_name: имя файла, в котором будет сохраняться результат (если не заполнено, функция вернет результат, а не будет сохранять его в файл) + 
:param write_mode: режим записи. Поддерживаются следующие: + 'full' - записывает все эксперименты + 'all' - записывает те эксперименты, которые прошли все статистические тесты + 'any' - записывает те эксперименты, которые прошли любой из статистических тестов + :param write_step: шаг записи экспериментов в файл (если не указано, пошаговая запись не используется) + :param pbar: отображать ли progress bar + :return: DataFrame, если не использовалась пошаговая запись, иначе None + """ + if random_states is None and n: + random_states = range(n) + + results = [] + + if write_mode not in ("full", "all", "any"): + warnings.warn( + f"Режим записи '{write_mode}' не поддерживается. Будет использован режим 'full'" + ) + write_mode = "full" + + if isinstance(target_fields, str): + target_fields = [target_fields] + + for i, random_state in tqdm(enumerate(random_states), total=len(random_states), display=pbar): + split = self.split_ab(data, random_state) + t_result = { + "random_state": random_state + } + a = data.loc[split["test_indexes"]] + b = data.loc[split["control_indexes"]] + scores = [] + passed = [] + + for tf in target_fields: + ta = a[tf] + tb = b[tf] + + t_result[f"{tf} a mean"] = ta.mean() + t_result[f"{tf} b mean"] = tb.mean() + t_result[f"{tf} ab mran delta %"] = (1 - t_result[f"{tf} a mean"] / t_result[f"{tf} b mean"]) * 100 + t_result[f"{tf} t_test p_value"] = ttest_ind(ta, tb).pvalue + t_result[f"{tf} ks_test p_value"] = ks_2samp(ta, tb).pvalue + t_result[f"{tf} t_test passed"] = t_result[f"{tf} t_test p_value"] > alpha + t_result[f"{tf} ks_test passed"] = t_result[f"{tf} ks_test p_value"] > alpha + scores.append((t_result[f"{tf} t_test p_value"] + t_result[f"{tf} ks_test p_value"]) / 2) + passed += [t_result[f"{tf} t_test passed"], t_result[f"{tf} ks_test passed"]] + + t_result["score"] = np.mean(scores) + + if write_mode == "all" and all(passed): + results.append(t_result) + if write_mode == "any" and any(passed): + results.append(t_result) + if write_mode 
== "full": + results.append(t_result) + + if file_name and write_step: + if i == write_step: + pd.DataFrame(results).to_csv(file_name, index=False) + elif i % write_step == 0: + pd.DataFrame(results).to_csv( + file_name, index=False, header=False, mode="a" + ) + results = [] + if file_name and write_step: + pd.DataFrame(results).to_csv(file_name, index=False, header=False, mode="a") + elif file_name: + results = pd.DataFrame(results) + results.to_csv(file_name, index=False) + return results + else: + return pd.DataFrame(results) + + +class ABExperiment(ABC): + """ + Абстрактный класс A|B эксперимента + """ + + def __init__(self, label: str): + self.label = label + + @abstractmethod + def calc_effect( + self, test_data: pd.DataFrame, control_data: pd.DataFrame, target_field: str + ) -> float: + pass + + +class ABTester: + DEFAULT_FORMAT_MAPPING = { + "rs": "random state", + "mde": "MDE", + "sample_size": "Размер выборки для тестирования", + "a_len": "Размер целевой группы", + "b_len": "Размер контрольной группы", + "a_mean": "Среднее целевой группы", + "b_mean": "Среднее контрольной группы", + } + + def __init__( + self, + splitter: ABSplitter, + target_field: str, + reliability=0.95, + power=0.8, + mde=None, + ): + """ + :param splitter: класс разделителя на A|B + :param target_field: поле с целевыми значениями + :param reliability: уровень статистической достоверности. Обычно равен 0.95 + :param power: мощность статистического критерия. 
Обычно равен 0.8 + :param mde: предпосчитанный mde, если None, то считается + """ + self.splitter = splitter + self.target_field = target_field + self.reliability = reliability + self.power = power + self.mde = mde + + def sampling_test( + self, + data, + experiments: Union[ABExperiment, Iterable[ABExperiment]], + random_state: int = None, + ) -> Dict: + """ + Тест на определенном разбиении + :param experiments: эксперимент или набор экспериментов, проводимых на разбиении + :random_state: seed рандома + + :return: dict с результатами теста + """ + + split = self.splitter.split_ab(data, random_state) + if isinstance(experiments, ABExperiment): + experiments = [experiments] + + mde = self.mde or calc_mde( + data.loc[split['test'], self.target_field], + data.loc[split['control'], self.target_field], + reliability=self.reliability, + power=self.power, + ) + sample_size = calc_sample_size( + data.loc[split['test'], self.target_field], + data.loc[split['control'], self.target_field], + mde, + significance=(1 - self.reliability), + power=self.power, + ) + + result = { + "rs": random_state, + "mde": mde, + "sample_size": sample_size, + "a_len": len(split['test']), + "b_len": len(split['control']), + "a_mean": data.loc[split['test'], self.target_field].mean(), + "b_mean": data.loc[split['control'], self.target_field].mean(), + } + + for e in experiments: + result[f"effect {e.label}"] = e.calc_effect( + data.loc[split['test']], data.loc[split['control']], self.target_field + ) + + return result + + def multisampling_test( + self, + data, + experiments: Union[ABExperiment, Iterable[ABExperiment]], + random_states: Iterable[int], + pbar: bool = False, + ) -> tuple[pd.DataFrame, pd.DataFrame]: + """ + Проводит множественные эксперименты по случайным состояниям + :param experiments: набор экспериментов, проводимых на разбиении + :param random_states: случайные состояния + :param pbar: активация прогресс бара + :return: статистики экспериментов + """ + + results = pd.DataFrame( + 
[ + self.sampling_test(data, experiments, rs) + for rs in tqdm(random_states, display=pbar) + ] + ) + + stats = results.describe() + stats.loc["cv %"] = (stats.loc["std"] / stats.loc["mean"] * 100).round(2) + return results, stats + + def format_stat( + self, + stat: pd.DataFrame, + experiments: Union[ABExperiment, Iterable[ABExperiment]], + rename_map: Dict = None, + ): + """ + Редактирует формат вывода статистик + + :param stat: статистики экспериментов + :param experiments: набор экспериментов, проводимых на разбиении + :param rename_map: маппинг переименования полей + + :return: форматирует датафрейм со статистиками + """ + rename_map = rename_map or self.DEFAULT_FORMAT_MAPPING + + rename_map.update( + {f"effect {e.label}": f"Эффект {e.label}" for e in experiments} + ) + + result = stat.rename(columns=rename_map) + result = result.applymap(lambda x: f"{x:,.2f}") + return result From 4f3cdb1a9023062a8c3fa1c760b80929c316ead6 Mon Sep 17 00:00:00 2001 From: 20730989 Date: Tue, 29 Aug 2023 13:14:56 +0300 Subject: [PATCH 02/20] Additional method mde calculation. 
--- lightautoml/addons/hypex/ab_spliter.py | 161 ++++++++++--------------- 1 file changed, 65 insertions(+), 96 deletions(-) diff --git a/lightautoml/addons/hypex/ab_spliter.py b/lightautoml/addons/hypex/ab_spliter.py index 13d94028..e01c574a 100644 --- a/lightautoml/addons/hypex/ab_spliter.py +++ b/lightautoml/addons/hypex/ab_spliter.py @@ -13,16 +13,11 @@ from scipy.stats import norm, ttest_ind, ks_2samp -def calc_mde( - test_group: pd.Series, - control_group: pd.Series, - reliability: float = 0.95, - power: float = 0.8, -) -> float: +# Методика рачета описана в Campaign Perfomance Management – Методика A/B-тестирования v.2.0 (стр.29) +def calc_mde(test_group: pd.Series, control_group: pd.Series, reliability: float = 0.95, power: float = 0.8,) -> float: """ Minimum detectable effect Минимальный эффект, который можно статистически обосновать при сравнении двух групп - Методика рачета описана в Campaign Perfomance Management – Методика A/B-тестирования v.2.0 (стр.29) :param test_group: целевая группа :param control_group: контрольная группа @@ -43,11 +38,7 @@ def calc_mde( def calc_sample_size( - test_group: pd.Series, - control_group: pd.Series, - mde, - significance: float = 0.05, - power: float = 0.8, + test_group: pd.Series, control_group: pd.Series, mde, significance: float = 0.05, power: float = 0.8, ) -> float: """ Минимально требуемуе количество объектов тестирования в общей группе @@ -60,17 +51,26 @@ def calc_sample_size( :param power: мощность статистического критерия :return: минимальный размер общей группы """ - test_std = test_group.std() - control_std = control_group.std() + if isinstance(mde, Iterable): + z_alpha = norm.ppf((2 - significance) / 2) + z_betta = norm.ppf(power) + + p1 = mde[0] + p2 = mde[1] + + return (z_alpha + z_betta) ** 2 * (p1 * (1 - p1) + p2 * (1 - p2)) / (p1 - p2) ** 2 + else: + test_std = test_group.std() + control_std = control_group.std() - test_proportion = len(test_group) / (len(test_group) + len(control_group)) - 
control_proportion = 1 - test_proportion + test_proportion = len(test_group) / (len(test_group) + len(control_group)) + control_proportion = 1 - test_proportion - d = ((norm.ppf(1 - significance / 2) + norm.ppf(power)) / mde) ** 2 + d = ((norm.ppf(1 - significance / 2) + norm.ppf(power)) / mde) ** 2 - s = test_std ** 2 / test_proportion + control_std ** 2 / control_proportion + s = test_std ** 2 / test_proportion + control_std ** 2 / control_proportion - return d * s + return d * s # --------------------- Classes --------------------- @@ -86,18 +86,15 @@ def __init__(self, mode="simple", by_group=None, quant_field=None): @staticmethod def merge_groups( - test_group: Union[Iterable[pd.DataFrame], pd.DataFrame], - control_group: Union[Iterable[pd.DataFrame], pd.DataFrame], + test_group: Union[Iterable[pd.DataFrame], pd.DataFrame], + control_group: Union[Iterable[pd.DataFrame], pd.DataFrame], ): """ Объединяет test и control в один df :return: объединенный датафрейм """ - if not ( - isinstance(test_group, pd.DataFrame) - and isinstance(test_group, pd.DataFrame) - ): + if not (isinstance(test_group, pd.DataFrame) and isinstance(test_group, pd.DataFrame)): test_group = pd.concat(test_group, ignore_index=True) control_group = pd.concat(control_group, ignore_index=True) @@ -106,10 +103,7 @@ def merge_groups( return pd.concat([test_group, control_group], ignore_index=True) def __simple_mode(self, data, random_state): - result = { - "test_indexes": [], - "control_indexes": [] - } + result = {"test_indexes": [], "control_indexes": []} if self.quant_field: random_ids = shuffle(data[self.quant_field].unique(), random_state=random_state) @@ -126,17 +120,16 @@ def __simple_mode(self, data, random_state): return result def split_ab(self, data, random_state: int = None) -> Dict: - result = { - "test_indexes": [], - "control_indexes": [] - } + result = {"test_indexes": [], "control_indexes": []} if self.by_group: groups = data.groupby() for _, gd in groups: if self.mode not in 
("balanced", "simple"): - warnings.warn(f"Не предусмотрено режима '{self.mode}' для группового разделения. " - f"Был использован режим 'stratification'.") + warnings.warn( + f"Не предусмотрено режима '{self.mode}' для группового разделения. " + f"Был использован режим 'stratification'." + ) self.mode = "simple" if self.mode == "simple": @@ -158,8 +151,10 @@ def split_ab(self, data, random_state: int = None) -> Dict: else: if self.mode != "simple": - warnings.warn(f"Не предусмотрено режима '{self.mode}' для обычного разделения. " - f"Был использован режим 'simple'.") + warnings.warn( + f"Не предусмотрено режима '{self.mode}' для обычного разделения. " + f"Был использован режим 'simple'." + ) t_result = self.__simple_mode(data, random_state) result["test_indexes"] = t_result["test_indexes"] @@ -170,16 +165,16 @@ def split_ab(self, data, random_state: int = None) -> Dict: return result def search_dist_uniform_sampling( - self, - data, - target_fields: Union[List[str], str], - n: int = None, - random_states: Iterable[int] = None, - alpha: float = 0.05, - file_name: Union[Path, str] = None, - write_mode: str = "full", - write_step: int = 10, - pbar: bool = True, + self, + data, + target_fields: Union[List[str], str], + n: int = None, + random_states: Iterable[int] = None, + alpha: float = 0.05, + file_name: Union[Path, str] = None, + write_mode: str = "full", + write_step: int = 10, + pbar: bool = True, ) -> Optional[pd.DataFrame]: """ Подбирает random_state для поиска однородного распределения @@ -203,9 +198,7 @@ def search_dist_uniform_sampling( results = [] if write_mode not in ("full", "all", "any"): - warnings.warn( - f"Режим записи '{write_mode}' не поддерживается. Будет использован режим 'full'" - ) + warnings.warn(f"Режим записи '{write_mode}' не поддерживается. 
Будет использован режим 'full'") write_mode = "full" if isinstance(target_fields, str): @@ -213,9 +206,7 @@ def search_dist_uniform_sampling( for i, random_state in tqdm(enumerate(random_states), total=len(random_states), display=pbar): split = self.split_ab(data, random_state) - t_result = { - "random_state": random_state - } + t_result = {"random_state": random_state} a = data.loc[split["test_indexes"]] b = data.loc[split["control_indexes"]] scores = [] @@ -248,9 +239,7 @@ def search_dist_uniform_sampling( if i == write_step: pd.DataFrame(results).to_csv(file_name, index=False) elif i % write_step == 0: - pd.DataFrame(results).to_csv( - file_name, index=False, header=False, mode="a" - ) + pd.DataFrame(results).to_csv(file_name, index=False, header=False, mode="a") results = [] if file_name and write_step: pd.DataFrame(results).to_csv(file_name, index=False, header=False, mode="a") @@ -271,9 +260,7 @@ def __init__(self, label: str): self.label = label @abstractmethod - def calc_effect( - self, test_data: pd.DataFrame, control_data: pd.DataFrame, target_field: str - ) -> float: + def calc_effect(self, test_data: pd.DataFrame, control_data: pd.DataFrame, target_field: str) -> float: pass @@ -289,12 +276,7 @@ class ABTester: } def __init__( - self, - splitter: ABSplitter, - target_field: str, - reliability=0.95, - power=0.8, - mde=None, + self, splitter: ABSplitter, target_field: str, reliability=0.95, power=0.8, mde=None, ): """ :param splitter: класс разделителя на A|B @@ -310,10 +292,7 @@ def __init__( self.mde = mde def sampling_test( - self, - data, - experiments: Union[ABExperiment, Iterable[ABExperiment]], - random_state: int = None, + self, data, experiments: Union[ABExperiment, Iterable[ABExperiment]], random_state: int = None, ) -> Dict: """ Тест на определенном разбиении @@ -328,14 +307,14 @@ def sampling_test( experiments = [experiments] mde = self.mde or calc_mde( - data.loc[split['test'], self.target_field], - data.loc[split['control'], 
self.target_field], + data.loc[split["test"], self.target_field], + data.loc[split["control"], self.target_field], reliability=self.reliability, power=self.power, ) sample_size = calc_sample_size( - data.loc[split['test'], self.target_field], - data.loc[split['control'], self.target_field], + data.loc[split["test"], self.target_field], + data.loc[split["control"], self.target_field], mde, significance=(1 - self.reliability), power=self.power, @@ -345,25 +324,25 @@ def sampling_test( "rs": random_state, "mde": mde, "sample_size": sample_size, - "a_len": len(split['test']), - "b_len": len(split['control']), - "a_mean": data.loc[split['test'], self.target_field].mean(), - "b_mean": data.loc[split['control'], self.target_field].mean(), + "a_len": len(split["test"]), + "b_len": len(split["control"]), + "a_mean": data.loc[split["test"], self.target_field].mean(), + "b_mean": data.loc[split["control"], self.target_field].mean(), } for e in experiments: result[f"effect {e.label}"] = e.calc_effect( - data.loc[split['test']], data.loc[split['control']], self.target_field + data.loc[split["test"]], data.loc[split["control"]], self.target_field ) return result def multisampling_test( - self, - data, - experiments: Union[ABExperiment, Iterable[ABExperiment]], - random_states: Iterable[int], - pbar: bool = False, + self, + data, + experiments: Union[ABExperiment, Iterable[ABExperiment]], + random_states: Iterable[int], + pbar: bool = False, ) -> tuple[pd.DataFrame, pd.DataFrame]: """ Проводит множественные эксперименты по случайным состояниям @@ -373,22 +352,14 @@ def multisampling_test( :return: статистики экспериментов """ - results = pd.DataFrame( - [ - self.sampling_test(data, experiments, rs) - for rs in tqdm(random_states, display=pbar) - ] - ) + results = pd.DataFrame([self.sampling_test(data, experiments, rs) for rs in tqdm(random_states, display=pbar)]) stats = results.describe() stats.loc["cv %"] = (stats.loc["std"] / stats.loc["mean"] * 100).round(2) return results, 
stats def format_stat( - self, - stat: pd.DataFrame, - experiments: Union[ABExperiment, Iterable[ABExperiment]], - rename_map: Dict = None, + self, stat: pd.DataFrame, experiments: Union[ABExperiment, Iterable[ABExperiment]], rename_map: Dict = None, ): """ Редактирует формат вывода статистик @@ -401,9 +372,7 @@ def format_stat( """ rename_map = rename_map or self.DEFAULT_FORMAT_MAPPING - rename_map.update( - {f"effect {e.label}": f"Эффект {e.label}" for e in experiments} - ) + rename_map.update({f"effect {e.label}": f"Эффект {e.label}" for e in experiments}) result = stat.rename(columns=rename_map) result = result.applymap(lambda x: f"{x:,.2f}") From 05938ef75cad173c3753f5ea36c44f636c91a8f2 Mon Sep 17 00:00:00 2001 From: 20810012 Date: Mon, 4 Sep 2023 12:10:42 +0300 Subject: [PATCH 03/20] docs added --- lightautoml/addons/hypex/ab_spliter.py | 224 ++++++++++++++++--------- 1 file changed, 149 insertions(+), 75 deletions(-) diff --git a/lightautoml/addons/hypex/ab_spliter.py b/lightautoml/addons/hypex/ab_spliter.py index e01c574a..a218462c 100644 --- a/lightautoml/addons/hypex/ab_spliter.py +++ b/lightautoml/addons/hypex/ab_spliter.py @@ -15,15 +15,25 @@ # Методика рачета описана в Campaign Perfomance Management – Методика A/B-тестирования v.2.0 (стр.29) def calc_mde(test_group: pd.Series, control_group: pd.Series, reliability: float = 0.95, power: float = 0.8,) -> float: - """ - Minimum detectable effect - Минимальный эффект, который можно статистически обосновать при сравнении двух групп - - :param test_group: целевая группа - :param control_group: контрольная группа - :param reliability: уровень статистической достоверности. Обычно равен 0.95 - :param power: мощность статистического критерия - :return: минимально детерминируемый эффект + """Calculates MDE (Minimum Detectable Effect). 
+ + MDE - minimal effect that can be statistically substantiated comparing the two groups + Calculation method is described in "Campaign Performance Management" - A/B-testing methodology v.2.0 (p.29) + + Args: + test_group: pd.Series + Target group + control_group: pd.Series + Control group + reliability: float + Level of statistical reliability, usually equals 0.95 + power: float + Statistical criterion power + + Returns: + mde: float + Minimum detectable effect + """ m = stats.norm.ppf((1 + reliability) / 2) + stats.norm.ppf(power) @@ -40,16 +50,26 @@ def calc_mde(test_group: pd.Series, control_group: pd.Series, reliability: float def calc_sample_size( test_group: pd.Series, control_group: pd.Series, mde, significance: float = 0.05, power: float = 0.8, ) -> float: - """ - Минимально требуемуе количество объектов тестирования в общей группе - Методика расчета описана в Campaign Perfomance Management – Методика A/B-тестирования v.2.0 (стр.14) - - :param test_group: целевая группа - :param control_group: контрольная группа - :param mde: минимальное детерменируемое значение - :param significance: уровень статистической значимости - вероятность ошибки первого рода (обычно 0.05) - :param power: мощность статистического критерия - :return: минимальный размер общей группы + """Calculates minimal required number of test objects for test in the general group. 
+ + Calculation method is described in "Campaign Performance Management" - A/B-testing methodology v.2.0 (p.14) + + Args: + test_group: pd.Series + Target group + control_group: pd.Series + Control group + mde: float + Minimal detectable effect + significance: float + Statistical significance level - type I error probability (usually 0.05) + power: float + Statistical criterion power + + Returns: + min_sample_size: float + Minimal size of the general group + """ if isinstance(mde, Iterable): z_alpha = norm.ppf((2 - significance) / 2) @@ -75,9 +95,7 @@ def calc_sample_size( # --------------------- Classes --------------------- class ABSplitter: - """ - Класс разделителя на A|B группы - """ + """Abstract class - divider on A and B groups.""" def __init__(self, mode="simple", by_group=None, quant_field=None): self.mode = mode @@ -89,10 +107,18 @@ def merge_groups( test_group: Union[Iterable[pd.DataFrame], pd.DataFrame], control_group: Union[Iterable[pd.DataFrame], pd.DataFrame], ): - """ - Объединяет test и control в один df + """Merges test and control groups in one DataFrame. + + Args: + test_group: pd.DataFrame + Data of target group + control_group: pd.DataFrame + Data of control group + + Returns: + merged_data: pd.DataFrame + Concatted DataFrame - :return: объединенный датафрейм """ if not (isinstance(test_group, pd.DataFrame) and isinstance(test_group, pd.DataFrame)): test_group = pd.concat(test_group, ignore_index=True) @@ -127,8 +153,8 @@ def split_ab(self, data, random_state: int = None) -> Dict: for _, gd in groups: if self.mode not in ("balanced", "simple"): warnings.warn( - f"Не предусмотрено режима '{self.mode}' для группового разделения. " - f"Был использован режим 'stratification'." + f"The mode '{self.mode}' is not supported for group division. " + f"Implemented mode 'stratification'." 
) self.mode = "simple" @@ -152,8 +178,8 @@ def split_ab(self, data, random_state: int = None) -> Dict: else: if self.mode != "simple": warnings.warn( - f"Не предусмотрено режима '{self.mode}' для обычного разделения. " - f"Был использован режим 'simple'." + f"The mode '{self.mode}' is not supported for regular division. " + f"Implemented mode 'simple'." ) t_result = self.__simple_mode(data, random_state) @@ -176,21 +202,36 @@ def search_dist_uniform_sampling( write_step: int = 10, pbar: bool = True, ) -> Optional[pd.DataFrame]: - """ - Подбирает random_state для поиска однородного распределения - - :param target_field: поле с целевым значением - :param n: количество итераций поиска - :param random_states: случайные состояния по которым проводится поиск (альтернатива n, если введено, то n игнорируется) - :param alpha: порог для проверки статистических гипотез - :param file_name: имя файла, в котором будет сохраняться результат (если не заполнено, функция вернет результат, а не будет сохранять его в файл) - :param write_mode: режим записи. Поддерживаются следующие: - 'full' - записывает все эксперименты - 'all' - записывает те эксперименты, которые прошли все статистические тесты - 'any' - записывает те эксперименты, которые прошли любой из статистических тестов - :param write_step: шаг записи экспериментов в файл (если не указано, пошаговая запись не используется) - :param pbar: отображать ли progress bar - :return: DataFrame, если не использовалась пошаговая запись, иначе None + """Chooses random_state for finding homogeneous distribution. 
+ + Args: + data: pd.DataFrame + Input data + target_fields: str or list[str] + Field with target value + n: int + Number of searching iterations + random_states: Iterable + Random states from searching (if given, n is ignoring) + alpha: float, default = 0.05 + Threshold to check statistical hypothesis; usually 0.05 + file_name: str or Path + Name of file to save results (if None - no results will be saved, + func returns result) + write_mode: str, default = 'full' + Mode to write: + 'full' - save all experiments + 'all' - save experiments that passed all statistical tests + 'any' - save experiments that passed any statistical test + write_step: int, default = 10 + Step to write experiments to file + pbar: bool, default = True + Flag to show progress bar + + Returns: + results: pd.DataFrame or None + If no saving (no file_name, no write mode and no write_step) returns dataframe + else None and saves file to csv """ if random_states is None and n: random_states = range(n) @@ -198,7 +239,9 @@ def search_dist_uniform_sampling( results = [] if write_mode not in ("full", "all", "any"): - warnings.warn(f"Режим записи '{write_mode}' не поддерживается. Будет использован режим 'full'") + warnings.warn( + f"Write mode '{write_mode}' is not supported. 
Mode 'full' will be used" + ) write_mode = "full" if isinstance(target_fields, str): @@ -252,9 +295,7 @@ def search_dist_uniform_sampling( class ABExperiment(ABC): - """ - Абстрактный класс A|B эксперимента - """ + """Abstract class of A/B experiment.""" def __init__(self, label: str): self.label = label @@ -268,22 +309,29 @@ class ABTester: DEFAULT_FORMAT_MAPPING = { "rs": "random state", "mde": "MDE", - "sample_size": "Размер выборки для тестирования", - "a_len": "Размер целевой группы", - "b_len": "Размер контрольной группы", - "a_mean": "Среднее целевой группы", - "b_mean": "Среднее контрольной группы", + "sample_size": "Size of test sample", + "a_len": "Size of target group", + "b_len": "Size of control group", + "a_mean": "Mean of target group", + "b_mean": "Mean of control group", } def __init__( self, splitter: ABSplitter, target_field: str, reliability=0.95, power=0.8, mde=None, ): """ - :param splitter: класс разделителя на A|B - :param target_field: поле с целевыми значениями - :param reliability: уровень статистической достоверности. Обычно равен 0.95 - :param power: мощность статистического критерия. 
Обычно равен 0.8 - :param mde: предпосчитанный mde, если None, то считается + Args: + splitter: ABSplitter + Class of divider on A and B groups + target_field: str + Field with target values + reliability: float, default = 0.95 + Level of statistical reliability, usually equals 0.95 + power: float, default = 0.8 + Statistical criterion power, usually equals 0.8 + mde: + Calculated mde (minimal detected effect), + if none - calculates inside """ self.splitter = splitter self.target_field = target_field @@ -294,12 +342,20 @@ def __init__( def sampling_test( self, data, experiments: Union[ABExperiment, Iterable[ABExperiment]], random_state: int = None, ) -> Dict: - """ - Тест на определенном разбиении - :param experiments: эксперимент или набор экспериментов, проводимых на разбиении - :random_state: seed рандома + """Test on specific sample + + Args: + data: pd.DataFrame + Input data + experiments: + Experiment or set of experiments applied on sample + random_state: int + Seed of random + + Returns: + result: dict + Test results - :return: dict с результатами теста """ split = self.splitter.split_ab(data, random_state) @@ -344,12 +400,24 @@ def multisampling_test( random_states: Iterable[int], pbar: bool = False, ) -> tuple[pd.DataFrame, pd.DataFrame]: - """ - Проводит множественные эксперименты по случайным состояниям - :param experiments: набор экспериментов, проводимых на разбиении - :param random_states: случайные состояния - :param pbar: активация прогресс бара - :return: статистики экспериментов + """Implements multiple experiments on random states. 
def calc_mde(
    test_group: pd.Series,
    control_group: pd.Series,
    reliability: float = 0.95,
    power: float = 0.8,
) -> float:
    """Calculates the minimum detectable effect (MDE) for two groups.

    The smallest effect that can be statistically justified when comparing the
    two samples. Calculation method is described in "Campaign Performance
    Management" - A/B-testing methodology v.2.0 (p.29).

    Args:
        test_group: Target group
        control_group: Control group
        reliability: Level of statistical reliability, usually equals 0.95
        power: Statistical criterion power

    Returns:
        mde: Minimum detectable effect
    """
    # Two-sided critical value plus the power quantile.
    quantile_sum = norm.ppf((1 + reliability) / 2) + norm.ppf(power)

    size_test, size_control = len(test_group), len(control_group)
    share_test = size_test / (size_test + size_control)
    # Penalty for unbalanced group sizes (minimal at a 50/50 split).
    imbalance_term = np.sqrt(1 / (share_test * (1 - share_test)))

    # Standard error of the difference in means (unbiased variances).
    pooled_se = np.sqrt(
        test_group.var(ddof=1) / size_test + control_group.var(ddof=1) / size_control
    )

    return quantile_sum * imbalance_term * pooled_se


def calc_sample_size(
    test_group: pd.Series,
    control_group: pd.Series,
    mde: Union[Iterable[float], float],
    significance: float = 0.05,
    power: float = 0.8,
) -> float:
    """Calculates minimal required number of test objects for the general group.

    Calculation method is described in "Campaign Performance Management" -
    A/B-testing methodology v.2.0 (p.14).

    Args:
        test_group: Target group
        control_group: Control group
        mde: Minimal detectable effect; either a scalar effect size or a pair
            of proportions ``(p1, p2)`` for the two-proportion formula
        significance: Statistical significance level - type I error probability (usually 0.05)
        power: Statistical criterion power

    Returns:
        min_sample_size: Minimal size of the general group
    """
    if isinstance(mde, Iterable):
        # mde is a pair of expected proportions: two-proportion sample size.
        p1, p2 = mde[0], mde[1]
        z_total = norm.ppf((2 - significance) / 2) + norm.ppf(power)
        return z_total ** 2 * (p1 * (1 - p1) + p2 * (1 - p2)) / (p1 - p2) ** 2

    share_test = len(test_group) / (len(test_group) + len(control_group))
    share_control = 1 - share_test

    effect_term = ((norm.ppf(1 - significance / 2) + norm.ppf(power)) / mde) ** 2
    variance_term = (
        test_group.std() ** 2 / share_test + control_group.std() ** 2 / share_control
    )

    return effect_term * variance_term
Args: - test_group: pd.DataFrame - Data of target group - control_group: pd.DataFrame - Data of control group + test_group: Data of target group + control_group: Data of control group Returns: - merged_data: pd.DataFrame - Concatted DataFrame - + merged_data: Concatted DataFrame """ if not (isinstance(test_group, pd.DataFrame) and isinstance(test_group, pd.DataFrame)): test_group = pd.concat(test_group, ignore_index=True) @@ -126,9 +113,12 @@ def merge_groups( test_group.loc[:, "group"] = "test" control_group.loc[:, "group"] = "control" - return pd.concat([test_group, control_group], ignore_index=True) - def __simple_mode(self, data, random_state): + merged_data = pd.concat([test_group, control_group], ignore_index=True) + + return merged_data + + def __simple_mode(self, data: pd.DataFrame, random_state: int = None): result = {"test_indexes": [], "control_indexes": []} if self.quant_field: @@ -145,7 +135,16 @@ def __simple_mode(self, data, random_state): return result - def split_ab(self, data, random_state: int = None) -> Dict: + def split_ab(self, data: pd.DataFrame, random_state: int = None) -> Dict: + """Divides sample on two groups. + + Args: + data: input data + random_state: one integer to fix split + + Returns: + result: dict of indexes with division on test and control group + """ result = {"test_indexes": [], "control_indexes": []} if self.by_group: @@ -188,11 +187,12 @@ def split_ab(self, data, random_state: int = None) -> Dict: result["test_indexes"] = list(set(result["test_indexes"])) result["control_indexes"] = list(set(result["test_indexes"])) + return result def search_dist_uniform_sampling( self, - data, + data: pd.DataFrame, target_fields: Union[List[str], str], n: int = None, random_states: Iterable[int] = None, @@ -205,31 +205,30 @@ def search_dist_uniform_sampling( """Chooses random_state for finding homogeneous distribution. 
Args: - data: pd.DataFrame + data: Input data - target_fields: str or list[str] + target_fields: Field with target value - n: int + n: Number of searching iterations - random_states: Iterable + random_states: Random states from searching (if given, n is ignoring) - alpha: float, default = 0.05 + alpha: Threshold to check statistical hypothesis; usually 0.05 - file_name: str or Path - Name of file to save results (if None - no results will be saved, - func returns result) - write_mode: str, default = 'full' + file_name: + Name of file to save results (if None - no results will be saved, func returns result) + write_mode: Mode to write: 'full' - save all experiments 'all' - save experiments that passed all statistical tests 'any' - save experiments that passed any statistical test - write_step: int, default = 10 + write_step: Step to write experiments to file - pbar: bool, default = True + pbar: Flag to show progress bar Returns: - results: pd.DataFrame or None + results: If no saving (no file_name, no write mode and no write_step) returns dataframe else None and saves file to csv """ @@ -284,6 +283,7 @@ def search_dist_uniform_sampling( elif i % write_step == 0: pd.DataFrame(results).to_csv(file_name, index=False, header=False, mode="a") results = [] + if file_name and write_step: pd.DataFrame(results).to_csv(file_name, index=False, header=False, mode="a") elif file_name: @@ -321,13 +321,13 @@ def __init__( ): """ Args: - splitter: ABSplitter + splitter: Class of divider on A and B groups - target_field: str + target_field: Field with target values - reliability: float, default = 0.95 + reliability: Level of statistical reliability, usually equals 0.95 - power: float, default = 0.8 + power: Statistical criterion power, usually equals 0.8 mde: Calculated mde (minimal detected effect), @@ -340,22 +340,17 @@ def __init__( self.mde = mde def sampling_test( - self, data, experiments: Union[ABExperiment, Iterable[ABExperiment]], random_state: int = None, + self, data: 
pd.DataFrame, experiments: Union[ABExperiment, Iterable[ABExperiment]], random_state: int = None, ) -> Dict: - """Test on specific sample + """Test on specific sample. Args: - data: pd.DataFrame - Input data - experiments: - Experiment or set of experiments applied on sample - random_state: int - Seed of random + data: Input data + experiments: Experiment or set of experiments applied on sample + random_state: Seed of random Returns: - result: dict - Test results - + result: Test results """ split = self.splitter.split_ab(data, random_state) @@ -395,7 +390,7 @@ def sampling_test( def multisampling_test( self, - data, + data: pd.DataFrame, experiments: Union[ABExperiment, Iterable[ABExperiment]], random_states: Iterable[int], pbar: bool = False, @@ -403,50 +398,44 @@ def multisampling_test( """Implements multiple experiments on random states. Args: - data: pd.DataFrame + data: Input data experiments: Set of experiments applied on sample - random_states: Iterable[int] - Random_states - pbar: bool, default = False + random_states: + Seeds of random + pbar: Flag to show progress bar Returns: - results: pd.DataFrame + results: Experiment test results - stats: pd.DataFrame + statistics: Description statistics - """ results = pd.DataFrame([self.sampling_test(data, experiments, rs) for rs in tqdm(random_states, display=pbar)]) - stats = results.describe() - stats.loc["cv %"] = (stats.loc["std"] / stats.loc["mean"] * 100).round(2) - return results, stats + statistics = results.describe() + statistics.loc["cv %"] = (statistics.loc["std"] / statistics.loc["mean"] * 100).round(2) + return results, statistics def format_stat( self, stat: pd.DataFrame, experiments: Union[ABExperiment, Iterable[ABExperiment]], rename_map: Dict = None, ): - """Corrects format of output statistics + """Corrects format of output statistics. 
Args: - stat: pd.DataFrame - Experiment statistics - experiments: - Set of experiments applied on sample - rename_map: dict - Mapping of renaming fields + stat: Experiment statistics + experiments: Set of experiments applied on sample + rename_map: Mapping of renaming fields Returns: - result: pd.DataFrame - Formatted values - + result: Formatted values """ rename_map = rename_map or self.DEFAULT_FORMAT_MAPPING - rename_map.update({f"effect {e.label}": f"Эффект {e.label}" for e in experiments}) + rename_map.update({f"effect {e.label}": f"Effect {e.label}" for e in experiments}) result = stat.rename(columns=rename_map) result = result.applymap(lambda x: f"{x:,.2f}") From cf2cd5a40bfd798e670781f70cc8a075f763daae Mon Sep 17 00:00:00 2001 From: 20810012 Date: Wed, 6 Sep 2023 14:11:20 +0300 Subject: [PATCH 05/20] patly corrected - will be updated --- lightautoml/addons/hypex/ab_spliter.py | 31 +++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/lightautoml/addons/hypex/ab_spliter.py b/lightautoml/addons/hypex/ab_spliter.py index 643cd583..d3533b29 100644 --- a/lightautoml/addons/hypex/ab_spliter.py +++ b/lightautoml/addons/hypex/ab_spliter.py @@ -88,7 +88,19 @@ def calc_sample_size( class ABSplitter: """Abstract class - divider on A and B groups.""" - def __init__(self, mode="simple", by_group=None, quant_field=None): + def __init__(self, mode: str = "simple", by_group: Union[str, Iterable[str]] = None, quant_field: str = None): + """ + + Args: + mode: + Regime to divide sample on A and B samples: + 'simple' - divides by groups, placing equal amount of records (clients | groups of clients) in samples + 'balanced' - divides by size of samples, placing full groups depending on the size of group to balance size of A and B. 
Can not be applied without groups + by_group: + Name of field(s) for division by groups + quant_field: + Name of field by which division should take in account common features besides groups + """ self.mode = mode self.by_group = by_group self.quant_field = quant_field @@ -119,6 +131,17 @@ def merge_groups( return merged_data def __simple_mode(self, data: pd.DataFrame, random_state: int = None): + """Separates data on A and B samples. + + Separation performed to divide groups of equal sizes - equal amount of records or equal amount of groups in each sample + + Args: + data: Input data + random_state: Seed of random + + Returns: + result: Test and control samples of indexes dictionary + """ result = {"test_indexes": [], "control_indexes": []} if self.quant_field: @@ -153,7 +176,7 @@ def split_ab(self, data: pd.DataFrame, random_state: int = None) -> Dict: if self.mode not in ("balanced", "simple"): warnings.warn( f"The mode '{self.mode}' is not supported for group division. " - f"Implemented mode 'stratification'." + f"Implemented mode 'simple'." 
) self.mode = "simple" @@ -306,6 +329,8 @@ def calc_effect(self, test_data: pd.DataFrame, control_data: pd.DataFrame, targe class ABTester: + """Separates data to homogeneous groups and calculate metrics - sizes of groups and MDE.""" + DEFAULT_FORMAT_MAPPING = { "rs": "random state", "mde": "MDE", @@ -317,7 +342,7 @@ class ABTester: } def __init__( - self, splitter: ABSplitter, target_field: str, reliability=0.95, power=0.8, mde=None, + self, splitter: ABSplitter, target_field: str, reliability: float = 0.95, power: float = 0.8, mde: float = None, ): """ Args: From 51e76302049d0d19d38f4439c1717df37248daa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=93=D0=BB=D0=B0=D0=B7=D0=BE=D0=B2=D0=B0=20=D0=AF=D0=BD?= =?UTF-8?q?=D0=B0=20=D0=90=D0=BB=D0=B5=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?= =?UTF-8?q?=D0=BE=D0=B2=D0=BD=D0=B0?= Date: Wed, 20 Sep 2023 14:26:38 +0300 Subject: [PATCH 06/20] bug fix --- lightautoml/addons/hypex/algorithms/faiss_matcher.py | 4 ++-- lightautoml/addons/hypex/matcher.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lightautoml/addons/hypex/algorithms/faiss_matcher.py b/lightautoml/addons/hypex/algorithms/faiss_matcher.py index bcc207ea..95937e23 100644 --- a/lightautoml/addons/hypex/algorithms/faiss_matcher.py +++ b/lightautoml/addons/hypex/algorithms/faiss_matcher.py @@ -491,8 +491,8 @@ def matching_quality(self, df_matched) -> Dict[str, Union[Dict[str, float], floa else: logger.info("Estimating quality of matching") - psi_columns = self.columns_match - psi_columns.remove(self.treatment) + psi_columns = set(self.columns_match) + psi_columns = list(psi_columns - set([self.treatment] + self.outcomes)) psi_data, ks_data, smd_data = matching_quality( df_matched, self.treatment, sorted(self.features_quality), sorted(psi_columns), self.silent ) diff --git a/lightautoml/addons/hypex/matcher.py b/lightautoml/addons/hypex/matcher.py index 52feb909..f792eb2f 100644 --- a/lightautoml/addons/hypex/matcher.py +++ 
b/lightautoml/addons/hypex/matcher.py @@ -222,7 +222,7 @@ def _preprocessing_data(self): """Converts categorical features into dummy variables.""" info_col = self.info_col if self.info_col is not None else [] group_col = [self.group_col] if self.group_col is not None else [] - columns_to_drop = info_col + group_col + self.outcomes + columns_to_drop = info_col + group_col + self.outcomes + [self.treatment] if self.base_filtration: filtered_features = nan_filtration(self.input_data.drop(columns=columns_to_drop)) self.dropped_features = [f for f in self.input_data.columns if f not in filtered_features + columns_to_drop] From 0f45053ecb272306c48c664056bbb7ef4eb384f8 Mon Sep 17 00:00:00 2001 From: 20810012 Date: Wed, 20 Sep 2023 15:18:53 +0300 Subject: [PATCH 07/20] will be refactored and metrics added --- lightautoml/addons/hypex/ab_spliter.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lightautoml/addons/hypex/ab_spliter.py b/lightautoml/addons/hypex/ab_spliter.py index d3533b29..88e4c631 100644 --- a/lightautoml/addons/hypex/ab_spliter.py +++ b/lightautoml/addons/hypex/ab_spliter.py @@ -406,7 +406,14 @@ def sampling_test( "b_mean": data.loc[split["control"], self.target_field].mean(), } - for e in experiments: + for e in experiments: # как считается эффект написано в эксперименте, перенести в calc_effect + """ + сделать разницу средних в наследнике класса (новый класс создать) + на альфе в к7м ABTesting, IncraceExperiment + передается эксперимент, (надо встроить эксперимент сюда) + целевая картинка - передать данные и получить результат + сейчас надо вшить эксперимент из ноутбука сюда + """ result[f"effect {e.label}"] = e.calc_effect( data.loc[split["test"]], data.loc[split["control"]], self.target_field ) From d806abe213aed341afb32251e5887904283d4d6f Mon Sep 17 00:00:00 2001 From: 20810012 Date: Mon, 25 Sep 2023 11:08:29 +0300 Subject: [PATCH 08/20] AA-test done - will be done pytest. 
"""AA-test utilities: uniform A/B splitting and homogeneity search."""

import warnings
from pathlib import Path
from typing import Any, Dict, Iterable, Optional, Tuple, Union

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp, ttest_ind

try:
    # Progress bars are optional: degrade gracefully when tqdm is absent.
    from tqdm.auto import tqdm
except ImportError:  # pragma: no cover

    def tqdm(iterable, **kwargs):
        """Fallback no-op replacement for tqdm."""
        return iterable


RANDOM_STATE = 52


def merge_groups(
    test_group: Union[Iterable[pd.DataFrame], pd.DataFrame],
    control_group: Union[Iterable[pd.DataFrame], pd.DataFrame],
):
    """Merges test and control groups in one DataFrame and creates column "group".

    Column "group" contains of "test" and "control" values.

    Args:
        test_group: Data of target group
        control_group: Data of control group

    Returns:
        merged_data: Concatted DataFrame
    """
    # Accept iterables of frames, as the signature promises (the previous
    # version only handled plain DataFrames - the concat branch was commented out).
    if not isinstance(test_group, pd.DataFrame):
        test_group = pd.concat(test_group, ignore_index=True)
    if not isinstance(control_group, pd.DataFrame):
        control_group = pd.concat(control_group, ignore_index=True)

    test_group.loc[:, "group"] = "test"
    control_group.loc[:, "group"] = "control"

    return pd.concat([test_group, control_group], ignore_index=True)


class AATest:
    """Implements an AA-test: splits data into uniform groups and checks their homogeneity."""

    def __init__(
        self,
        data: pd.DataFrame,
        target_fields: Union[Iterable[str], str],
        info_cols: Union[Iterable[str], str] = None,
        mode: str = "simple",
        by_group: Union[str, Iterable[str]] = None,
        quant_field: str = None,
    ):
        """
        Args:
            data: Input data
            target_fields: Field(s) with target value
            info_cols: Informational column(s) excluded from the statistics
            mode:
                Regime to divide sample on A and B samples:
                'simple' - divides by groups, placing equal amount of records (clients | groups of clients) in samples
                'balanced' - divides by size of samples, placing full groups depending on the size of group to balance size of A and B. Can not be applied without groups
            by_group: Name of field(s) for division by groups
            quant_field: Name of field by which division should take in account common features besides groups
        """
        self.data = data
        self.init_data = data  # untouched copy of the input, used for user-facing output
        self.target_fields = [target_fields] if isinstance(target_fields, str) else target_fields
        self.info_cols = [info_cols] if isinstance(info_cols, str) else info_cols
        self.mode = mode
        self.by_group = by_group
        self.quant_field = quant_field
        self._preprocessing_data()

    def _preprocessing_data(self):
        """Converts categorical variables to dummy variables.

        Info columns are removed *before* encoding: otherwise a categorical info
        column (e.g. a string user id) would itself be exploded into dummies and
        could no longer be dropped by its original name.
        """
        data = self.data
        if self.info_cols is not None:
            data = data.drop(columns=self.info_cols)

        # categorical to dummies
        init_cols = data.columns
        data = pd.get_dummies(data, dummy_na=True)

        # fix if dummy_na is const=0
        dummies_cols = [col for col in data.columns if col not in init_cols]  # only dummy columns
        const_columns = [col for col in dummies_cols if data[col].sum() == 0]  # constant columns
        self.data = data.drop(columns=const_columns)

    def __simple_mode(self, data: pd.DataFrame, random_state: int = RANDOM_STATE):
        """Separates data on A and B samples.

        Separation performed to divide groups of equal sizes - equal amount of records
        or equal amount of groups in each sample.

        Args:
            data: Input data
            random_state: Seed of random

        Returns:
            result: Test and control samples of indexes dictionary
        """
        result = {"test_indexes": [], "control_indexes": []}
        # RandomState(seed).permutation produces the same Fisher-Yates sequence
        # as sklearn.utils.shuffle(..., random_state=seed), without the dependency.
        rng = np.random.RandomState(random_state)

        if self.quant_field:
            random_ids = rng.permutation(data[self.quant_field].unique())
            edge = len(random_ids) // 2
            result["test_indexes"] = list(data[data[self.quant_field].isin(random_ids[:edge])].index)
            result["control_indexes"] = list(data[data[self.quant_field].isin(random_ids[edge:])].index)
        else:
            addition_indexes = list(rng.permutation(data.index))
            edge = len(addition_indexes) // 2
            result["test_indexes"] = addition_indexes[:edge]
            result["control_indexes"] = addition_indexes[edge:]

        return result

    def split_ab(self, random_state: int = RANDOM_STATE) -> Dict:
        """Divides sample on two groups.

        Args:
            random_state: one integer to fix split

        Returns:
            result: dict of indexes with division on test and control group
        """
        data = self.data
        result = {"test_indexes": [], "control_indexes": []}

        if self.by_group:
            # FIX: the grouping column(s) were not passed to groupby before.
            groups = data.groupby(self.by_group)
            for _, gd in groups:
                if self.mode not in ("balanced", "simple"):
                    warnings.warn(
                        f"The mode '{self.mode}' is not supported for group division. "
                        f"Implemented mode 'simple'."
                    )
                    self.mode = "simple"

                if self.mode == "simple":
                    t_result = self.__simple_mode(gd, random_state)
                    result["test_indexes"] += t_result["test_indexes"]
                    result["control_indexes"] += t_result["control_indexes"]

                elif self.mode == "balanced":
                    rng = np.random.RandomState(random_state)
                    if self.quant_field:
                        random_ids = rng.permutation(gd[self.quant_field].unique())
                        addition_indexes = list(gd[gd[self.quant_field].isin(random_ids)].index)
                    else:
                        addition_indexes = list(rng.permutation(gd.index))

                    # Append the whole group to whichever side is currently smaller.
                    if len(result["control_indexes"]) > len(result["test_indexes"]):
                        result["test_indexes"] += addition_indexes
                    else:
                        result["control_indexes"] += addition_indexes

        else:
            if self.mode != "simple":
                warnings.warn(
                    f"The mode '{self.mode}' is not supported for regular division. "
                    f"Implemented mode 'simple'."
                )

            t_result = self.__simple_mode(data, random_state)
            result["test_indexes"] = t_result["test_indexes"]
            result["control_indexes"] = t_result["control_indexes"]

        result["test_indexes"] = list(set(result["test_indexes"]))
        # FIX: control_indexes was previously overwritten with test_indexes,
        # making the control group a copy of the test group.
        result["control_indexes"] = list(set(result["control_indexes"]))

        return result

    def _postprep_data(self, split_indexes: Dict = None):
        """Preps data to show user (adds info_cols and decodes binary variables).

        Args:
            split_indexes: dict of indexes with separation on test and control group

        Returns:
            data: separated init data with column "group"
        """
        # .copy() so that merge_groups' column assignment does not hit a view
        # of init_data (chained-assignment warning / accidental mutation).
        test = self.init_data.loc[split_indexes["test_indexes"]].copy()
        control = self.init_data.loc[split_indexes["control_indexes"]].copy()
        return merge_groups(test, control)

    def sampling_metrics(self, alpha: float = 0.05, random_state: int = RANDOM_STATE):
        """Calculates homogeneity metrics (t-test and KS-test) for one split.

        Args:
            alpha: Threshold to check statistical hypothesis; usually 0.05
            random_state: Random seed for the split

        Returns:
            result: dict with
                'metrics' - stat-test metrics of the split and
                'data_from_experiment' - dict of random state to test_control dataframe
        """
        data_from_sampling_dict = {}
        scores = []
        t_result = {"random_state": random_state}

        split = self.split_ab(random_state)
        a = self.data.loc[split["test_indexes"]]
        b = self.data.loc[split["control_indexes"]]

        # prep data to show user (add info_cols and decode binary variables)
        data_from_sampling_dict[random_state] = self._postprep_data(split)

        for tf in self.target_fields:
            ta = a[tf]
            tb = b[tf]

            t_result[f"{tf} a mean"] = ta.mean()
            t_result[f"{tf} b mean"] = tb.mean()
            t_result[f"{tf} ab delta %"] = (1 - t_result[f"{tf} a mean"] / t_result[f"{tf} b mean"]) * 100
            t_result[f"{tf} t_test p_value"] = ttest_ind(ta, tb).pvalue
            t_result[f"{tf} ks_test p_value"] = ks_2samp(ta, tb).pvalue
            # A split is homogeneous when the null hypothesis is NOT rejected.
            t_result[f"{tf} t_test passed"] = t_result[f"{tf} t_test p_value"] > alpha
            t_result[f"{tf} ks_test passed"] = t_result[f"{tf} ks_test p_value"] > alpha
            scores.append((t_result[f"{tf} t_test p_value"] + t_result[f"{tf} ks_test p_value"]) / 2)

        t_result["mean_tests_score"] = np.mean(scores)
        return {"metrics": t_result, "data_from_experiment": data_from_sampling_dict}

    def search_dist_uniform_sampling(
        self,
        alpha: float = 0.05,
        iterations: int = 10,
        file_name: Union[Path, str] = None,
        write_mode: str = "full",
        write_step: int = None,
        pbar: bool = True,
    ) -> Optional[Tuple[pd.DataFrame, Dict[Any, Dict]]]:
        """Chooses random_state for finding homogeneous distribution.

        Args:
            alpha: Threshold to check statistical hypothesis; usually 0.05
            iterations: Number of iterations to search uniform sampling
            file_name: Name of file to save results (if None - no results will be saved, func returns result)
            write_mode:
                Mode to write:
                'full' - save all experiments
                'all' - save experiments that passed all statistical tests
                'any' - save experiments that passed any statistical test
            write_step: Step to write experiments to file
            pbar: Flag to show progress bar

        Returns:
            results:
                If no saving (no file_name, no write mode and no write_step) returns dataframe
                else None and saves file to csv
        """
        random_states = range(iterations)
        results = []
        data_from_sampling = {}

        if write_mode not in ("full", "all", "any"):
            warnings.warn(f"Write mode '{write_mode}' is not supported. Mode 'full' will be used")
            write_mode = "full"

        # FIX: tqdm has no 'display' kwarg; 'disable' is the supported flag.
        for i, rs in tqdm(enumerate(random_states), total=len(random_states), disable=not pbar):
            res = self.sampling_metrics(alpha=alpha, random_state=rs)
            data_from_sampling.update(res["data_from_experiment"])

            passed = []
            for tf in self.target_fields:
                passed += [res["metrics"][f"{tf} t_test passed"], res["metrics"][f"{tf} ks_test passed"]]

            if (
                write_mode == "full"
                or (write_mode == "all" and all(passed))
                or (write_mode == "any" and any(passed))
            ):
                results.append(res["metrics"])

            if file_name and write_step and i > 0 and i % write_step == 0:
                # First chunk creates the file with a header; later chunks append.
                pd.DataFrame(results).to_csv(
                    file_name,
                    index=False,
                    header=(i == write_step),
                    mode="w" if i == write_step else "a",
                )
                # FIX: the buffer was not cleared after the first chunk,
                # so already-written rows were appended again later.
                results = []

        if file_name and write_step:
            pd.DataFrame(results).to_csv(file_name, index=False, header=False, mode="a")
        elif file_name:
            results = pd.DataFrame(results)
            results.to_csv(file_name, index=False)
            return results, data_from_sampling
        else:
            return pd.DataFrame(results), data_from_sampling
+# +# Args: +# stat: Experiment statistics +# experiments: Set of experiments applied on sample +# rename_map: Mapping of renaming fields +# +# Returns: +# result: Formatted values +# """ +# rename_map = rename_map or self.DEFAULT_FORMAT_MAPPING +# +# rename_map.update({f"effect {e.label}": f"Effect {e.label}" for e in experiments}) +# +# result = stat.rename(columns=rename_map) +# result = result.applymap(lambda x: f"{x:,.2f}") +# return result diff --git a/lightautoml/addons/hypex/ABTesting/tests_abtester.py b/lightautoml/addons/hypex/ABTesting/tests_abtester.py new file mode 100644 index 00000000..ade31721 --- /dev/null +++ b/lightautoml/addons/hypex/ABTesting/tests_abtester.py @@ -0,0 +1,7 @@ +from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data + + +def test_aa(): + data = create_test_data(nans_periods=10) + info_col = 'user_id' + model = AATest(data=data, target_fields=['pre_spends', 'post_spends']) diff --git a/lightautoml/addons/hypex/utils/tutorial_data_creation.py b/lightautoml/addons/hypex/utils/tutorial_data_creation.py index 4a497c9a..1ba716d0 100644 --- a/lightautoml/addons/hypex/utils/tutorial_data_creation.py +++ b/lightautoml/addons/hypex/utils/tutorial_data_creation.py @@ -2,18 +2,98 @@ import pandas as pd import sys from pathlib import Path +from typing import Iterable, Union + ROOT = Path('.').absolute().parents[0] sys.path.append(str(ROOT)) -def create_test_data(num_users: int = 10000, file_name: str = None): +def set_nans( + data: pd.DataFrame, + na_step: Union[Iterable[int], int] = None, + nan_cols: Union[Iterable[str], str] = None +): + """Fill some values with NaN/ + + Args: + data: input dataframe + na_step: + num or list of nums of period to make NaN (step of range) + If list - iterates accordingly order of columns + nan_cols: + name of one or several columns to fill with NaN + If list - iterates accordingly order of na_step + + Returns: + data: dataframe with some NaNs + """ + if (nan_cols is not None) or 
(na_step is not None): + # correct type of columns to iterate + + # number of nans + if na_step is None: + na_step = [10] + print(f'No na_step specified: set to {na_step}') + elif not isinstance(na_step, Iterable): + na_step = [na_step] + + # columns + if nan_cols is None: + nan_cols = list(data.columns) + print('No nan_cols specified. Setting NaNs applied to all columns') + elif not isinstance(nan_cols, Iterable): + nan_cols = [nan_cols] + + # correct length of two lists + if len(na_step) > len(nan_cols): + na_step = na_step[:len(nan_cols)] + print('Length of na_step is bigger than length of columns. Used only first values') + elif len(na_step) < len(nan_cols): + na_step = na_step + [na_step[-1]] * (len(nan_cols) - len(na_step)) + print('Length of na_step is less than length of columns. Used last value several times') + + # create list of indexes to fill with na + nans_indexes = [list(range(i, len(data), period)) for i, period in enumerate(na_step)] + + for i in range(len(nan_cols)): + try: + data.loc[nans_indexes[i], nan_cols[i]] = np.nan + except KeyError: + print(f'There is no column {nan_cols[i]} in data. No nans in this column will be added.') + else: + print('No NaN added') + + return data + + +def create_test_data( + num_users: int = 10000, + na_step: Union[Iterable[int], int] = None, + nan_cols: Union[Iterable[str], str] = None, + file_name: str = None +): + """Creates data for tutorial. 
+ + Args: + num_users: num of strings + na_step: + num or list of nums of period to make NaN (step of range) + If list - iterates accordingly order of columns + nan_cols: + name of one or several columns to fill with NaN + If list - iterates accordingly order of na_step + file_name: name of file to save; doesn't save file if None + + Returns: + data: dataframe with + """ # Simulating dataset with known effect size num_months = 12 # signup_months == 0 means customer did not sign up signup_months = np.random.choice(np.arange(1, num_months), num_users) * np.random.randint(0, 2, size=num_users) - df = pd.DataFrame( + data = pd.DataFrame( { "user_id": np.repeat(np.arange(num_users), num_months), "signup_month": np.repeat(signup_months, num_months), # signup month == 0 means customer did not sign up @@ -23,19 +103,19 @@ def create_test_data(num_users: int = 10000, file_name: str = None): ) # A customer is in the treatment group if and only if they signed up - df["treat"] = df["signup_month"] > 0 + data["treat"] = data["signup_month"] > 0 # Simulating an effect of month (monotonically decreasing--customers buy less later in the year) - df["spend"] = df["spend"] - df["month"] * 10 + data["spend"] = data["spend"] - data["month"] * 10 # Simulating a simple treatment effect of 100 - after_signup = (df["signup_month"] < df["month"]) & (df["treat"]) - df.loc[after_signup, "spend"] = df[after_signup]["spend"] + 100 + after_signup = (data["signup_month"] < data["month"]) & (data["treat"]) + data.loc[after_signup, "spend"] = data[after_signup]["spend"] + 100 # Setting the signup month (for ease of analysis) i = 3 - df_i_signupmonth = ( - df[df.signup_month.isin([0, i])] + data = ( + data[data.signup_month.isin([0, i])] .groupby(["user_id", "signup_month", "treat"]) .apply( lambda x: pd.Series( @@ -46,25 +126,28 @@ def create_test_data(num_users: int = 10000, file_name: str = None): ) # Additional category features - gender_i = np.random.choice(a=[0, 1], 
size=df_i_signupmonth.user_id.nunique()) + gender_i = np.random.choice(a=[0, 1], size=data.user_id.nunique()) gender = [["M", "F"][i] for i in gender_i] - age = np.random.choice(a=range(18, 70), size=df_i_signupmonth.user_id.nunique()) + age = np.random.choice(a=range(18, 70), size=data.user_id.nunique()) - industry_i = np.random.choice(a=range(1, 3), size=df_i_signupmonth.user_id.nunique()) + industry_i = np.random.choice(a=range(1, 3), size=data.user_id.nunique()) industry_names = ["Finance", "E-commerce", "Logistics"] industry = [industry_names[i] for i in industry_i] - df_i_signupmonth["age"] = age - df_i_signupmonth["gender"] = gender - df_i_signupmonth["industry"] = industry - df_i_signupmonth["industry"] = df_i_signupmonth["industry"].astype("str") - df_i_signupmonth["treat"] = df_i_signupmonth["treat"].astype(int) + data["age"] = age + data["gender"] = gender + data["industry"] = industry + data["industry"] = data["industry"].astype("str") + data["treat"] = data["treat"].astype(int) + + # input nans in data if needed + data = set_nans(data, na_step, nan_cols) if file_name is not None: - df_i_signupmonth.to_csv(ROOT / f"{file_name}.csv", index=False) + data.to_csv(ROOT / f"{file_name}.csv", index=False) - return df_i_signupmonth + return data -create_test_data(num_users=10_000, file_name="Tutorial_data") \ No newline at end of file +# create_test_data(num_users=10_000, file_name="Tutorial_data") \ No newline at end of file From 50cb810dfe495d3ac1f69cfe5f31a88611e98820 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=93=D0=BB=D0=B0=D0=B7=D0=BE=D0=B2=D0=B0=20=D0=AF=D0=BD?= =?UTF-8?q?=D0=B0=20=D0=90=D0=BB=D0=B5=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?= =?UTF-8?q?=D0=BE=D0=B2=D0=BD=D0=B0?= Date: Mon, 25 Sep 2023 15:03:04 +0300 Subject: [PATCH 09/20] matcher no rep bug fix --- lightautoml/addons/hypex/matcher.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lightautoml/addons/hypex/matcher.py b/lightautoml/addons/hypex/matcher.py index 
f792eb2f..b0c7676c 100644 --- a/lightautoml/addons/hypex/matcher.py +++ b/lightautoml/addons/hypex/matcher.py @@ -314,22 +314,24 @@ def match_no_rep(self, threshold: float = 0.1) -> pd.DataFrame: X = X.drop(columns=self.info_col) index_matched = MatcherNoReplacement(X, a, self.weights).match() - index_matched = np.concatenate(index_matched.loc[1].iloc[self.input_data[a == 1].index].matches.values) + filtred_matches = index_matched.loc[1].iloc[self.input_data[a == 1].index].matches[index_matched.loc[1].iloc[self.input_data[a == 1].index].matches.apply(lambda x: x != [])] if self.weights is not None: weighted_features = [f for f in self.weights.keys()] index_dict = dict() for w in weighted_features: - source = self.input_data.loc[index_matched][w].values - target = self.input_data[a == 1][w].values + source = self.input_data.loc[np.concatenate(filtred_matches.values)][w].values + target = self.input_data.loc[filtred_matches.index.to_list()][w].values index = abs(source - target) <= abs(source) * threshold index_dict.update({w: index}) index_filtered = sum(index_dict.values()) == len(self.weights) matched_data = pd.concat( - [self.input_data[a == 1].iloc[index_filtered], self.input_data.loc[index_matched].iloc[index_filtered]] + [self.input_data.loc[filtred_matches.index.to_list()].iloc[index_filtered], + self.input_data.loc[np.concatenate(filtred_matches.values)].iloc[index_filtered]] ) else: - matched_data = pd.concat([self.input_data[a == 1], self.input_data.loc[index_matched]]) + matched_data = pd.concat([self.input_data.loc[filtred_matches.index.to_list()], + self.input_data.loc[np.concatenate(filtred_matches.values)]]) return matched_data def lama_feature_select(self) -> pd.DataFrame: From 5cf0b391dabdbbc4b42cb1127c926a0112eeb0e7 Mon Sep 17 00:00:00 2001 From: 20810012 Date: Mon, 25 Sep 2023 16:56:26 +0300 Subject: [PATCH 10/20] AA-test done fully - tests, tutorial and module --- .../addons/hypex/ABTesting/ab_tester.py | 115 ++++++++++-------- 
.../addons/hypex/ABTesting/tests_abtester.py | 72 ++++++++++- .../hypex/utils/tutorial_data_creation.py | 5 +- 3 files changed, 133 insertions(+), 59 deletions(-) diff --git a/lightautoml/addons/hypex/ABTesting/ab_tester.py b/lightautoml/addons/hypex/ABTesting/ab_tester.py index bd241583..d62355c4 100644 --- a/lightautoml/addons/hypex/ABTesting/ab_tester.py +++ b/lightautoml/addons/hypex/ABTesting/ab_tester.py @@ -15,8 +15,7 @@ def merge_groups( - test_group: Union[Iterable[pd.DataFrame], pd.DataFrame], - control_group: Union[Iterable[pd.DataFrame], pd.DataFrame], + test_group: Union[Iterable[pd.DataFrame], pd.DataFrame], control_group: Union[Iterable[pd.DataFrame], pd.DataFrame], ): """Merges test and control groups in one DataFrame and creates column "group". @@ -44,13 +43,13 @@ def merge_groups( class AATest: def __init__( - self, - data: pd.DataFrame, - target_fields: Union[Iterable[str], str], - info_cols: Union[Iterable[str], str] = None, - mode: str = "simple", - by_group: Union[str, Iterable[str]] = None, - quant_field: str = None + self, + data: pd.DataFrame, + target_fields: Union[Iterable[str], str], + info_cols: Union[Iterable[str], str] = None, + group_cols: Union[str, Iterable[str]] = None, + quant_field: str = None, + mode: str = "simple" ): """ @@ -61,9 +60,11 @@ def __init__( Field with target value mode: Regime to divide sample on A and B samples: - 'simple' - divides by groups, placing equal amount of records (clients | groups of clients) in samples - 'balanced' - divides by size of samples, placing full groups depending on the size of group to balance size of A and B. Can not be applied without groups - by_group: + 'simple' - divides by groups, placing equal amount + of records (clients | groups of clients) in samples + 'balanced' - divides by size of samples, placing full groups depending on the size of group + to balance size of A and B. 
Can not be applied without groups + group_cols: Name of field(s) for division by groups quant_field: Name of field by which division should take in account common features besides groups @@ -72,9 +73,9 @@ def __init__( self.init_data = data self.target_fields = [target_fields] if isinstance(target_fields, str) else target_fields self.info_cols = [info_cols] if isinstance(info_cols, str) else info_cols - self.mode = mode - self.by_group = by_group + self.group_cols = [group_cols] if isinstance(group_cols, str) else group_cols self.quant_field = quant_field + self.mode = mode self._preprocessing_data() def _preprocessing_data(self): @@ -87,18 +88,34 @@ def _preprocessing_data(self): # categorical to dummies init_cols = data.columns - data = pd.get_dummies(data, dummy_na=True) + + dont_binarize_cols = ( # collects names of columns that shouldn't be binarized + self.group_cols+[self.quant_field] + if (self.group_cols is not None) and (self.quant_field is not None) + else self.group_cols + if self.group_cols is not None + else [self.quant_field] + if self.quant_field is not None + else None + ) + # if self.group_cols is not None: + if dont_binarize_cols is not None: + data = pd.get_dummies(data.drop(columns=dont_binarize_cols), dummy_na=True) + data = data.merge(self.data[dont_binarize_cols], left_index=True, right_index=True) + else: + data = pd.get_dummies(data, dummy_na=True) # fix if dummy_na is const=0 - dummies_cols = [col for col in data.columns if col not in init_cols] # choose only dummy columns - const_columns = [col for col in dummies_cols if data[col].sum() == 0] # choose constant_columns + dummies_cols = set(data.columns) - set(init_cols) + const_columns = [col for col in dummies_cols if data[col].nunique() <= 1] # choose constant_columns cols_to_drop = const_columns + (self.info_cols if self.info_cols is not None else []) self.data = data.drop(columns=cols_to_drop) def __simple_mode(self, data: pd.DataFrame, random_state: int = RANDOM_STATE): - 
"""Separates data on A and B samples. + """Separates data on A and B samples within simple mode. - Separation performed to divide groups of equal sizes - equal amount of records or equal amount of groups in each sample. + Separation performed to divide groups of equal sizes - equal amount of records + or equal amount of groups in each sample. Args: data: Input data @@ -135,13 +152,12 @@ def split_ab(self, random_state: int = RANDOM_STATE) -> Dict: data = self.data result = {"test_indexes": [], "control_indexes": []} - if self.by_group: - groups = data.groupby() + if self.group_cols: + groups = data.groupby(self.group_cols) for _, gd in groups: if self.mode not in ("balanced", "simple"): warnings.warn( - f"The mode '{self.mode}' is not supported for group division. " - f"Implemented mode 'simple'." + f"The mode '{self.mode}' is not supported for group division. Implemented mode 'simple'." ) self.mode = "simple" @@ -165,8 +181,7 @@ def split_ab(self, random_state: int = RANDOM_STATE) -> Dict: else: if self.mode != "simple": warnings.warn( - f"The mode '{self.mode}' is not supported for regular division. " - f"Implemented mode 'simple'." + f"The mode '{self.mode}' is not supported for regular division. " f"Implemented mode 'simple'." 
) t_result = self.__simple_mode(data, random_state) @@ -188,17 +203,13 @@ def _postprep_data(self, spit_indexes: Dict = None): data: separated init data with column "group" """ # prep data to show user (add info_cols and decode binary variables) - test = self.init_data.loc[spit_indexes['test_indexes']] - control = self.init_data.loc[spit_indexes['control_indexes']] + test = self.init_data.loc[spit_indexes["test_indexes"]] + control = self.init_data.loc[spit_indexes["control_indexes"]] data = merge_groups(test, control) return data - def sampling_metrics( - self, - alpha: float = 0.05, - random_state: int = RANDOM_STATE - ): + def sampling_metrics(self, alpha: float = 0.05, random_state: int = RANDOM_STATE): """ Args: @@ -215,10 +226,10 @@ def sampling_metrics( t_result = {"random_state": random_state} split = self.split_ab(random_state) - a = self.data.loc[split['test_indexes']] - b = self.data.loc[split['control_indexes']] + a = self.data.loc[split["test_indexes"]] + b = self.data.loc[split["control_indexes"]] - # prep data to show user (add info_cols and decode binary variables) + # prep data to show user (merge indexes and init data) data_from_sampling_dict[random_state] = self._postprep_data(split) for tf in self.target_fields: @@ -235,21 +246,18 @@ def sampling_metrics( scores.append((t_result[f"{tf} t_test p_value"] + t_result[f"{tf} ks_test p_value"]) / 2) t_result["mean_tests_score"] = np.mean(scores) - result = { - 'metrics': t_result, - 'data_from_experiment': data_from_sampling_dict - } + result = {"metrics": t_result, "data_from_experiment": data_from_sampling_dict} return result def search_dist_uniform_sampling( - self, - alpha: float = 0.05, - iterations: int = 10, - file_name: Union[Path, str] = None, - write_mode: str = "full", - write_step: int = None, - pbar: bool = True + self, + alpha: float = 0.05, + iterations: int = 10, + file_name: Union[Path, str] = None, + write_mode: str = "full", + write_step: int = None, + pbar: bool = True, ) -> 
Optional[tuple[pd.DataFrame, dict[Any, dict]]]: """Chooses random_state for finding homogeneous distribution. @@ -280,26 +288,24 @@ def search_dist_uniform_sampling( data_from_sampling = {} if write_mode not in ("full", "all", "any"): - warnings.warn( - f"Write mode '{write_mode}' is not supported. Mode 'full' will be used" - ) + warnings.warn(f"Write mode '{write_mode}' is not supported. Mode 'full' will be used") write_mode = "full" for i, rs in tqdm(enumerate(random_states), total=len(random_states), display=pbar): res = self.sampling_metrics(alpha=alpha, random_state=rs) - data_from_sampling.update(res['data_from_experiment']) + data_from_sampling.update(res["data_from_experiment"]) # write to file passed = [] for tf in self.target_fields: - passed += [res['metrics'][f"{tf} t_test passed"], res['metrics'][f"{tf} ks_test passed"]] + passed += [res["metrics"][f"{tf} t_test passed"], res["metrics"][f"{tf} ks_test passed"]] if write_mode == "all" and all(passed): - results.append(res['metrics']) + results.append(res["metrics"]) if write_mode == "any" and any(passed): - results.append(res['metrics']) + results.append(res["metrics"]) if write_mode == "full": - results.append(res['metrics']) + results.append(res["metrics"]) if file_name and write_step: if i == write_step: @@ -317,6 +323,7 @@ def search_dist_uniform_sampling( else: return pd.DataFrame(results), data_from_sampling + # class ABTest: # """Calculates metrics - MDE, ATE and p_value.""" # diff --git a/lightautoml/addons/hypex/ABTesting/tests_abtester.py b/lightautoml/addons/hypex/ABTesting/tests_abtester.py index ade31721..a38907e9 100644 --- a/lightautoml/addons/hypex/ABTesting/tests_abtester.py +++ b/lightautoml/addons/hypex/ABTesting/tests_abtester.py @@ -1,7 +1,71 @@ +import pandas as pd +from lightautoml.addons.hypex.ABTesting.ab_tester import AATest from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data -def test_aa(): - data = create_test_data(nans_periods=10) - info_col = 
'user_id' - model = AATest(data=data, target_fields=['pre_spends', 'post_spends']) +def test_aa_simple(): + data = create_test_data(rs=52) + info_col = "user_id" + iterations = 20 + + model = AATest(data=data, target_fields=["pre_spends", "post_spends"], info_col=info_col) + res, datas_dict = model.search_dist_uniform_sampling(iterations=iterations) + + assert isinstance(res, pd.DataFrame), "Metrics are not dataframes" + assert res.shape[0] == iterations, "Metrics dataframe contains more or less rows with random states " \ + "(#rows should be equal #of experiments" + assert info_col not in model.data, "Info_col is take part in experiment, it should be deleted in preprocess" + assert isinstance(datas_dict, dict), "Result is not dict" + assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations" + assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), "Columns in the result are not " \ + "the same as columns in initial data" + + +def test_aa_group(): + data = create_test_data(rs=52) + info_col = "user_id" + group_cols = 'industry' + iterations = 20 + + model = AATest( + data=data, + target_fields=["pre_spends", "post_spends"], + info_col=info_col, + group_cols=group_cols + ) + res, datas_dict = model.search_dist_uniform_sampling(iterations=iterations) + + assert isinstance(res, pd.DataFrame), "Metrics are not dataframes" + assert res.shape[0] == iterations, "Metrics dataframe contains more or less rows with random states " \ + "(#rows should be equal #of experiments" + assert info_col not in model.data, "Info_col is take part in experiment, it should be deleted in preprocess" + assert isinstance(datas_dict, dict), "Result is not dict" + assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations" + assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), "Columns in the result are not " \ + "the same as columns in initial data" + + +def test_aa_quantfields(): + data = 
create_test_data(rs=52) + info_col = "user_id" + group_cols = 'industry' + quant_field = 'gender' + iterations = 20 + + model = AATest( + data=data, + target_fields=["pre_spends", "post_spends"], + info_col=info_col, + group_cols=group_cols, + quant_field=quant_field + ) + res, datas_dict = model.search_dist_uniform_sampling(iterations=iterations) + + assert isinstance(res, pd.DataFrame), "Metrics are not dataframes" + assert res.shape[0] == iterations, "Metrics dataframe contains more or less rows with random states " \ + "(#rows should be equal #of experiments" + assert info_col not in model.data, "Info_col is take part in experiment, it should be deleted in preprocess" + assert isinstance(datas_dict, dict), "Result is not dict" + assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations" + assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), "Columns in the result are not " \ + "the same as columns in initial data" \ No newline at end of file diff --git a/lightautoml/addons/hypex/utils/tutorial_data_creation.py b/lightautoml/addons/hypex/utils/tutorial_data_creation.py index 1ba716d0..781627f8 100644 --- a/lightautoml/addons/hypex/utils/tutorial_data_creation.py +++ b/lightautoml/addons/hypex/utils/tutorial_data_creation.py @@ -70,7 +70,8 @@ def create_test_data( num_users: int = 10000, na_step: Union[Iterable[int], int] = None, nan_cols: Union[Iterable[str], str] = None, - file_name: str = None + file_name: str = None, + rs = None ): """Creates data for tutorial. 
@@ -87,6 +88,8 @@ def create_test_data( Returns: data: dataframe with """ + if rs is not None: + np.random.seed(rs) # Simulating dataset with known effect size num_months = 12 From c069258c2088f17465373ab4925c8d46f6e514c6 Mon Sep 17 00:00:00 2001 From: 20810012 Date: Tue, 26 Sep 2023 09:45:45 +0300 Subject: [PATCH 11/20] tests corrected --- lightautoml/addons/hypex/ABTesting/ab_tester.py | 2 +- .../hypex/{ABTesting => tests}/tests_abtester.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) rename lightautoml/addons/hypex/{ABTesting => tests}/tests_abtester.py (91%) diff --git a/lightautoml/addons/hypex/ABTesting/ab_tester.py b/lightautoml/addons/hypex/ABTesting/ab_tester.py index d62355c4..5e34dfdf 100644 --- a/lightautoml/addons/hypex/ABTesting/ab_tester.py +++ b/lightautoml/addons/hypex/ABTesting/ab_tester.py @@ -291,7 +291,7 @@ def search_dist_uniform_sampling( warnings.warn(f"Write mode '{write_mode}' is not supported. Mode 'full' will be used") write_mode = "full" - for i, rs in tqdm(enumerate(random_states), total=len(random_states), display=pbar): + for i, rs in tqdm(enumerate(random_states), total=len(random_states)):#, display=pbar): res = self.sampling_metrics(alpha=alpha, random_state=rs) data_from_sampling.update(res["data_from_experiment"]) diff --git a/lightautoml/addons/hypex/ABTesting/tests_abtester.py b/lightautoml/addons/hypex/tests/tests_abtester.py similarity index 91% rename from lightautoml/addons/hypex/ABTesting/tests_abtester.py rename to lightautoml/addons/hypex/tests/tests_abtester.py index a38907e9..5a1ce94c 100644 --- a/lightautoml/addons/hypex/ABTesting/tests_abtester.py +++ b/lightautoml/addons/hypex/tests/tests_abtester.py @@ -8,7 +8,11 @@ def test_aa_simple(): info_col = "user_id" iterations = 20 - model = AATest(data=data, target_fields=["pre_spends", "post_spends"], info_col=info_col) + model = AATest( + data=data, + target_fields=["pre_spends", "post_spends"], + info_cols=info_col + ) res, datas_dict = 
model.search_dist_uniform_sampling(iterations=iterations) assert isinstance(res, pd.DataFrame), "Metrics are not dataframes" @@ -30,7 +34,7 @@ def test_aa_group(): model = AATest( data=data, target_fields=["pre_spends", "post_spends"], - info_col=info_col, + info_cols=info_col, group_cols=group_cols ) res, datas_dict = model.search_dist_uniform_sampling(iterations=iterations) @@ -42,7 +46,8 @@ def test_aa_group(): assert isinstance(datas_dict, dict), "Result is not dict" assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations" assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), "Columns in the result are not " \ - "the same as columns in initial data" + "the same as columns in initial " \ + "data " def test_aa_quantfields(): @@ -55,7 +60,7 @@ def test_aa_quantfields(): model = AATest( data=data, target_fields=["pre_spends", "post_spends"], - info_col=info_col, + info_cols=info_col, group_cols=group_cols, quant_field=quant_field ) From f2c3cf29f30c5d3aaaa4c444bf095d4736a7eb69 Mon Sep 17 00:00:00 2001 From: 20810012 Date: Tue, 26 Sep 2023 11:01:14 +0300 Subject: [PATCH 12/20] tests corrected (black) --- lightautoml/addons/hypex/tests/tests_abtester.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lightautoml/addons/hypex/tests/tests_abtester.py b/lightautoml/addons/hypex/tests/tests_abtester.py index 5a1ce94c..e8dd5d46 100644 --- a/lightautoml/addons/hypex/tests/tests_abtester.py +++ b/lightautoml/addons/hypex/tests/tests_abtester.py @@ -22,7 +22,8 @@ def test_aa_simple(): assert isinstance(datas_dict, dict), "Result is not dict" assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations" assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), "Columns in the result are not " \ - "the same as columns in initial data" + "the same as columns in initial " \ + "data " def test_aa_group(): @@ -73,4 +74,5 @@ def test_aa_quantfields(): assert 
isinstance(datas_dict, dict), "Result is not dict" assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations" assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), "Columns in the result are not " \ - "the same as columns in initial data" \ No newline at end of file + "the same as columns in initial " \ + "data " From 859dac0126891427831f9cf8cee697967ce2398c Mon Sep 17 00:00:00 2001 From: 20810012 Date: Tue, 26 Sep 2023 16:12:38 +0300 Subject: [PATCH 13/20] tutorial in AA + metrics in AB --- Tutorial_13_ABtesting.ipynb | 1881 +++++++++++++++++++++++++++++++++++ 1 file changed, 1881 insertions(+) create mode 100644 Tutorial_13_ABtesting.ipynb diff --git a/Tutorial_13_ABtesting.ipynb b/Tutorial_13_ABtesting.ipynb new file mode 100644 index 00000000..b7bf39a2 --- /dev/null +++ b/Tutorial_13_ABtesting.ipynb @@ -0,0 +1,1881 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "eaaddf5e", + "metadata": {}, + "source": [ + "1. Привести в туториале пример группировки и квантизации\n", + "3. AB-test сделать \n", + " - разобраться в пайплайне и сделать его\n", + " - вставить info_col\n", + " - добавить метрики cuped и did\n", + "2. написать тесты\n", + "3. как работать с пропусками в АА (ttest)\n", + "- название переменных поправить\n", + "\n", + "**Done**\n", + "- протестить один и несколько таргетов\n", + "- добавить info_col (iterable или str) в AAtest" + ] + }, + { + "cell_type": "markdown", + "id": "0aa723ec", + "metadata": {}, + "source": [ + "# How to perform AA or AB tests" + ] + }, + { + "cell_type": "markdown", + "id": "0d3be579", + "metadata": {}, + "source": [ + "## 0. 
Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "9af0ac46", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from lightautoml.addons.hypex.ABTesting.ab_tester import AATest#, ABTest\n", + "from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data\n", + "\n", + "np.random.seed(52) #needed to create example data" + ] + }, + { + "cell_type": "markdown", + "id": "317f21f2", + "metadata": {}, + "source": [ + "## 1. Create or upload your dataset\n", + "In this case we will create random dataset with known effect size \n", + "If you have your own dataset, go to the part 2 " + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "946d5c6a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No NaN added\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idsignup_monthtreatpre_spendspost_spendsagegenderindustry
0000488.0414.44444467ME-commerce
1300501.5424.33333331MLogistics
21000522.5416.22222264ME-commerce
31200472.0423.77777843ME-commerce
41300508.5424.22222236FE-commerce
...........................
5365999100482.5421.88888923FE-commerce
5366999200491.5424.00000044ME-commerce
5367999400486.0423.77777827FLogistics
5368999600500.5430.88888956FE-commerce
5369999731473.0534.11111156MLogistics
\n", + "

5370 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " user_id signup_month treat pre_spends post_spends age gender \\\n", + "0 0 0 0 488.0 414.444444 67 M \n", + "1 3 0 0 501.5 424.333333 31 M \n", + "2 10 0 0 522.5 416.222222 64 M \n", + "3 12 0 0 472.0 423.777778 43 M \n", + "4 13 0 0 508.5 424.222222 36 F \n", + "... ... ... ... ... ... ... ... \n", + "5365 9991 0 0 482.5 421.888889 23 F \n", + "5366 9992 0 0 491.5 424.000000 44 M \n", + "5367 9994 0 0 486.0 423.777778 27 F \n", + "5368 9996 0 0 500.5 430.888889 56 F \n", + "5369 9997 3 1 473.0 534.111111 56 M \n", + "\n", + " industry \n", + "0 E-commerce \n", + "1 Logistics \n", + "2 E-commerce \n", + "3 E-commerce \n", + "4 E-commerce \n", + "... ... \n", + "5365 E-commerce \n", + "5366 E-commerce \n", + "5367 Logistics \n", + "5368 E-commerce \n", + "5369 Logistics \n", + "\n", + "[5370 rows x 8 columns]" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = create_test_data(rs=52)\n", + "data" + ] + }, + { + "cell_type": "markdown", + "id": "4d1d05d4", + "metadata": {}, + "source": [ + "## 2. AATest" + ] + }, + { + "cell_type": "markdown", + "id": "81aec791", + "metadata": {}, + "source": [ + "### 2.0 Initialize parameters\n", + "info_col used to define informative attributes that should NOT be part of testing, such as user_id and signup_month
" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "b8156de6", + "metadata": {}, + "outputs": [], + "source": [ + "info_cols = ['user_id', 'signup_month']\n", + "target = 'post_spends'" + ] + }, + { + "cell_type": "markdown", + "id": "691d7549", + "metadata": {}, + "source": [ + "### 2.1 Simple AA-test\n", + "This is the easiest way to initialize and calculate metrics on a AA-test on 10 iterations
\n", + "Use it when you are clear about each attribute or if you don't have any additional task conditions (like grouping)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "d0677a8f", + "metadata": {}, + "outputs": [], + "source": [ + "experiment = AATest(data=data, info_cols=info_cols, target_fields=target)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "dc160d89", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0ea83b02b7174bbfb6eb8e54fd224b0d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/10 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
random_statepost_spends a meanpost_spends b meanpost_spends ab delta %post_spends t_test p_valuepost_spends ks_test p_valuepost_spends t_test passedpost_spends ks_test passedmean_tests_score
00427.848003427.8480030.01.01.0TrueTrue1.0
11427.672046427.6720460.01.01.0TrueTrue1.0
22428.380095428.3800950.01.01.0TrueTrue1.0
\n", + "" + ], + "text/plain": [ + " random_state post_spends a mean post_spends b mean \\\n", + "0 0 427.848003 427.848003 \n", + "1 1 427.672046 427.672046 \n", + "2 2 428.380095 428.380095 \n", + "\n", + " post_spends ab delta % post_spends t_test p_value \\\n", + "0 0.0 1.0 \n", + "1 0.0 1.0 \n", + "2 0.0 1.0 \n", + "\n", + " post_spends ks_test p_value post_spends t_test passed \\\n", + "0 1.0 True \n", + "1 1.0 True \n", + "2 1.0 True \n", + "\n", + " post_spends ks_test passed mean_tests_score \n", + "0 True 1.0 \n", + "1 True 1.0 \n", + "2 True 1.0 " + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experiment_result, dict_of_datas = experiment.search_dist_uniform_sampling()\n", + "experiment_result.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "d57d3639", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idsignup_monthtreatpre_spendspost_spendsagegenderindustrygroup
0300501.5424.33333331MLogisticstest
11000522.5416.22222264ME-commercetest
21200472.0423.77777843ME-commercetest
31300508.5424.22222236FE-commercetest
41400497.0421.77777826MLogisticstest
..............................
5365998700467.0431.55555648MLogisticscontrol
5366998800501.5423.22222255FLogisticscontrol
5367999100482.5421.88888923FE-commercecontrol
5368999200491.5424.00000044ME-commercecontrol
5369999600500.5430.88888956FE-commercecontrol
\n", + "

5370 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " user_id signup_month treat pre_spends post_spends age gender \\\n", + "0 3 0 0 501.5 424.333333 31 M \n", + "1 10 0 0 522.5 416.222222 64 M \n", + "2 12 0 0 472.0 423.777778 43 M \n", + "3 13 0 0 508.5 424.222222 36 F \n", + "4 14 0 0 497.0 421.777778 26 M \n", + "... ... ... ... ... ... ... ... \n", + "5365 9987 0 0 467.0 431.555556 48 M \n", + "5366 9988 0 0 501.5 423.222222 55 F \n", + "5367 9991 0 0 482.5 421.888889 23 F \n", + "5368 9992 0 0 491.5 424.000000 44 M \n", + "5369 9996 0 0 500.5 430.888889 56 F \n", + "\n", + " industry group \n", + "0 Logistics test \n", + "1 E-commerce test \n", + "2 E-commerce test \n", + "3 E-commerce test \n", + "4 Logistics test \n", + "... ... ... \n", + "5365 Logistics control \n", + "5366 Logistics control \n", + "5367 E-commerce control \n", + "5368 E-commerce control \n", + "5369 E-commerce control \n", + "\n", + "[5370 rows x 9 columns]" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dict_of_datas[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "46aa41af", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a\n" + ] + } + ], + "source": [ + "if all(data.columns) == all(dict_of_datas[0].drop(columns=['group']).columns):\n", + " print('a')\n", + "else:\n", + " print('b')" + ] + }, + { + "cell_type": "markdown", + "id": "fb387062", + "metadata": {}, + "source": [ + "#### 2.1.0 Single experiment\n", + "To perform single experiment you can use sampling_metrics()" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "ac3dc471", + "metadata": {}, + "outputs": [], + "source": [ + "random_state = 11" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "90c346c7", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'random_state': 11,\n", + " 'post_spends a mean': 427.78932340161515,\n", + " 
'post_spends b mean': 427.78932340161515,\n", + " 'post_spends ab delta %': 0.0,\n", + " 'post_spends t_test p_value': 1.0,\n", + " 'post_spends ks_test p_value': 1.0,\n", + " 'post_spends t_test passed': True,\n", + " 'post_spends ks_test passed': True,\n", + " 'mean_tests_score': 1.0}" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res = experiment.sampling_metrics(random_state=random_state)\n", + "metrics, dict_of_datas = res['metrics'], res['data_from_experiment']\n", + "metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "08ba416c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idsignup_monthtreatpre_spendspost_spendsagegenderindustrygroup
0300501.5424.33333331MLogisticstest
11400497.0421.77777826MLogisticstest
22100489.0433.11111130ME-commercetest
32831479.5527.88888920FE-commercetest
42900505.0414.33333330ME-commercetest
..............................
5365998800501.5423.22222255FLogisticscontrol
5366999000490.0426.00000018ME-commercecontrol
5367999100482.5421.88888923FE-commercecontrol
5368999200491.5424.00000044ME-commercecontrol
5369999600500.5430.88888956FE-commercecontrol
\n", + "

5370 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " user_id signup_month treat pre_spends post_spends age gender \\\n", + "0 3 0 0 501.5 424.333333 31 M \n", + "1 14 0 0 497.0 421.777778 26 M \n", + "2 21 0 0 489.0 433.111111 30 M \n", + "3 28 3 1 479.5 527.888889 20 F \n", + "4 29 0 0 505.0 414.333333 30 M \n", + "... ... ... ... ... ... ... ... \n", + "5365 9988 0 0 501.5 423.222222 55 F \n", + "5366 9990 0 0 490.0 426.000000 18 M \n", + "5367 9991 0 0 482.5 421.888889 23 F \n", + "5368 9992 0 0 491.5 424.000000 44 M \n", + "5369 9996 0 0 500.5 430.888889 56 F \n", + "\n", + " industry group \n", + "0 Logistics test \n", + "1 Logistics test \n", + "2 E-commerce test \n", + "3 E-commerce test \n", + "4 E-commerce test \n", + "... ... ... \n", + "5365 Logistics control \n", + "5366 E-commerce control \n", + "5367 E-commerce control \n", + "5368 E-commerce control \n", + "5369 E-commerce control \n", + "\n", + "[5370 rows x 9 columns]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dict_of_datas[random_state]" + ] + }, + { + "cell_type": "markdown", + "id": "4912e5df", + "metadata": {}, + "source": [ + "### 2.2 AA-test with grouping" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "b274e7bb", + "metadata": {}, + "outputs": [], + "source": [ + "info_cols = ['user_id', 'signup_month']\n", + "target = 'post_spends'\n", + "\n", + "group_cols = 'industry'" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "d14469a5", + "metadata": {}, + "outputs": [], + "source": [ + "experiment = AATest(data=data, info_cols=info_cols, target_fields=target, group_cols=group_cols)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "0e5ac552", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5c7d1ef6dfaa46b782ab990a763fca1f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/10 
[00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
random_statepost_spends a meanpost_spends b meanpost_spends ab delta %post_spends t_test p_valuepost_spends ks_test p_valuepost_spends t_test passedpost_spends ks_test passedmean_tests_score
00428.882431428.8824310.01.01.0TrueTrue1.0
11428.602956428.6029560.01.01.0TrueTrue1.0
22428.846995428.8469950.01.01.0TrueTrue1.0
\n", + "" + ], + "text/plain": [ + " random_state post_spends a mean post_spends b mean \\\n", + "0 0 428.882431 428.882431 \n", + "1 1 428.602956 428.602956 \n", + "2 2 428.846995 428.846995 \n", + "\n", + " post_spends ab delta % post_spends t_test p_value \\\n", + "0 0.0 1.0 \n", + "1 0.0 1.0 \n", + "2 0.0 1.0 \n", + "\n", + " post_spends ks_test p_value post_spends t_test passed \\\n", + "0 1.0 True \n", + "1 1.0 True \n", + "2 1.0 True \n", + "\n", + " post_spends ks_test passed mean_tests_score \n", + "0 True 1.0 \n", + "1 True 1.0 \n", + "2 True 1.0 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experiment_result, dict_of_datas = experiment.search_dist_uniform_sampling()\n", + "experiment_result.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "efdc426d", + "metadata": {}, + "source": [ + "### 2.3 AA-testing with quantifying" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "56ed0a88", + "metadata": {}, + "outputs": [], + "source": [ + "info_cols = ['user_id', 'signup_month']\n", + "target = 'post_spends'\n", + "\n", + "group_cols = 'industry'\n", + "quant_field = 'gender'" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "7872cf08", + "metadata": {}, + "outputs": [], + "source": [ + "experiment = AATest(data=data, info_cols=info_cols, target_fields=target, group_cols=group_cols, quant_field=quant_field)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "32827bbe", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "75c1de04cc5b4e8288545baa02ea3ccc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/10 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
random_statepost_spends a meanpost_spends b meanpost_spends ab delta %post_spends t_test p_valuepost_spends ks_test p_valuepost_spends t_test passedpost_spends ks_test passedmean_tests_score
00428.347612428.3476120.01.01.0TrueTrue1.0
11427.967721427.9677210.01.01.0TrueTrue1.0
22428.347612428.3476120.01.01.0TrueTrue1.0
\n", + "" + ], + "text/plain": [ + " random_state post_spends a mean post_spends b mean \\\n", + "0 0 428.347612 428.347612 \n", + "1 1 427.967721 427.967721 \n", + "2 2 428.347612 428.347612 \n", + "\n", + " post_spends ab delta % post_spends t_test p_value \\\n", + "0 0.0 1.0 \n", + "1 0.0 1.0 \n", + "2 0.0 1.0 \n", + "\n", + " post_spends ks_test p_value post_spends t_test passed \\\n", + "0 1.0 True \n", + "1 1.0 True \n", + "2 1.0 True \n", + "\n", + " post_spends ks_test passed mean_tests_score \n", + "0 True 1.0 \n", + "1 True 1.0 \n", + "2 True 1.0 " + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experiment_result, dict_of_datas = experiment.search_dist_uniform_sampling()\n", + "experiment_result.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1b57c6d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1269e282", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "3c8f6bf5", + "metadata": {}, + "source": [ + "# Metrics" + ] + }, + { + "cell_type": "markdown", + "id": "8226aef5", + "metadata": {}, + "source": [ + "## Data" + ] + }, + { + "cell_type": "code", + "execution_count": 244, + "id": "566fb273", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pre_spendspost_spendsagegender_Fgender_Mindustry_E-commerceindustry_Logisticsgroup
0501.5424.333333310101test
1522.5416.222222640110test
2472.0423.777778430110test
3508.5424.222222361010test
4497.0421.777778260101test
...........................
5365467.0431.555556480101control
5366501.5423.222222551001control
5367482.5421.888889231010control
5368491.5424.000000440110control
5369500.5430.888889561010control
\n", + "

5370 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " pre_spends post_spends age gender_F gender_M industry_E-commerce \\\n", + "0 501.5 424.333333 31 0 1 0 \n", + "1 522.5 416.222222 64 0 1 1 \n", + "2 472.0 423.777778 43 0 1 1 \n", + "3 508.5 424.222222 36 1 0 1 \n", + "4 497.0 421.777778 26 0 1 0 \n", + "... ... ... ... ... ... ... \n", + "5365 467.0 431.555556 48 0 1 0 \n", + "5366 501.5 423.222222 55 1 0 0 \n", + "5367 482.5 421.888889 23 1 0 1 \n", + "5368 491.5 424.000000 44 0 1 1 \n", + "5369 500.5 430.888889 56 1 0 1 \n", + "\n", + " industry_Logistics group \n", + "0 1 test \n", + "1 0 test \n", + "2 0 test \n", + "3 0 test \n", + "4 1 test \n", + "... ... ... \n", + "5365 1 control \n", + "5366 1 control \n", + "5367 0 control \n", + "5368 0 control \n", + "5369 0 control \n", + "\n", + "[5370 rows x 8 columns]" + ] + }, + "execution_count": 244, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prep = AATest(data=dict_of_datas[0], target_fields='pre_spends', info_cols=info_cols, group_cols='group')\n", + "data = prep.data\n", + "data" + ] + }, + { + "cell_type": "markdown", + "id": "b70c9f3d", + "metadata": {}, + "source": [ + "## Diff in means" + ] + }, + { + "cell_type": "markdown", + "id": "8cab0659", + "metadata": {}, + "source": [ + "$$\\widehat {ATE}^{simple} = \\bar Y_{t=1, d=1} - \\bar Y_{t=1, d=0}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 300, + "id": "90506e7e", + "metadata": {}, + "outputs": [], + "source": [ + "def diff_in_means(data:pd.DataFrame, target_col: str, groups_col: str): \n", + " \"\"\"Counts difference in means of test and conrol group in target.\n", + " \n", + " Args:\n", + " data: input dataframe\n", + " target_col: name of target column\n", + " groups_col: name of column with separation on test and control group\n", + " should contain values ('test', 'control')\n", + " \n", + " Returns:\n", + " difm: differemnce in means between test and control groups\n", + " \n", + " \"\"\"\n", + " \n", + " mean_test = 
np.mean(data[data[groups_col]=='test'][target_col]) \n", + " mean_control = np.mean(data[data[groups_col]=='control'][target_col])\n", + " difm = mean_test - mean_control\n", + " \n", + " return difm" + ] + }, + { + "cell_type": "code", + "execution_count": 301, + "id": "47c09f67", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 301, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diff_in_means(data, 'post_spends', 'group')" + ] + }, + { + "cell_type": "markdown", + "id": "9afc5278", + "metadata": {}, + "source": [ + "## Cuped" + ] + }, + { + "cell_type": "markdown", + "id": "329ea2ae", + "metadata": {}, + "source": [ + "$$\\hat Y^{cuped}_{1} = \\bar Y_1 - \\theta \\bar X + \\theta \\mathbb E [X]$$\n", + "\n", + "$$\n", + "\\begin{aligned}\n", + "\\text{Var} \\left( \\hat Y^{cuped}_{1} \\right) &= \\text{Var} \\left( \\bar Y_1 - \\theta \\bar X + \\theta \\mathbb E [X] \\right) = \\newline\n", + "&= \\text{Var} \\left( Y_1 - \\theta X \\right) / n = \\newline\n", + "&= \\Big( \\text{Var} (Y_1) + \\theta^2 \\text{Var} (X) - 2 \\theta \\text{Cov} (X,Y) \\Big) / n\n", + "\\end{aligned}$$\n", + "\n", + "$$\n", + "\\theta^* = \\frac{\\text{Cov} (X,Y)}{\\text{Var} (X)}\n", + "$$\n", + "\n", + "$$\n", + "\\text{Var} \\left( \\hat Y^{cuped}_{1} \\right) = \\text{Var} (\\bar Y) (1 - \\rho^2), \\text{where}\\;\\rho=corr(X,Y)\n", + "$$\n", + "\n", + "$$\n", + "\\begin{aligned} \n", + "\\widehat {ATE}^{cuped} &= \\hat Y^{cuped}_{1} (D=1) - \\hat Y^{cuped}_{1}(D=0) = \\newline &= \\big( \\bar Y_1 - \\theta \\bar X + \\theta \\mathbb E [X] \\ \\big| \\ D = 1 \\big) - \\big( \\bar Y_1 - \\theta \\bar X + \\theta \\mathbb E [X] \\ \\big| \\ D = 0 \\big) = \\newline &= \\big( \\bar Y_1 - \\theta \\bar X \\ \\big| \\ D = 1 \\big) - \\big( \\bar Y_1 - \\theta \\bar X \\ \\big| \\ D = 0 \\big) \\end{aligned}\n", + "$$\n", + "\n", + "$$\n", + "\\hat Y_{cuped,1} = \\bar Y_1 - \\theta \\bar X\n", + 
"$$" + ] + }, + { + "cell_type": "code", + "execution_count": 302, + "id": "bb64a84e", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import namedtuple\n", + "ExperimentComparisonResults = namedtuple('ExperimentComparisonResults', \n", + " ['pvalue', 'effect', 'ci_length', 'left_bound', 'right_bound'])" + ] + }, + { + "cell_type": "markdown", + "id": "4afb0b8c", + "metadata": {}, + "source": [ + "#### Absolute" + ] + }, + { + "cell_type": "code", + "execution_count": 303, + "id": "bf1a2375", + "metadata": {}, + "outputs": [], + "source": [ + "import scipy.stats as ss\n", + "from typing import Union\n", + "\n", + "def absolute_ttest(\n", + " control: Union[pd.DataFrame, pd.Series], \n", + " test: Union[pd.DataFrame, pd.Series], \n", + " control_before: Union[pd.DataFrame, pd.Series], \n", + " test_before: Union[pd.DataFrame, pd.Series]\n", + "):\n", + " \"\"\"Counts CUPED (Controlled-Experiment using Pre-Experiment Data) in absolute values.\n", + " \n", + " Args:\n", + " control: target data after pilot in control group\n", + " test: target data after pilot in test group\n", + " \n", + " Returns:\n", + " result: named tuple with pvalue, effect, ci_length, left_bound and right_bound\n", + " \"\"\"\n", + " theta = (np.cov(control, control_before)[0, 1] + np.cov(test, test_before)[0, 1]) \\\n", + " / (np.var(control_before) + np.var(test_before))\n", + "\n", + " control = control - theta * control_before\n", + " test = test - theta * test_before\n", + " \n", + " mean_control = np.mean(control)\n", + " mean_test = np.mean(test)\n", + " var_mean_control = np.var(control) / len(control)\n", + " var_mean_test = np.var(test) / len(test)\n", + " \n", + " difference_mean = mean_test - mean_control\n", + " difference_mean_var = var_mean_control + var_mean_test\n", + " difference_distribution = ss.norm(loc=difference_mean, scale=np.sqrt(difference_mean_var))\n", + "\n", + " left_bound, right_bound = difference_distribution.ppf([0.025, 0.975])\n", + " 
ci_length = (right_bound - left_bound)\n", + " pvalue = 2 * min(difference_distribution.cdf(0), difference_distribution.sf(0))\n", + " effect = difference_mean\n", + " result = ExperimentComparisonResults(pvalue, effect, ci_length, left_bound, right_bound)\n", + "\n", + " return result\n", + "\n", + "# https://github.com/DimaLunin/AB_lifehacks/blob/8fbbcc6a82440bdb593ac670994f1f8c7a5ef2f3/CUPED.ipynb#L57" + ] + }, + { + "cell_type": "code", + "execution_count": 304, + "id": "ac70f2bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ExperimentComparisonResults(pvalue=1.0, effect=0.0, ci_length=2.973441563457471, left_bound=-1.4867207817287356, right_bound=1.4867207817287353)" + ] + }, + "execution_count": 304, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "absolute_ttest(\n", + " data[data['group']=='control']['post_spends'],\n", + " data[data['group']=='test']['post_spends'],\n", + " data[data['group']=='control']['pre_spends'],\n", + " data[data['group']=='test']['pre_spends']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "050ead63", + "metadata": {}, + "source": [ + "#### Relative" + ] + }, + { + "cell_type": "code", + "execution_count": 305, + "id": "d852dca8", + "metadata": {}, + "outputs": [], + "source": [ + "def relative_cuped(\n", + " control: Union[pd.DataFrame, pd.Series], \n", + " test: Union[pd.DataFrame, pd.Series], \n", + " control_before: Union[pd.DataFrame, pd.Series], \n", + " test_before: Union[pd.DataFrame, pd.Series]\n", + "):\n", + " theta = (np.cov(control, control_before)[0, 1] + np.cov(test, test_before)[0, 1]) /\\\n", + " (np.var(control_before) + np.var(test_before))\n", + "\n", + " control_cup = control - theta * control_before\n", + " test_cup = test - theta * test_before\n", + "\n", + " mean_den = np.mean(control)\n", + " mean_num = np.mean(test_cup) - np.mean(control_cup)\n", + " var_mean_den = np.var(control) / len(control)\n", + " var_mean_num = np.var(test_cup) / 
len(test_cup) + np.var(control_cup) / len(control_cup)\n", + "\n", + " cov = -np.cov(control_cup, control)[0, 1] / len(control)\n", + "\n", + " relative_mu = mean_num / mean_den\n", + " relative_var = var_mean_num / (mean_den ** 2) + var_mean_den * ((mean_num ** 2) / (mean_den ** 4))\\\n", + " - 2 * (mean_num / (mean_den ** 3)) * cov\n", + " \n", + " relative_distribution = ss.norm(loc=relative_mu, scale=np.sqrt(relative_var))\n", + " left_bound, right_bound = relative_distribution.ppf([0.025, 0.975])\n", + " \n", + " ci_length = (right_bound - left_bound)\n", + " pvalue = 2 * min(relative_distribution.cdf(0), relative_distribution.sf(0))\n", + " effect = relative_mu\n", + " return ExperimentComparisonResults(pvalue, effect, ci_length, left_bound, right_bound)" + ] + }, + { + "cell_type": "code", + "execution_count": 306, + "id": "91ae9bfe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ExperimentComparisonResults(pvalue=1.0, effect=0.0, ci_length=0.006949761458391225, left_bound=-0.0034748807291956124, right_bound=0.003474880729195612)" + ] + }, + "execution_count": 306, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "relative_cuped(\n", + " data[data['group']=='control']['post_spends'],\n", + " data[data['group']=='test']['post_spends'],\n", + " data[data['group']=='control']['pre_spends'],\n", + " data[data['group']=='test']['pre_spends']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8d6c9119", + "metadata": {}, + "source": [ + "#### Diff in diff" + ] + }, + { + "cell_type": "markdown", + "id": "005d82a1", + "metadata": {}, + "source": [ + "$$\\widehat {ATE}^{DiD} = \\big( \\bar Y_{t=1, d=1} - \\bar Y_{t=1, d=0} \\big) - \\big( \\bar Y_{t=0, d=1} - \\bar Y_{t=0, d=0} \\big)$$" + ] + }, + { + "cell_type": "code", + "execution_count": 307, + "id": "f41e9626", + "metadata": {}, + "outputs": [], + "source": [ + "def diff_in_diff(data, target_col, target_col_before, groups_col):\n", + " \"\"\"Counts 
Difference in Difference.\n", + " \n", + " Metric uses pre-post analisys and counts differece in means in data before and after pilot:\n", + " ATE = (y_test_after - y_control_after) - (y_test_before - y_control_before)\n", + " \n", + " Args:\n", + " data: input data\n", + " target_col: column name of target after pilot\n", + " target_col_before: column name of target before pilot\n", + " groups_col: name of column with separation on test and control group\n", + " should contain values ('test', 'control')\n", + " \n", + " Returns:\n", + " did: value of difference in difference\n", + " \"\"\"\n", + " mean_test = np.mean(data[data[groups_col]=='test'][target_col]) \n", + " mean_control = np.mean(data[data[groups_col]=='control'][target_col])\n", + " \n", + " mean_test_before = np.mean(data[data[groups_col]=='test'][target_col_before]) \n", + " mean_control_before = np.mean(data[data[groups_col]=='control'][target_col_before])\n", + " did = (mean_test - mean_control) - (mean_test_before-mean_control_before)\n", + " \n", + " return did" + ] + }, + { + "cell_type": "code", + "execution_count": 308, + "id": "3df6d620", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 308, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diff_in_diff(data, 'post_spends', 'pre_spends', 'group')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cd8835c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 45d9e5542e7c70e56fb663a71672f6f0f8806c67 Mon Sep 
17 00:00:00 2001 From: 20810012 Date: Wed, 27 Sep 2023 10:35:05 +0300 Subject: [PATCH 14/20] AB and ab_test added --- .../addons/hypex/ABTesting/ab_tester.py | 218 ++++---- lightautoml/addons/hypex/ab_spliter.py | 474 ------------------ .../{tests_abtester.py => test_aa_test.py} | 1 + .../addons/hypex/tests/tests_ab_test.py | 58 +++ 4 files changed, 163 insertions(+), 588 deletions(-) delete mode 100644 lightautoml/addons/hypex/ab_spliter.py rename lightautoml/addons/hypex/tests/{tests_abtester.py => test_aa_test.py} (99%) create mode 100644 lightautoml/addons/hypex/tests/tests_ab_test.py diff --git a/lightautoml/addons/hypex/ABTesting/ab_tester.py b/lightautoml/addons/hypex/ABTesting/ab_tester.py index 5e34dfdf..e76aac08 100644 --- a/lightautoml/addons/hypex/ABTesting/ab_tester.py +++ b/lightautoml/addons/hypex/ABTesting/ab_tester.py @@ -9,7 +9,7 @@ import pandas as pd import numpy as np import scipy.stats as stats -from scipy.stats import norm, ttest_ind, ks_2samp +from scipy.stats import norm, ttest_ind, ks_2samp, mannwhitneyu RANDOM_STATE = 52 @@ -324,116 +324,106 @@ def search_dist_uniform_sampling( return pd.DataFrame(results), data_from_sampling -# class ABTest: -# """Calculates metrics - MDE, ATE and p_value.""" -# -# DEFAULT_FORMAT_MAPPING = { -# "rs": "random state", -# "mde": "MDE", -# "sample_size": "Size of test sample", -# "a_len": "Size of target group", -# "b_len": "Size of control group", -# "a_mean": "Mean of target group", -# "b_mean": "Mean of control group", -# } -# -# def __init__( -# self, target_field: str, reliability: float = 0.95, power: float = 0.8, mde: float = None, -# ): -# """ -# Args: -# splitter: -# Class of divider on A and B groups -# target_field: -# Field with target values -# reliability: -# Level of statistical reliability, usually equals 0.95 -# power: -# Statistical criterion power, usually equals 0.8 -# mde: -# Calculated mde (minimal detected effect), -# if none - calculates inside -# """ -# self.test = test_data -# 
self.control = control_data -# self.target_field = target_field -# self.reliability = reliability -# self.power = power -# self.mde = mde -# -# def sampling_test( -# self, data: pd.DataFrame, experiments = None, random_state: int = None, -# ) -> Dict: -# """Test on specific sample. -# -# Args: -# data: Input data -# experiments: Experiment or set of experiments applied on sample -# random_state: Seed of random -# -# Returns: -# result: Test results -# """ -# -# # split = self.splitter.split_ab(data, random_state) -# # if isinstance(experiments, ABExperiment): -# # experiments = [experiments] -# -# mde = self.mde or calc_mde( -# data.loc[self.test, self.target_field], -# data.loc[self.control, self.target_field], -# reliability=self.reliability, -# power=self.power, -# ) -# sample_size = calc_sample_size( -# data.loc[self.test, self.target_field], -# data.loc[self.control, self.target_field], -# mde, -# significance=(1 - self.reliability), -# power=self.power, -# ) -# -# result = { -# "rs": random_state, -# "mde": mde, -# "sample_size": sample_size, -# "a_len": len(self.test), -# "b_len": len(self.control), -# "a_mean": data.loc[self.test, self.target_field].mean(), -# "b_mean": data.loc[self.control, self.target_field].mean(), -# } -# # включить класс из ноута -# for e in experiments: #: как считается эффект написано в эксперименте, перенести в calc_effect -# """ -# сделать разницу средних в наследнике класса (новый класс создать) -# на альфе в к7м ABTesting, IncraceExperiment -# передается эксперимент, (надо встроить эксперимент сюда) -# целевая картинка - передать данные и получить результат -# сейчас надо вшить эксперимент из ноутбука сюда -# """ -# result[f"effect {e.label}"] = e.calc_effect( -# data.loc[split["test"]], data.loc[split["control"]], self.target_field -# ) -# -# return result -# -# def format_stat( -# self, stat: pd.DataFrame, experiments = None, rename_map: Dict = None, -# ): -# """Corrects format of output statistics. 
-# -# Args: -# stat: Experiment statistics -# experiments: Set of experiments applied on sample -# rename_map: Mapping of renaming fields -# -# Returns: -# result: Formatted values -# """ -# rename_map = rename_map or self.DEFAULT_FORMAT_MAPPING -# -# rename_map.update({f"effect {e.label}": f"Effect {e.label}" for e in experiments}) -# -# result = stat.rename(columns=rename_map) -# result = result.applymap(lambda x: f"{x:,.2f}") -# return result +class ABTest: + def __init__( + self, + calc_difference_method: str = "all", + calc_p_value_method: str = "all", + ): + """ + Initializes the ABTest class. + Parameters: + calc_difference_method (str, optional): The method used to calculate the difference. Defaults to 'all'. + calc_p_value_method (str, optional): The method used to calculate the p-value. Defaults to 'all'. + """ + self.calc_difference_method = calc_difference_method + self.calc_p_value_method = calc_p_value_method + + def split_ab(self, data: pd.DataFrame, group_field: str) -> Dict[str, pd.DataFrame]: + """ + Splits a pandas DataFrame into two separate dataframes based on a specified group field. + + Parameters: + data (pd.DataFrame): The input dataframe to be split. + group_field (str): The column name representing the group field. + + Returns: + dict: A dictionary containing two dataframes, 'test' and 'control', where 'test' contains rows where the group field is 'test', and 'control' contains rows where the group field is 'control'. + """ + return { + "test": data[data[group_field] == "test"], + "control": data[data[group_field] == "control"], + } + + def calc_difference( + self, splitted_data: Dict[str, pd.DataFrame], target_field: str + ) -> Dict[str, float]: + """ + Calculates the difference between the target field values of the 'test' and 'control' dataframes. + + Parameters: + splitted_data (Dict[str, pd.DataFrame]): A dictionary containing the 'test' and 'control' dataframes. + target_field (str): The name of the target field. 
+ + Returns: + result (Dict[str, float]): A dictionary containing the difference between the target field values of the 'test' and 'control' dataframes. + """ + result = {} + if self.calc_difference_method in {"all", "ate"}: + result["ate"] = ( + splitted_data["test"][target_field] + - splitted_data["control"][target_field] + ).mean() + return result + + def calc_p_value( + self, splitted_data: Dict[str, pd.DataFrame], target_field: str + ) -> Dict[str, float]: + """ + Calculates the p-value for a given data set. + + Args: + splitted_data (Dict[str, pd.DataFrame]): A dictionary containing the split data, where the keys are 'test' and 'control' and the values are pandas DataFrames. + target_field (str): The name of the target field. + Returns: + Dict[str, float]: A dictionary containing the calculated p-values, where the keys are 't_test' and 'mann_whitney' and the values are the corresponding p-values. + """ + result = {} + if self.calc_p_value_method in {"all", "t_test"}: + result["t_test"] = ttest_ind( + splitted_data["test"][target_field], + splitted_data["control"][target_field], + ).pvalue + if self.calc_p_value_method in {"all", "mann_whitney"}: + result["mann_whitney"] = mannwhitneyu( + splitted_data["test"][target_field], + splitted_data["control"][target_field], + ).pvalue + return result + + def execute( + self, data: pd.DataFrame, target_field: str, group_field: str + ) -> Dict[str, Dict[str, float]]: + """ + Executes the function by splitting the input data based on the group field and calculating the size, difference, and p-value. + + Parameters: + data (pd.DataFrame): The input data as a pandas DataFrame. + target_field (str): The target field to be analyzed. + group_field (str): The field used to split the data into groups. + + Returns: + Dict[str, Dict[str, float]]: A dictionary containing the size, difference, and p-value of the split data. + - 'size': A dictionary with the sizes of the test and control groups. 
+ - 'difference': A dictionary with the calculated differences between the groups. + - 'p_value': A dictionary with the calculated p-values for each group. + """ + splitted_data = self.split_ab(data, group_field) + return { + "size": { + "test": len(splitted_data["test"]), + "control": len(splitted_data["control"]), + }, + "difference": self.calc_difference(splitted_data, target_field), + "p_value": self.calc_p_value(splitted_data, target_field), + } diff --git a/lightautoml/addons/hypex/ab_spliter.py b/lightautoml/addons/hypex/ab_spliter.py deleted file mode 100644 index 88e4c631..00000000 --- a/lightautoml/addons/hypex/ab_spliter.py +++ /dev/null @@ -1,474 +0,0 @@ -# v: 0.2.1a -import warnings -from abc import ABC, abstractmethod -from pathlib import Path -from sklearn.utils import shuffle -from typing import Iterable, Union, Optional, List, Dict - -from tqdm.auto import tqdm - -import pandas as pd -import numpy as np -import scipy.stats as stats -from scipy.stats import norm, ttest_ind, ks_2samp - - -# Методика рачета описана в Campaign Perfomance Management – Методика A/B-тестирования v.2.0 (стр.29) -def calc_mde(test_group: pd.Series, control_group: pd.Series, reliability: float = 0.95, power: float = 0.8,) -> float: - """Calculates MDE (Minimum Detectable Effect). 
- - MDE - minimal effect that can be statistically substantiated comparing the two groups - Calculation method is described in "Campaign Performance Management" - A/B-testing methodology v.2.0 (p.29) - - Args: - test_group: Target group - control_group: Control group - reliability: Level of statistical reliability, usually equals 0.95 - power: Statistical criterion power - - Returns: - mde: Minimum detectable effect - """ - m = stats.norm.ppf((1 + reliability) / 2) + stats.norm.ppf(power) - - n_test, n_control = len(test_group), len(control_group) - proportion = n_test / (n_test + n_control) - p = np.sqrt(1 / (proportion * (1 - proportion))) - - var_test, var_control = test_group.var(ddof=1), control_group.var(ddof=1) - s = np.sqrt((var_test / n_test) + (var_control / n_control)) - - mde = m * p * s - - return mde - - -def calc_sample_size( - test_group: pd.Series, control_group: pd.Series, mde: Union[Iterable[float], float], significance: float = 0.05, power: float = 0.8, -) -> float: - """Calculates minimal required number of test objects for test in the general group. 
- - Calculation method is described in "Campaign Performance Management" - A/B-testing methodology v.2.0 (p.14) - - Args: - test_group: Target group - control_group: Control group - mde: Minimal detectable effect - significance: Statistical significance level - type I error probability (usually 0.05) - power: Statistical criterion power - - Returns: - min_sample_size: Minimal size of the general group - """ - if isinstance(mde, Iterable): - z_alpha = norm.ppf((2 - significance) / 2) - z_betta = norm.ppf(power) - - p1 = mde[0] - p2 = mde[1] - - min_sample_size = (z_alpha + z_betta) ** 2 * (p1 * (1 - p1) + p2 * (1 - p2)) / (p1 - p2) ** 2 - else: - test_std = test_group.std() - control_std = control_group.std() - - test_proportion = len(test_group) / (len(test_group) + len(control_group)) - control_proportion = 1 - test_proportion - - d = ((norm.ppf(1 - significance / 2) + norm.ppf(power)) / mde) ** 2 - - s = test_std ** 2 / test_proportion + control_std ** 2 / control_proportion - - min_sample_size = d * s - - return min_sample_size - - -# --------------------- Classes --------------------- -class ABSplitter: - """Abstract class - divider on A and B groups.""" - - def __init__(self, mode: str = "simple", by_group: Union[str, Iterable[str]] = None, quant_field: str = None): - """ - - Args: - mode: - Regime to divide sample on A and B samples: - 'simple' - divides by groups, placing equal amount of records (clients | groups of clients) in samples - 'balanced' - divides by size of samples, placing full groups depending on the size of group to balance size of A and B. 
Can not be applied without groups - by_group: - Name of field(s) for division by groups - quant_field: - Name of field by which division should take in account common features besides groups - """ - self.mode = mode - self.by_group = by_group - self.quant_field = quant_field - - @staticmethod - def merge_groups( - test_group: Union[Iterable[pd.DataFrame], pd.DataFrame], - control_group: Union[Iterable[pd.DataFrame], pd.DataFrame], - ): - """Merges test and control groups in one DataFrame. - - Args: - test_group: Data of target group - control_group: Data of control group - - Returns: - merged_data: Concatted DataFrame - """ - if not (isinstance(test_group, pd.DataFrame) and isinstance(test_group, pd.DataFrame)): - test_group = pd.concat(test_group, ignore_index=True) - control_group = pd.concat(control_group, ignore_index=True) - - test_group.loc[:, "group"] = "test" - control_group.loc[:, "group"] = "control" - - merged_data = pd.concat([test_group, control_group], ignore_index=True) - - return merged_data - - def __simple_mode(self, data: pd.DataFrame, random_state: int = None): - """Separates data on A and B samples. 
- - Separation performed to divide groups of equal sizes - equal amount of records or equal amount of groups in each sample - - Args: - data: Input data - random_state: Seed of random - - Returns: - result: Test and control samples of indexes dictionary - """ - result = {"test_indexes": [], "control_indexes": []} - - if self.quant_field: - random_ids = shuffle(data[self.quant_field].unique(), random_state=random_state) - edge = len(random_ids) // 2 - result["test_indexes"] = list(data[data[self.quant_field].isin(random_ids[:edge])].index) - result["control_indexes"] = list(data[data[self.quant_field].isin(random_ids[edge:])].index) - - else: - addition_indexes = list(shuffle(data.index, random_state=random_state)) - edge = len(addition_indexes) // 2 - result["test_indexes"] = addition_indexes[:edge] - result["control_indexes"] = addition_indexes[edge:] - - return result - - def split_ab(self, data: pd.DataFrame, random_state: int = None) -> Dict: - """Divides sample on two groups. - - Args: - data: input data - random_state: one integer to fix split - - Returns: - result: dict of indexes with division on test and control group - """ - result = {"test_indexes": [], "control_indexes": []} - - if self.by_group: - groups = data.groupby() - for _, gd in groups: - if self.mode not in ("balanced", "simple"): - warnings.warn( - f"The mode '{self.mode}' is not supported for group division. " - f"Implemented mode 'simple'." 
- ) - self.mode = "simple" - - if self.mode == "simple": - t_result = self.__simple_mode(gd, random_state) - result["test_indexes"] += t_result["test_indexes"] - result["control_indexes"] += t_result["control_indexes"] - - elif self.mode == "balanced": - if self.quant_field: - random_ids = shuffle(gd[self.quant_field].unique(), random_state=random_state) - addition_indexes = list(gd[gd[self.quant_field].isin(random_ids)].index) - else: - addition_indexes = list(shuffle(gd.index, random_state=random_state)) - - if len(result["control_indexes"]) > len(result["test_indexes"]): - result["test_indexes"] += addition_indexes - else: - result["control_indexes"] += addition_indexes - - else: - if self.mode != "simple": - warnings.warn( - f"The mode '{self.mode}' is not supported for regular division. " - f"Implemented mode 'simple'." - ) - - t_result = self.__simple_mode(data, random_state) - result["test_indexes"] = t_result["test_indexes"] - result["control_indexes"] = t_result["control_indexes"] - - result["test_indexes"] = list(set(result["test_indexes"])) - result["control_indexes"] = list(set(result["test_indexes"])) - - return result - - def search_dist_uniform_sampling( - self, - data: pd.DataFrame, - target_fields: Union[List[str], str], - n: int = None, - random_states: Iterable[int] = None, - alpha: float = 0.05, - file_name: Union[Path, str] = None, - write_mode: str = "full", - write_step: int = 10, - pbar: bool = True, - ) -> Optional[pd.DataFrame]: - """Chooses random_state for finding homogeneous distribution. 
- - Args: - data: - Input data - target_fields: - Field with target value - n: - Number of searching iterations - random_states: - Random states from searching (if given, n is ignoring) - alpha: - Threshold to check statistical hypothesis; usually 0.05 - file_name: - Name of file to save results (if None - no results will be saved, func returns result) - write_mode: - Mode to write: - 'full' - save all experiments - 'all' - save experiments that passed all statistical tests - 'any' - save experiments that passed any statistical test - write_step: - Step to write experiments to file - pbar: - Flag to show progress bar - - Returns: - results: - If no saving (no file_name, no write mode and no write_step) returns dataframe - else None and saves file to csv - """ - if random_states is None and n: - random_states = range(n) - - results = [] - - if write_mode not in ("full", "all", "any"): - warnings.warn( - f"Write mode '{write_mode}' is not supported. Mode 'full' will be used" - ) - write_mode = "full" - - if isinstance(target_fields, str): - target_fields = [target_fields] - - for i, random_state in tqdm(enumerate(random_states), total=len(random_states), display=pbar): - split = self.split_ab(data, random_state) - t_result = {"random_state": random_state} - a = data.loc[split["test_indexes"]] - b = data.loc[split["control_indexes"]] - scores = [] - passed = [] - - for tf in target_fields: - ta = a[tf] - tb = b[tf] - - t_result[f"{tf} a mean"] = ta.mean() - t_result[f"{tf} b mean"] = tb.mean() - t_result[f"{tf} ab mran delta %"] = (1 - t_result[f"{tf} a mean"] / t_result[f"{tf} b mean"]) * 100 - t_result[f"{tf} t_test p_value"] = ttest_ind(ta, tb).pvalue - t_result[f"{tf} ks_test p_value"] = ks_2samp(ta, tb).pvalue - t_result[f"{tf} t_test passed"] = t_result[f"{tf} t_test p_value"] > alpha - t_result[f"{tf} ks_test passed"] = t_result[f"{tf} ks_test p_value"] > alpha - scores.append((t_result[f"{tf} t_test p_value"] + t_result[f"{tf} ks_test p_value"]) / 2) - passed 
+= [t_result[f"{tf} t_test passed"], t_result[f"{tf} ks_test passed"]] - - t_result["score"] = np.mean(scores) - - if write_mode == "all" and all(passed): - results.append(t_result) - if write_mode == "any" and any(passed): - results.append(t_result) - if write_mode == "full": - results.append(t_result) - - if file_name and write_step: - if i == write_step: - pd.DataFrame(results).to_csv(file_name, index=False) - elif i % write_step == 0: - pd.DataFrame(results).to_csv(file_name, index=False, header=False, mode="a") - results = [] - - if file_name and write_step: - pd.DataFrame(results).to_csv(file_name, index=False, header=False, mode="a") - elif file_name: - results = pd.DataFrame(results) - results.to_csv(file_name, index=False) - return results - else: - return pd.DataFrame(results) - - -class ABExperiment(ABC): - """Abstract class of A/B experiment.""" - - def __init__(self, label: str): - self.label = label - - @abstractmethod - def calc_effect(self, test_data: pd.DataFrame, control_data: pd.DataFrame, target_field: str) -> float: - pass - - -class ABTester: - """Separates data to homogeneous groups and calculate metrics - sizes of groups and MDE.""" - - DEFAULT_FORMAT_MAPPING = { - "rs": "random state", - "mde": "MDE", - "sample_size": "Size of test sample", - "a_len": "Size of target group", - "b_len": "Size of control group", - "a_mean": "Mean of target group", - "b_mean": "Mean of control group", - } - - def __init__( - self, splitter: ABSplitter, target_field: str, reliability: float = 0.95, power: float = 0.8, mde: float = None, - ): - """ - Args: - splitter: - Class of divider on A and B groups - target_field: - Field with target values - reliability: - Level of statistical reliability, usually equals 0.95 - power: - Statistical criterion power, usually equals 0.8 - mde: - Calculated mde (minimal detected effect), - if none - calculates inside - """ - self.splitter = splitter - self.target_field = target_field - self.reliability = reliability - 
self.power = power - self.mde = mde - - def sampling_test( - self, data: pd.DataFrame, experiments: Union[ABExperiment, Iterable[ABExperiment]], random_state: int = None, - ) -> Dict: - """Test on specific sample. - - Args: - data: Input data - experiments: Experiment or set of experiments applied on sample - random_state: Seed of random - - Returns: - result: Test results - """ - - split = self.splitter.split_ab(data, random_state) - if isinstance(experiments, ABExperiment): - experiments = [experiments] - - mde = self.mde or calc_mde( - data.loc[split["test"], self.target_field], - data.loc[split["control"], self.target_field], - reliability=self.reliability, - power=self.power, - ) - sample_size = calc_sample_size( - data.loc[split["test"], self.target_field], - data.loc[split["control"], self.target_field], - mde, - significance=(1 - self.reliability), - power=self.power, - ) - - result = { - "rs": random_state, - "mde": mde, - "sample_size": sample_size, - "a_len": len(split["test"]), - "b_len": len(split["control"]), - "a_mean": data.loc[split["test"], self.target_field].mean(), - "b_mean": data.loc[split["control"], self.target_field].mean(), - } - - for e in experiments: # как считается эффект написано в эксперименте, перенести в calc_effect - """ - сделать разницу средних в наследнике класса (новый класс создать) - на альфе в к7м ABTesting, IncraceExperiment - передается эксперимент, (надо встроить эксперимент сюда) - целевая картинка - передать данные и получить результат - сейчас надо вшить эксперимент из ноутбука сюда - """ - result[f"effect {e.label}"] = e.calc_effect( - data.loc[split["test"]], data.loc[split["control"]], self.target_field - ) - - return result - - def multisampling_test( - self, - data: pd.DataFrame, - experiments: Union[ABExperiment, Iterable[ABExperiment]], - random_states: Iterable[int], - pbar: bool = False, - ) -> tuple[pd.DataFrame, pd.DataFrame]: - """Implements multiple experiments on random states. 
- - Args: - data: - Input data - experiments: - Set of experiments applied on sample - random_states: - Seeds of random - pbar: - Flag to show progress bar - - Returns: - results: - Experiment test results - statistics: - Description statistics - """ - - results = pd.DataFrame([self.sampling_test(data, experiments, rs) for rs in tqdm(random_states, display=pbar)]) - - statistics = results.describe() - statistics.loc["cv %"] = (statistics.loc["std"] / statistics.loc["mean"] * 100).round(2) - return results, statistics - - def format_stat( - self, stat: pd.DataFrame, experiments: Union[ABExperiment, Iterable[ABExperiment]], rename_map: Dict = None, - ): - """Corrects format of output statistics. - - Args: - stat: Experiment statistics - experiments: Set of experiments applied on sample - rename_map: Mapping of renaming fields - - Returns: - result: Formatted values - """ - rename_map = rename_map or self.DEFAULT_FORMAT_MAPPING - - rename_map.update({f"effect {e.label}": f"Effect {e.label}" for e in experiments}) - - result = stat.rename(columns=rename_map) - result = result.applymap(lambda x: f"{x:,.2f}") - return result diff --git a/lightautoml/addons/hypex/tests/tests_abtester.py b/lightautoml/addons/hypex/tests/test_aa_test.py similarity index 99% rename from lightautoml/addons/hypex/tests/tests_abtester.py rename to lightautoml/addons/hypex/tests/test_aa_test.py index e8dd5d46..f00c3d9d 100644 --- a/lightautoml/addons/hypex/tests/tests_abtester.py +++ b/lightautoml/addons/hypex/tests/test_aa_test.py @@ -76,3 +76,4 @@ def test_aa_quantfields(): assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), "Columns in the result are not " \ "the same as columns in initial " \ "data " + diff --git a/lightautoml/addons/hypex/tests/tests_ab_test.py b/lightautoml/addons/hypex/tests/tests_ab_test.py new file mode 100644 index 00000000..766631db --- /dev/null +++ b/lightautoml/addons/hypex/tests/tests_ab_test.py @@ -0,0 +1,58 @@ +import pytest +import 
pandas as pd +from lightautoml.addons.hypex.ABTesting.ab_tester import ABTest + + +@pytest.fixture +def ab_test(): + return ABTest() + + +@pytest.fixture +def data(): + return pd.DataFrame( + {"group": ["test", "test", "control", "control"], "value": [1, 2, 3, 4]} + ) + + +@pytest.fixture +def target_field(): + return "value" + + +@pytest.fixture +def group_field(): + return "group" + + +def test_split_ab(ab_test, data, group_field): + expected_result = { + "test": pd.DataFrame({"group": ["test", "test"], "value": [1, 2]}), + "control": pd.DataFrame({"group": ["control", "control"], "value": [3, 4]}), + } + result = ab_test.split_ab(data, group_field) + assert result == expected_result + + +def test_calc_difference(ab_test, data, group_field, target_field): + splitted_data = ab_test.split_ab(data, group_field) + expected_result = {"ate": -1.0} + result = ab_test.calc_difference(splitted_data, target_field) + assert result == expected_result + + +def test_calc_p_value(ab_test, data, group_field, target_field): + splitted_data = ab_test.split_ab(data, group_field) + expected_result = {"t_test": 0.5714285714285714, "mann_whitney": 0.3333333333333333} + result = ab_test.calc_p_value(splitted_data, target_field) + assert result == expected_result + + +def test_execute(ab_test, data, group_field, target_field): + expected_result = { + "size": {"test": 2, "control": 2}, + "difference": {"ate": -1.0}, + "p_value": {"t_test": 0.5714285714285714, "mann_whitney": 0.3333333333333333}, + } + result = ab_test.execute(data, target_field, group_field) + assert result == expected_result From 6aab11b4f16ed3b9e2bada788d9ae0b62ef83ae3 Mon Sep 17 00:00:00 2001 From: 20810012 Date: Wed, 27 Sep 2023 13:03:28 +0300 Subject: [PATCH 15/20] metrics diff_in_diff and cuped added --- .../addons/hypex/ABTesting/ab_tester.py | 180 ++++++++++++++++-- .../tests/{test_aa_test.py => test_aa.py} | 5 +- .../tests/{tests_ab_test.py => test_ab.py} | 0 3 files changed, 161 insertions(+), 24 deletions(-) 
rename lightautoml/addons/hypex/tests/{test_aa_test.py => test_aa.py} (93%) rename lightautoml/addons/hypex/tests/{tests_ab_test.py => test_ab.py} (100%) diff --git a/lightautoml/addons/hypex/ABTesting/ab_tester.py b/lightautoml/addons/hypex/ABTesting/ab_tester.py index e76aac08..7aa3bc04 100644 --- a/lightautoml/addons/hypex/ABTesting/ab_tester.py +++ b/lightautoml/addons/hypex/ABTesting/ab_tester.py @@ -1,4 +1,5 @@ import warnings +from collections import namedtuple from abc import ABC, abstractmethod from pathlib import Path from sklearn.utils import shuffle @@ -8,11 +9,19 @@ import pandas as pd import numpy as np -import scipy.stats as stats -from scipy.stats import norm, ttest_ind, ks_2samp, mannwhitneyu +from scipy.stats import ttest_ind, ks_2samp, mannwhitneyu RANDOM_STATE = 52 +ExperimentComparisonResults_cuped = namedtuple( + 'CUPED', + ['pvalue', 'effect', 'ci_length', 'left_bound', 'right_bound'] +) +ExperimentComparisonResults_cuped_abs = namedtuple( + 'CUPED_abs', + ['pvalue', 'effect', 'ci_length', 'left_bound', 'right_bound'] +) + def merge_groups( test_group: Union[Iterable[pd.DataFrame], pd.DataFrame], control_group: Union[Iterable[pd.DataFrame], pd.DataFrame], @@ -41,6 +50,97 @@ def merge_groups( return merged_data +def cuped( + test_data: pd.DataFrame, + control_data: pd.DataFrame, + target_field: str, + target_field_before: str +): + """Counts CUPED (Controlled-Experiment using Pre-Experiment Data) in absolute values. 
+ + Metric uses pre-post analysis of target, uses to minimize variance of effect: + ATE = mean(test_cuped) - mean(control_cuped) + , where + test_cuped = target__test - theta * target_before__test + control_cuped = target__control - theta * target_before__control + , where + theta = (cov_test + cov_control) / (var_test + var_control) + , where + cov_test = cov(target__test, target_before__test) + cov_control = cov(target__control, target_before__control) + var_test = var(target_before__test) + var_control = var(target_before__control) + + Args: + test_data: input data of test group + Should include target before and after pilot + control_data: input data of control group + Should include target before and after pilot + target_field: column name of target after pilot + target_field_before: column name of target before pilot + + Returns: + result: named tuple with pvalue, effect, ci_length, left_bound and right_bound + """ + control = control_data[target_field] + control_before = control_data[target_field_before] + test = test_data[target_field] + test_before = test_data[target_field_before] + + theta = (np.cov(control, control_before)[0, 1] + np.cov(test, test_before)[0, 1]) \ + / (np.var(control_before) + np.var(test_before)) + + control = control - theta * control_before + test = test - theta * test_before + + mean_control = np.mean(control) + mean_test = np.mean(test) + var_mean_control = np.var(control) / len(control) + var_mean_test = np.var(test) / len(test) + + difference_mean = mean_test - mean_control + difference_mean_var = var_mean_control + var_mean_test + difference_distribution = ss.norm(loc=difference_mean, scale=np.sqrt(difference_mean_var)) + + left_bound, right_bound = difference_distribution.ppf([0.025, 0.975]) + ci_length = (right_bound - left_bound) + pvalue = 2 * min(difference_distribution.cdf(0), difference_distribution.sf(0)) + effect = difference_mean + result = ExperimentComparisonResults_cuped(pvalue, effect, ci_length, left_bound, 
right_bound) + + return result + + +def diff_in_diff( + test_data: pd.DataFrame, + control_data: pd.DataFrame, + target_field: str, + target_field_before: str +): + """Counts Difference in Difference. + + Metric uses pre-post analysis and counts difference in means in data before and after pilot: + ATE = (y_test_after - y_control_after) - (y_test_before - y_control_before) + + Args: + test_data: input data of test group + control_data: input data of control group + target_field: column name of target after pilot + target_field_before: column name of target before pilot + + Returns: + did: value of difference in difference + """ + mean_test = np.mean(test_data[target_field]) + mean_control = np.mean(control_data[target_field]) + + mean_test_before = np.mean(test_data[target_field_before]) + mean_control_before = np.mean(control_data[target_field_before]) + did = (mean_test - mean_control) - (mean_test_before - mean_control_before) + + return did + + class AATest: def __init__( self, @@ -140,7 +240,7 @@ def __simple_mode(self, data: pd.DataFrame, random_state: int = RANDOM_STATE): return result - def split_ab(self, random_state: int = RANDOM_STATE) -> Dict: + def split(self, random_state: int = RANDOM_STATE) -> Dict: """Divides sample on two groups. Args: @@ -225,7 +325,7 @@ def sampling_metrics(self, alpha: float = 0.05, random_state: int = RANDOM_STATE scores = [] t_result = {"random_state": random_state} - split = self.split_ab(random_state) + split = self.split(random_state) a = self.data.loc[split["test_indexes"]] b = self.data.loc[split["control_indexes"]] @@ -291,7 +391,7 @@ def search_dist_uniform_sampling( warnings.warn(f"Write mode '{write_mode}' is not supported. 
Mode 'full' will be used") write_mode = "full" - for i, rs in tqdm(enumerate(random_states), total=len(random_states)):#, display=pbar): + for i, rs in tqdm(enumerate(random_states), total=len(random_states), disable=not pbar): res = self.sampling_metrics(alpha=alpha, random_state=rs) data_from_sampling.update(res["data_from_experiment"]) @@ -330,50 +430,88 @@ def __init__( calc_difference_method: str = "all", calc_p_value_method: str = "all", ): - """ - Initializes the ABTest class. - Parameters: - calc_difference_method (str, optional): The method used to calculate the difference. Defaults to 'all'. - calc_p_value_method (str, optional): The method used to calculate the p-value. Defaults to 'all'. + """Initializes the ABTest class. + + Args: + calc_difference_method: + The method used to calculate the difference: + 'all' [default] - all metrics + 'ate' - basic difference in means of targets in test and control group + 'diff_in_diff' - difference in difference value, + performs pre-post analysis (required values of target before pilot) + 'cuped' - Controlled-Experiment using Pre-Experiment Data value, + performs pre-post analysis (required values of target before pilot) + calc_p_value_method: + The method used to calculate the p-value. Defaults to 'all'. """ self.calc_difference_method = calc_difference_method self.calc_p_value_method = calc_p_value_method - def split_ab(self, data: pd.DataFrame, group_field: str) -> Dict[str, pd.DataFrame]: + @staticmethod + def split_ab(data: pd.DataFrame, group_field: str) -> Dict[str, pd.DataFrame]: """ Splits a pandas DataFrame into two separate dataframes based on a specified group field. Parameters: - data (pd.DataFrame): The input dataframe to be split. - group_field (str): The column name representing the group field. + data: The input dataframe to be split. + group_field: The column name representing the group field. 
Returns: - dict: A dictionary containing two dataframes, 'test' and 'control', where 'test' contains rows where the group field is 'test', and 'control' contains rows where the group field is 'control'. + splitted_data: + A dictionary containing two dataframes, 'test' and 'control', where 'test' contains rows where the + group field is 'test', and 'control' contains rows where the group field is 'control'. """ - return { + splitted_data = { "test": data[data[group_field] == "test"], "control": data[data[group_field] == "control"], } + return splitted_data + def calc_difference( - self, splitted_data: Dict[str, pd.DataFrame], target_field: str + self, + splitted_data: Dict[str, pd.DataFrame], + target_field: str, + target_field_before: str = None ) -> Dict[str, float]: """ Calculates the difference between the target field values of the 'test' and 'control' dataframes. Parameters: - splitted_data (Dict[str, pd.DataFrame]): A dictionary containing the 'test' and 'control' dataframes. - target_field (str): The name of the target field. + splitted_data: A dictionary containing the 'test' and 'control' dataframes + target_field: The name of the target field contains data after pilot + target_field_before: The name of the target field contains data before pilot Returns: - result (Dict[str, float]): A dictionary containing the difference between the target field values of the 'test' and 'control' dataframes. + result: A dictionary containing the difference between the target field + values of the 'test' and 'control' dataframes. 
""" result = {} + if self.calc_difference_method in {"all", "diff_in_diff", "cuped"}: + if target_field_before is None: + raise ValueError( + "For calculation metrics 'cuped' or 'diff_in_diff' field 'target_field_before' is required" + ) if self.calc_difference_method in {"all", "ate"}: result["ate"] = ( - splitted_data["test"][target_field] - - splitted_data["control"][target_field] + splitted_data["test"][target_field].values + - splitted_data["control"][target_field].values ).mean() + if self.calc_difference_method in {"all", "cuped"}: + result['cuped'] = cuped( + test_data=splitted_data["test"], + control_data=splitted_data["control"], + target_field=target_field, + target_field_before=target_field_before + ) + if self.calc_difference_method in {"all", "diff_in_diff"}: + result['diff_in_diff'] = diff_in_diff( + test_data=splitted_data["test"], + control_data=splitted_data["control"], + target_field=target_field, + target_field_before=target_field_before + ) + return result def calc_p_value( diff --git a/lightautoml/addons/hypex/tests/test_aa_test.py b/lightautoml/addons/hypex/tests/test_aa.py similarity index 93% rename from lightautoml/addons/hypex/tests/test_aa_test.py rename to lightautoml/addons/hypex/tests/test_aa.py index f00c3d9d..d93b97e9 100644 --- a/lightautoml/addons/hypex/tests/test_aa_test.py +++ b/lightautoml/addons/hypex/tests/test_aa.py @@ -21,9 +21,8 @@ def test_aa_simple(): assert info_col not in model.data, "Info_col is take part in experiment, it should be deleted in preprocess" assert isinstance(datas_dict, dict), "Result is not dict" assert len(datas_dict) == iterations, "# of dataframes is not equal # of iterations" - assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), "Columns in the result are not " \ - "the same as columns in initial " \ - "data " + assert all(data.columns) == all(datas_dict[0].drop(columns=['group']).columns), \ + "Columns in the result are not the same as columns in initial data " def 
test_aa_group(): diff --git a/lightautoml/addons/hypex/tests/tests_ab_test.py b/lightautoml/addons/hypex/tests/test_ab.py similarity index 100% rename from lightautoml/addons/hypex/tests/tests_ab_test.py rename to lightautoml/addons/hypex/tests/test_ab.py From c9856ced9e2d479c77004a87dcdc4012943c8c4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=93=D0=BB=D0=B0=D0=B7=D0=BE=D0=B2=D0=B0=20=D0=AF=D0=BD?= =?UTF-8?q?=D0=B0=20=D0=90=D0=BB=D0=B5=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?= =?UTF-8?q?=D0=BE=D0=B2=D0=BD=D0=B0?= Date: Wed, 27 Sep 2023 15:26:34 +0300 Subject: [PATCH 16/20] docstring fix --- lightautoml/addons/hypex/matcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lightautoml/addons/hypex/matcher.py b/lightautoml/addons/hypex/matcher.py index b0c7676c..10c19f8f 100644 --- a/lightautoml/addons/hypex/matcher.py +++ b/lightautoml/addons/hypex/matcher.py @@ -442,11 +442,11 @@ def validate_result( Validates estimated effect: 1) by replacing real treatment with random placebo treatment. - Estimated effect must be droped to zero, p-val < 0.05; + Estimated effect must be droped to zero, p-val > 0.05; 2) by adding random feature (`random_feature`). Estimated effect shouldn't change - significantly, p-val > 0.05; + significantly, p-val < 0.05; 3) estimates effect on subset of data (default fraction is 0.8). Estimated effect - shouldn't change significantly, p-val > 0.05. + shouldn't change significantly, p-val < 0.05. 
Args: refuter: From c190969bc2e9f79f15f3a7087fa81ca7ca558ec6 Mon Sep 17 00:00:00 2001 From: 20810012 Date: Wed, 27 Sep 2023 16:37:39 +0300 Subject: [PATCH 17/20] =?UTF-8?q?Tutorial=20finished,=20tests=20on=20AB=20?= =?UTF-8?q?is=20started=20-=20working,=20but=20we=20will=20add=20more=20te?= =?UTF-8?q?sts=20Bug=20-=20=D0=BA=D0=B2=D0=B0=D0=BD=D1=82=D0=B8=D0=B7?= =?UTF-8?q?=D0=B0=D1=86=D0=B8=D1=8F=20=D0=BD=D0=B5=20=D1=80=D0=B0=D0=B1?= =?UTF-8?q?=D0=BE=D1=82=D0=B0=D0=B5=D1=82=20-=20=D1=83=D0=B1=D1=80=D0=B0?= =?UTF-8?q?=D0=BB=D0=B8=20=D0=B5=D0=B5=20=D0=B8=D0=B7=20=D1=82=D1=83=D1=82?= =?UTF-8?q?=D0=BE=D1=80=D0=B8=D0=B0=D0=BB=D0=B0=20=D0=B8=20=D0=B2=20=D0=B1?= =?UTF-8?q?=D0=BB=D0=B8=D0=B6=D0=B0=D0=B9=D1=88=D0=B5=D0=B5=20=D0=B2=D1=80?= =?UTF-8?q?=D0=B5=D0=BC=D1=8F=20=D0=BF=D0=BE=D1=84=D0=B8=D0=BA=D1=81=D0=B8?= =?UTF-8?q?=D0=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Tutorial_13_ABtesting.ipynb | 1671 +++++++---------- .../addons/hypex/ABTesting/ab_tester.py | 339 ++-- lightautoml/addons/hypex/tests/test_ab.py | 121 +- .../hypex/utils/tutorial_data_creation.py | 3 + 4 files changed, 939 insertions(+), 1195 deletions(-) diff --git a/Tutorial_13_ABtesting.ipynb b/Tutorial_13_ABtesting.ipynb index b7bf39a2..fb568b9e 100644 --- a/Tutorial_13_ABtesting.ipynb +++ b/Tutorial_13_ABtesting.ipynb @@ -2,34 +2,30 @@ "cells": [ { "cell_type": "markdown", - "id": "eaaddf5e", + "id": "e296125b", "metadata": {}, "source": [ - "1. Привести в туториале пример группировки и квантизации\n", - "3. AB-test сделать \n", - " - разобраться в пайплайне и сделать его\n", - " - вставить info_col\n", - " - добавить метрики cuped и did\n", - "2. написать тесты\n", - "3. как работать с пропусками в АА (ttest)\n", - "- название переменных поправить\n", - "\n", - "**Done**\n", - "- протестить один и несколько таргетов\n", - "- добавить info_col (iterable или str) в AAtest" + "1. 
Туториал AB-теста\n", + " - execute c all\n", + " - execute с ate\n", + " \n", + "2. Сделать функцию, которая краиво в датафреймах выводит метрики\n", + "3. Тесты сделать\n", + "4. " ] }, { "cell_type": "markdown", - "id": "0aa723ec", + "id": "64e2de80", "metadata": {}, "source": [ - "# How to perform AA or AB tests" + "# How to perform AA test\n", + "*AB-test is shown below*" ] }, { "cell_type": "markdown", - "id": "0d3be579", + "id": "9f52ff79", "metadata": {}, "source": [ "## 0. Import Libraries" @@ -37,22 +33,47 @@ }, { "cell_type": "code", - "execution_count": 100, - "id": "9af0ac46", + "execution_count": 1, + "id": "6c2c62f0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n", + "'nlp' extra dependecy package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n", + "'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n", + "'nlp' extra dependecy package 'transformers' isn't installed. 
Look at README.md in repo 'LightAutoML' for installation instructions.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\20810012\\Desktop\\Задачи\\code\\matcher\\lightautoml\\ml_algo\\dl_model.py:41: UserWarning: 'transformers' - package isn't installed\n", + " warnings.warn(\"'transformers' - package isn't installed\")\n", + "C:\\Users\\20810012\\Desktop\\Задачи\\code\\matcher\\lightautoml\\text\\nn_model.py:22: UserWarning: 'transformers' - package isn't installed\n", + " warnings.warn(\"'transformers' - package isn't installed\")\n", + "C:\\Users\\20810012\\Desktop\\Задачи\\code\\matcher\\lightautoml\\text\\dl_transformers.py:25: UserWarning: 'transformers' - package isn't installed\n", + " warnings.warn(\"'transformers' - package isn't installed\")\n" + ] + } + ], "source": [ "import pandas as pd\n", "import numpy as np\n", - "from lightautoml.addons.hypex.ABTesting.ab_tester import AATest#, ABTest\n", + "from lightautoml.addons.hypex.ABTesting.ab_tester import AATest, ABTest\n", "from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data\n", "\n", + "pd.options.display.float_format = '{:,.2f}'.format\n", + "\n", "np.random.seed(52) #needed to create example data" ] }, { "cell_type": "markdown", - "id": "317f21f2", + "id": "2dca3eaa", "metadata": {}, "source": [ "## 1. Create or upload your dataset\n", @@ -62,15 +83,15 @@ }, { "cell_type": "code", - "execution_count": 101, - "id": "946d5c6a", + "execution_count": 2, + "id": "7b655d2d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "No NaN added\n" + "Length of na_step is less than length of columns. 
Used last value several times\n" ] }, { @@ -110,9 +131,9 @@ " 0\n", " 0\n", " 0\n", - " 488.0\n", - " 414.444444\n", - " 67\n", + " 488.00\n", + " 414.44\n", + " NaN\n", " M\n", " E-commerce\n", " \n", @@ -121,10 +142,10 @@ " 3\n", " 0\n", " 0\n", - " 501.5\n", - " 424.333333\n", - " 31\n", - " M\n", + " 501.50\n", + " 424.33\n", + " 31.00\n", + " NaN\n", " Logistics\n", " \n", " \n", @@ -132,9 +153,9 @@ " 10\n", " 0\n", " 0\n", - " 522.5\n", - " 416.222222\n", - " 64\n", + " 522.50\n", + " 416.22\n", + " 64.00\n", " M\n", " E-commerce\n", " \n", @@ -143,9 +164,9 @@ " 12\n", " 0\n", " 0\n", - " 472.0\n", - " 423.777778\n", - " 43\n", + " 472.00\n", + " 423.78\n", + " 43.00\n", " M\n", " E-commerce\n", " \n", @@ -154,9 +175,9 @@ " 13\n", " 0\n", " 0\n", - " 508.5\n", - " 424.222222\n", - " 36\n", + " 508.50\n", + " 424.22\n", + " 36.00\n", " F\n", " E-commerce\n", " \n", @@ -176,9 +197,9 @@ " 9991\n", " 0\n", " 0\n", - " 482.5\n", - " 421.888889\n", - " 23\n", + " 482.50\n", + " 421.89\n", + " 23.00\n", " F\n", " E-commerce\n", " \n", @@ -187,9 +208,9 @@ " 9992\n", " 0\n", " 0\n", - " 491.5\n", - " 424.000000\n", - " 44\n", + " 491.50\n", + " 424.00\n", + " 44.00\n", " M\n", " E-commerce\n", " \n", @@ -198,9 +219,9 @@ " 9994\n", " 0\n", " 0\n", - " 486.0\n", - " 423.777778\n", - " 27\n", + " 486.00\n", + " 423.78\n", + " 27.00\n", " F\n", " Logistics\n", " \n", @@ -209,9 +230,9 @@ " 9996\n", " 0\n", " 0\n", - " 500.5\n", - " 430.888889\n", - " 56\n", + " 500.50\n", + " 430.89\n", + " 56.00\n", " F\n", " E-commerce\n", " \n", @@ -220,9 +241,9 @@ " 9997\n", " 3\n", " 1\n", - " 473.0\n", - " 534.111111\n", - " 56\n", + " 473.00\n", + " 534.11\n", + " 56.00\n", " M\n", " Logistics\n", " \n", @@ -232,18 +253,18 @@ "" ], "text/plain": [ - " user_id signup_month treat pre_spends post_spends age gender \\\n", - "0 0 0 0 488.0 414.444444 67 M \n", - "1 3 0 0 501.5 424.333333 31 M \n", - "2 10 0 0 522.5 416.222222 64 M \n", - "3 12 0 0 472.0 423.777778 43 M \n", - "4 13 0 0 
508.5 424.222222 36 F \n", - "... ... ... ... ... ... ... ... \n", - "5365 9991 0 0 482.5 421.888889 23 F \n", - "5366 9992 0 0 491.5 424.000000 44 M \n", - "5367 9994 0 0 486.0 423.777778 27 F \n", - "5368 9996 0 0 500.5 430.888889 56 F \n", - "5369 9997 3 1 473.0 534.111111 56 M \n", + " user_id signup_month treat pre_spends post_spends age gender \\\n", + "0 0 0 0 488.00 414.44 NaN M \n", + "1 3 0 0 501.50 424.33 31.00 NaN \n", + "2 10 0 0 522.50 416.22 64.00 M \n", + "3 12 0 0 472.00 423.78 43.00 M \n", + "4 13 0 0 508.50 424.22 36.00 F \n", + "... ... ... ... ... ... ... ... \n", + "5365 9991 0 0 482.50 421.89 23.00 F \n", + "5366 9992 0 0 491.50 424.00 44.00 M \n", + "5367 9994 0 0 486.00 423.78 27.00 F \n", + "5368 9996 0 0 500.50 430.89 56.00 F \n", + "5369 9997 3 1 473.00 534.11 56.00 M \n", "\n", " industry \n", "0 E-commerce \n", @@ -261,58 +282,58 @@ "[5370 rows x 8 columns]" ] }, - "execution_count": 101, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data = create_test_data(rs=52)\n", + "data = create_test_data(rs=52, na_step=10, nan_cols=['age', 'gender'])\n", "data" ] }, { "cell_type": "markdown", - "id": "4d1d05d4", + "id": "a0402e83", "metadata": {}, "source": [ - "## 2. AATest" + "## 2. AATest " ] }, { "cell_type": "markdown", - "id": "81aec791", + "id": "b3733f84", "metadata": {}, "source": [ "### 2.0 Initialize parameters\n", - "info_col used to define informative attributes that should NOT be part of testing, such as user_id and signup_month
" + "`info_col` used to define informative attributes that should NOT be part of testing, such as user_id and signup_month
" ] }, { "cell_type": "code", - "execution_count": 96, - "id": "b8156de6", + "execution_count": 3, + "id": "bc8e4ac0", "metadata": {}, "outputs": [], "source": [ "info_cols = ['user_id', 'signup_month']\n", - "target = 'post_spends'" + "target = ['post_spends', 'pre_spends']" ] }, { "cell_type": "markdown", - "id": "691d7549", + "id": "75c196ea", "metadata": {}, "source": [ "### 2.1 Simple AA-test\n", - "This is the easiest way to initialize and calculate metrics on a AA-test on 10 iterations
\n", + "This is the easiest way to initialize and calculate metrics on a AA-test (default - on 10 iterations)
\n", "Use it when you are clear about each attribute or if you don't have any additional task conditions (like grouping)" ] }, { "cell_type": "code", - "execution_count": 97, - "id": "d0677a8f", + "execution_count": 4, + "id": "701d20c0", "metadata": {}, "outputs": [], "source": [ @@ -321,14 +342,14 @@ }, { "cell_type": "code", - "execution_count": 98, - "id": "dc160d89", + "execution_count": 5, + "id": "560634e5", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0ea83b02b7174bbfb6eb8e54fd224b0d", + "model_id": "b3dbdfb331e644d783741a09861635f2", "version_major": 2, "version_minor": 0 }, @@ -338,7 +359,29 @@ }, "metadata": {}, "output_type": "display_data" - }, + } + ], + "source": [ + "experiment_result, dict_of_datas = experiment.search_dist_uniform_sampling(iterations=10)" + ] + }, + { + "cell_type": "markdown", + "id": "4c563d44", + "metadata": {}, + "source": [ + "`experiment_result` is a table of results of experiments, which includes \n", + "- means of all targets in a and b samples, \n", + "- p_values of Student t-test and test Kolmogorova-Smirnova, \n", + "- and results of tests (did data on the random_state passes the uniform test)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2adb8cdd", + "metadata": {}, + "outputs": [ { "data": { "text/html": [ @@ -368,6 +411,13 @@ " post_spends ks_test p_value\n", " post_spends t_test passed\n", " post_spends ks_test passed\n", + " pre_spends a mean\n", + " pre_spends b mean\n", + " pre_spends ab delta %\n", + " pre_spends t_test p_value\n", + " pre_spends ks_test p_value\n", + " pre_spends t_test passed\n", + " pre_spends ks_test passed\n", " mean_tests_score\n", " \n", " \n", @@ -375,38 +425,59 @@ " \n", " 0\n", " 0\n", - " 427.848003\n", - " 427.848003\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", + " 427.85\n", + " 427.85\n", + " 0.00\n", + " 1.00\n", + " 1.00\n", + " True\n", + " True\n", + " 484.63\n", + " 484.63\n", + " 0.00\n", + " 1.00\n", 
+ " 1.00\n", " True\n", " True\n", - " 1.0\n", + " 1.00\n", " \n", " \n", " 1\n", " 1\n", - " 427.672046\n", - " 427.672046\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", + " 427.67\n", + " 427.67\n", + " 0.00\n", + " 1.00\n", + " 1.00\n", " True\n", " True\n", - " 1.0\n", + " 484.81\n", + " 484.81\n", + " 0.00\n", + " 1.00\n", + " 1.00\n", + " True\n", + " True\n", + " 1.00\n", " \n", " \n", " 2\n", " 2\n", - " 428.380095\n", - " 428.380095\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", + " 428.38\n", + " 428.38\n", + " 0.00\n", + " 1.00\n", + " 1.00\n", + " True\n", + " True\n", + " 484.76\n", + " 484.76\n", + " 0.00\n", + " 1.00\n", + " 1.00\n", " True\n", " True\n", - " 1.0\n", + " 1.00\n", " \n", " \n", "\n", @@ -414,40 +485,63 @@ ], "text/plain": [ " random_state post_spends a mean post_spends b mean \\\n", - "0 0 427.848003 427.848003 \n", - "1 1 427.672046 427.672046 \n", - "2 2 428.380095 428.380095 \n", + "0 0 427.85 427.85 \n", + "1 1 427.67 427.67 \n", + "2 2 428.38 428.38 \n", "\n", " post_spends ab delta % post_spends t_test p_value \\\n", - "0 0.0 1.0 \n", - "1 0.0 1.0 \n", - "2 0.0 1.0 \n", + "0 0.00 1.00 \n", + "1 0.00 1.00 \n", + "2 0.00 1.00 \n", "\n", " post_spends ks_test p_value post_spends t_test passed \\\n", - "0 1.0 True \n", - "1 1.0 True \n", - "2 1.0 True \n", + "0 1.00 True \n", + "1 1.00 True \n", + "2 1.00 True \n", "\n", - " post_spends ks_test passed mean_tests_score \n", - "0 True 1.0 \n", - "1 True 1.0 \n", - "2 True 1.0 " + " post_spends ks_test passed pre_spends a mean pre_spends b mean \\\n", + "0 True 484.63 484.63 \n", + "1 True 484.81 484.81 \n", + "2 True 484.76 484.76 \n", + "\n", + " pre_spends ab delta % pre_spends t_test p_value \\\n", + "0 0.00 1.00 \n", + "1 0.00 1.00 \n", + "2 0.00 1.00 \n", + "\n", + " pre_spends ks_test p_value pre_spends t_test passed \\\n", + "0 1.00 True \n", + "1 1.00 True \n", + "2 1.00 True \n", + "\n", + " pre_spends ks_test passed mean_tests_score \n", + "0 True 1.00 \n", + "1 True 1.00 \n", + "2 True 
1.00 " ] }, - "execution_count": 98, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "experiment_result, dict_of_datas = experiment.search_dist_uniform_sampling()\n", "experiment_result.head(3)" ] }, + { + "cell_type": "markdown", + "id": "e40adf5e", + "metadata": {}, + "source": [ + "`dict_of_datas` is a dictionary with random_states as keys and dataframes as values.
\n", + "Result of separation can be find in column 'group', it contains values 'test' and 'control'" + ] + }, { "cell_type": "code", - "execution_count": 99, - "id": "d57d3639", + "execution_count": 7, + "id": "cac4e650", "metadata": { "scrolled": false }, @@ -490,10 +584,10 @@ " 3\n", " 0\n", " 0\n", - " 501.5\n", - " 424.333333\n", - " 31\n", - " M\n", + " 501.50\n", + " 424.33\n", + " 31.00\n", + " NaN\n", " Logistics\n", " test\n", " \n", @@ -502,9 +596,9 @@ " 10\n", " 0\n", " 0\n", - " 522.5\n", - " 416.222222\n", - " 64\n", + " 522.50\n", + " 416.22\n", + " 64.00\n", " M\n", " E-commerce\n", " test\n", @@ -514,197 +608,88 @@ " 12\n", " 0\n", " 0\n", - " 472.0\n", - " 423.777778\n", - " 43\n", + " 472.00\n", + " 423.78\n", + " 43.00\n", " M\n", " E-commerce\n", " test\n", " \n", - " \n", - " 3\n", - " 13\n", - " 0\n", - " 0\n", - " 508.5\n", - " 424.222222\n", - " 36\n", - " F\n", - " E-commerce\n", - " test\n", - " \n", - " \n", - " 4\n", - " 14\n", - " 0\n", - " 0\n", - " 497.0\n", - " 421.777778\n", - " 26\n", - " M\n", - " Logistics\n", - " test\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 5365\n", - " 9987\n", - " 0\n", - " 0\n", - " 467.0\n", - " 431.555556\n", - " 48\n", - " M\n", - " Logistics\n", - " control\n", - " \n", - " \n", - " 5366\n", - " 9988\n", - " 0\n", - " 0\n", - " 501.5\n", - " 423.222222\n", - " 55\n", - " F\n", - " Logistics\n", - " control\n", - " \n", - " \n", - " 5367\n", - " 9991\n", - " 0\n", - " 0\n", - " 482.5\n", - " 421.888889\n", - " 23\n", - " F\n", - " E-commerce\n", - " control\n", - " \n", - " \n", - " 5368\n", - " 9992\n", - " 0\n", - " 0\n", - " 491.5\n", - " 424.000000\n", - " 44\n", - " M\n", - " E-commerce\n", - " control\n", - " \n", - " \n", - " 5369\n", - " 9996\n", - " 0\n", - " 0\n", - " 500.5\n", - " 430.888889\n", - " 56\n", - " F\n", - " E-commerce\n", - " control\n", - " \n", " \n", "\n", - 
"

5370 rows × 9 columns

\n", "" ], "text/plain": [ - " user_id signup_month treat pre_spends post_spends age gender \\\n", - "0 3 0 0 501.5 424.333333 31 M \n", - "1 10 0 0 522.5 416.222222 64 M \n", - "2 12 0 0 472.0 423.777778 43 M \n", - "3 13 0 0 508.5 424.222222 36 F \n", - "4 14 0 0 497.0 421.777778 26 M \n", - "... ... ... ... ... ... ... ... \n", - "5365 9987 0 0 467.0 431.555556 48 M \n", - "5366 9988 0 0 501.5 423.222222 55 F \n", - "5367 9991 0 0 482.5 421.888889 23 F \n", - "5368 9992 0 0 491.5 424.000000 44 M \n", - "5369 9996 0 0 500.5 430.888889 56 F \n", - "\n", - " industry group \n", - "0 Logistics test \n", - "1 E-commerce test \n", - "2 E-commerce test \n", - "3 E-commerce test \n", - "4 Logistics test \n", - "... ... ... \n", - "5365 Logistics control \n", - "5366 Logistics control \n", - "5367 E-commerce control \n", - "5368 E-commerce control \n", - "5369 E-commerce control \n", + " user_id signup_month treat pre_spends post_spends age gender \\\n", + "0 3 0 0 501.50 424.33 31.00 NaN \n", + "1 10 0 0 522.50 416.22 64.00 M \n", + "2 12 0 0 472.00 423.78 43.00 M \n", "\n", - "[5370 rows x 9 columns]" + " industry group \n", + "0 Logistics test \n", + "1 E-commerce test \n", + "2 E-commerce test " ] }, - "execution_count": 99, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "dict_of_datas[0]" + "dict_of_datas[0].head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "c277b0b9", + "metadata": {}, + "source": [ + "#### - Single experiment\n", + "To get stable results lets fix `random_state`" ] }, { "cell_type": "code", - "execution_count": 64, - "id": "46aa41af", + "execution_count": 8, + "id": "01265e9e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "a\n" - ] - } - ], + "outputs": [], "source": [ - "if all(data.columns) == all(dict_of_datas[0].drop(columns=['group']).columns):\n", - " print('a')\n", - "else:\n", - " print('b')" + "random_state = 11" ] }, { "cell_type": 
"markdown", - "id": "fb387062", + "id": "62adf6e2", "metadata": {}, "source": [ - "#### 2.1.0 Single experiment\n", - "To perform single experiment you can use sampling_metrics()" + "To perform single experiment you can use `sampling_metrics()`" ] }, { "cell_type": "code", - "execution_count": 50, - "id": "ac3dc471", + "execution_count": 9, + "id": "ce5cf887", "metadata": {}, "outputs": [], "source": [ - "random_state = 11" + "experiment = AATest(data=data, info_cols=info_cols, target_fields=target)\n", + "metrics, dict_of_datas = experiment.sampling_metrics(random_state=random_state).values()" + ] + }, + { + "cell_type": "markdown", + "id": "128c5b68", + "metadata": {}, + "source": [ + "The results contains the same info as in multisampling, but on one experiment" ] }, { "cell_type": "code", - "execution_count": 51, - "id": "90c346c7", + "execution_count": 10, + "id": "bad5e42e", "metadata": { "scrolled": true }, @@ -720,24 +705,29 @@ " 'post_spends ks_test p_value': 1.0,\n", " 'post_spends t_test passed': True,\n", " 'post_spends ks_test passed': True,\n", + " 'pre_spends a mean': 484.9912476722533,\n", + " 'pre_spends b mean': 484.9912476722533,\n", + " 'pre_spends ab delta %': 0.0,\n", + " 'pre_spends t_test p_value': 1.0,\n", + " 'pre_spends ks_test p_value': 1.0,\n", + " 'pre_spends t_test passed': True,\n", + " 'pre_spends ks_test passed': True,\n", " 'mean_tests_score': 1.0}" ] }, - "execution_count": 51, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "res = experiment.sampling_metrics(random_state=random_state)\n", - "metrics, dict_of_datas = res['metrics'], res['data_from_experiment']\n", "metrics" ] }, { "cell_type": "code", - "execution_count": 52, - "id": "08ba416c", + "execution_count": 11, + "id": "a9c3c513", "metadata": {}, "outputs": [ { @@ -778,10 +768,10 @@ " 3\n", " 0\n", " 0\n", - " 501.5\n", - " 424.333333\n", - " 31\n", - " M\n", + " 501.50\n", + " 424.33\n", + " 31.00\n", + " NaN\n", " 
Logistics\n", " test\n", " \n", @@ -790,9 +780,9 @@ " 14\n", " 0\n", " 0\n", - " 497.0\n", - " 421.777778\n", - " 26\n", + " 497.00\n", + " 421.78\n", + " 26.00\n", " M\n", " Logistics\n", " test\n", @@ -802,9 +792,9 @@ " 21\n", " 0\n", " 0\n", - " 489.0\n", - " 433.111111\n", - " 30\n", + " 489.00\n", + " 433.11\n", + " 30.00\n", " M\n", " E-commerce\n", " test\n", @@ -814,10 +804,10 @@ " 28\n", " 3\n", " 1\n", - " 479.5\n", - " 527.888889\n", - " 20\n", - " F\n", + " 479.50\n", + " 527.89\n", + " 20.00\n", + " NaN\n", " E-commerce\n", " test\n", " \n", @@ -826,9 +816,9 @@ " 29\n", " 0\n", " 0\n", - " 505.0\n", - " 414.333333\n", - " 30\n", + " 505.00\n", + " 414.33\n", + " 30.00\n", " M\n", " E-commerce\n", " test\n", @@ -850,9 +840,9 @@ " 9988\n", " 0\n", " 0\n", - " 501.5\n", - " 423.222222\n", - " 55\n", + " 501.50\n", + " 423.22\n", + " 55.00\n", " F\n", " Logistics\n", " control\n", @@ -862,9 +852,9 @@ " 9990\n", " 0\n", " 0\n", - " 490.0\n", - " 426.000000\n", - " 18\n", + " 490.00\n", + " 426.00\n", + " 18.00\n", " M\n", " E-commerce\n", " control\n", @@ -874,9 +864,9 @@ " 9991\n", " 0\n", " 0\n", - " 482.5\n", - " 421.888889\n", - " 23\n", + " 482.50\n", + " 421.89\n", + " 23.00\n", " F\n", " E-commerce\n", " control\n", @@ -886,9 +876,9 @@ " 9992\n", " 0\n", " 0\n", - " 491.5\n", - " 424.000000\n", - " 44\n", + " 491.50\n", + " 424.00\n", + " 44.00\n", " M\n", " E-commerce\n", " control\n", @@ -898,9 +888,9 @@ " 9996\n", " 0\n", " 0\n", - " 500.5\n", - " 430.888889\n", - " 56\n", + " 500.50\n", + " 430.89\n", + " 56.00\n", " F\n", " E-commerce\n", " control\n", @@ -911,18 +901,18 @@ "" ], "text/plain": [ - " user_id signup_month treat pre_spends post_spends age gender \\\n", - "0 3 0 0 501.5 424.333333 31 M \n", - "1 14 0 0 497.0 421.777778 26 M \n", - "2 21 0 0 489.0 433.111111 30 M \n", - "3 28 3 1 479.5 527.888889 20 F \n", - "4 29 0 0 505.0 414.333333 30 M \n", - "... ... ... ... ... ... ... ... 
\n", - "5365 9988 0 0 501.5 423.222222 55 F \n", - "5366 9990 0 0 490.0 426.000000 18 M \n", - "5367 9991 0 0 482.5 421.888889 23 F \n", - "5368 9992 0 0 491.5 424.000000 44 M \n", - "5369 9996 0 0 500.5 430.888889 56 F \n", + " user_id signup_month treat pre_spends post_spends age gender \\\n", + "0 3 0 0 501.50 424.33 31.00 NaN \n", + "1 14 0 0 497.00 421.78 26.00 M \n", + "2 21 0 0 489.00 433.11 30.00 M \n", + "3 28 3 1 479.50 527.89 20.00 NaN \n", + "4 29 0 0 505.00 414.33 30.00 M \n", + "... ... ... ... ... ... ... ... \n", + "5365 9988 0 0 501.50 423.22 55.00 F \n", + "5366 9990 0 0 490.00 426.00 18.00 M \n", + "5367 9991 0 0 482.50 421.89 23.00 F \n", + "5368 9992 0 0 491.50 424.00 44.00 M \n", + "5369 9996 0 0 500.50 430.89 56.00 F \n", "\n", " industry group \n", "0 Logistics test \n", @@ -940,7 +930,7 @@ "[5370 rows x 9 columns]" ] }, - "execution_count": 52, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -951,29 +941,37 @@ }, { "cell_type": "markdown", - "id": "4912e5df", + "id": "5017639b", "metadata": {}, "source": [ "### 2.2 AA-test with grouping" ] }, + { + "cell_type": "markdown", + "id": "8cb96834", + "metadata": {}, + "source": [ + "To perform experiment that separates samples by groups `group_col` can be used" + ] + }, { "cell_type": "code", - "execution_count": 32, - "id": "b274e7bb", + "execution_count": 12, + "id": "2fba205a", "metadata": {}, "outputs": [], "source": [ "info_cols = ['user_id', 'signup_month']\n", - "target = 'post_spends'\n", + "target = ['post_spends', 'pre_spends']\n", "\n", "group_cols = 'industry'" ] }, { "cell_type": "code", - "execution_count": 33, - "id": "d14469a5", + "execution_count": 13, + "id": "6c42a3c3", "metadata": {}, "outputs": [], "source": [ @@ -982,16 +980,14 @@ }, { "cell_type": "code", - "execution_count": 34, - "id": "0e5ac552", - "metadata": { - "scrolled": false - }, + "execution_count": 14, + "id": "6155253f", + "metadata": {}, "outputs": [ { "data": { 
"application/vnd.jupyter.widget-view+json": { - "model_id": "5c7d1ef6dfaa46b782ab990a763fca1f", + "model_id": "bf931be558b64216b9ca76eb3eb28d24", "version_major": 2, "version_minor": 0 }, @@ -1001,7 +997,28 @@ }, "metadata": {}, "output_type": "display_data" - }, + } + ], + "source": [ + "experiment_result, dict_of_datas = experiment.search_dist_uniform_sampling()" + ] + }, + { + "cell_type": "markdown", + "id": "a8ae6454", + "metadata": {}, + "source": [ + "The result is in the same format as without groups\n", + "\n", + "In this regime groups equally divided on each sample (test and control):" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "de7bdb7b", + "metadata": {}, + "outputs": [ { "data": { "text/html": [ @@ -1023,144 +1040,79 @@ " \n", " \n", " \n", - " random_state\n", - " post_spends a mean\n", - " post_spends b mean\n", - " post_spends ab delta %\n", - " post_spends t_test p_value\n", - " post_spends ks_test p_value\n", - " post_spends t_test passed\n", - " post_spends ks_test passed\n", - " mean_tests_score\n", + " \n", + " user_id\n", + " \n", + " \n", + " industry\n", + " group\n", + " \n", " \n", " \n", " \n", " \n", - " 0\n", - " 0\n", - " 428.882431\n", - " 428.882431\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", - " True\n", - " True\n", - " 1.0\n", + " E-commerce\n", + " control\n", + " 1351\n", " \n", " \n", - " 1\n", - " 1\n", - " 428.602956\n", - " 428.602956\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", - " True\n", - " True\n", - " 1.0\n", + " test\n", + " 1351\n", " \n", " \n", - " 2\n", - " 2\n", - " 428.846995\n", - " 428.846995\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", - " True\n", - " True\n", - " 1.0\n", + " Logistics\n", + " control\n", + " 1333\n", + " \n", + " \n", + " test\n", + " 1333\n", " \n", " \n", "\n", "" ], "text/plain": [ - " random_state post_spends a mean post_spends b mean \\\n", - "0 0 428.882431 428.882431 \n", - "1 1 428.602956 428.602956 \n", - "2 2 428.846995 428.846995 \n", - "\n", - " post_spends ab delta 
% post_spends t_test p_value \\\n", - "0 0.0 1.0 \n", - "1 0.0 1.0 \n", - "2 0.0 1.0 \n", - "\n", - " post_spends ks_test p_value post_spends t_test passed \\\n", - "0 1.0 True \n", - "1 1.0 True \n", - "2 1.0 True \n", - "\n", - " post_spends ks_test passed mean_tests_score \n", - "0 True 1.0 \n", - "1 True 1.0 \n", - "2 True 1.0 " + " user_id\n", + "industry group \n", + "E-commerce control 1351\n", + " test 1351\n", + "Logistics control 1333\n", + " test 1333" ] }, - "execution_count": 34, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "experiment_result, dict_of_datas = experiment.search_dist_uniform_sampling()\n", - "experiment_result.head(3)" + "dict_of_datas[0].groupby(['industry', 'group'])[['user_id']].count()" ] }, { "cell_type": "markdown", - "id": "efdc426d", + "id": "d87c9442", "metadata": {}, "source": [ - "### 2.3 AA-testing with quantifying" + "## 3. AB-test" ] }, { - "cell_type": "code", - "execution_count": 35, - "id": "56ed0a88", + "cell_type": "markdown", + "id": "eecf2909", "metadata": {}, - "outputs": [], "source": [ - "info_cols = ['user_id', 'signup_month']\n", - "target = 'post_spends'\n", - "\n", - "group_cols = 'industry'\n", - "quant_field = 'gender'" + "### 3.0 Data\n", + "Lets correct data to see how AB-test works" ] }, { "cell_type": "code", - "execution_count": 36, - "id": "7872cf08", + "execution_count": 16, + "id": "50e64f07", "metadata": {}, - "outputs": [], - "source": [ - "experiment = AATest(data=data, info_cols=info_cols, target_fields=target, group_cols=group_cols, quant_field=quant_field)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "32827bbe", - "metadata": { - "scrolled": true - }, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "75c1de04cc5b4e8288545baa02ea3ccc", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/10 [00:00\n", " \n", " \n", - " random_state\n", - " post_spends a 
mean\n", - " post_spends b mean\n", - " post_spends ab delta %\n", - " post_spends t_test p_value\n", - " post_spends ks_test p_value\n", - " post_spends t_test passed\n", - " post_spends ks_test passed\n", - " mean_tests_score\n", + " user_id\n", + " signup_month\n", + " treat\n", + " pre_spends\n", + " post_spends\n", + " age\n", + " gender\n", + " industry\n", + " group\n", " \n", " \n", " \n", " \n", " 0\n", " 0\n", - " 428.347612\n", - " 428.347612\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", - " True\n", - " True\n", - " 1.0\n", + " 0\n", + " 0\n", + " 488.00\n", + " 414.44\n", + " NaN\n", + " M\n", + " E-commerce\n", + " test\n", " \n", " \n", " 1\n", - " 1\n", - " 427.967721\n", - " 427.967721\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", - " True\n", - " True\n", - " 1.0\n", + " 3\n", + " 0\n", + " 0\n", + " 501.50\n", + " 424.33\n", + " 31.00\n", + " NaN\n", + " Logistics\n", + " test\n", " \n", " \n", " 2\n", - " 2\n", - " 428.347612\n", - " 428.347612\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", - " True\n", - " True\n", - " 1.0\n", + " 10\n", + " 0\n", + " 0\n", + " 522.50\n", + " 416.22\n", + " 64.00\n", + " M\n", + " E-commerce\n", + " test\n", " \n", " \n", "\n", "" ], "text/plain": [ - " random_state post_spends a mean post_spends b mean \\\n", - "0 0 428.347612 428.347612 \n", - "1 1 427.967721 427.967721 \n", - "2 2 428.347612 428.347612 \n", - "\n", - " post_spends ab delta % post_spends t_test p_value \\\n", - "0 0.0 1.0 \n", - "1 0.0 1.0 \n", - "2 0.0 1.0 \n", - "\n", - " post_spends ks_test p_value post_spends t_test passed \\\n", - "0 1.0 True \n", - "1 1.0 True \n", - "2 1.0 True \n", + " user_id signup_month treat pre_spends post_spends age gender \\\n", + "0 0 0 0 488.00 414.44 NaN M \n", + "1 3 0 0 501.50 424.33 31.00 NaN \n", + "2 10 0 0 522.50 416.22 64.00 M \n", "\n", - " post_spends ks_test passed mean_tests_score \n", - "0 True 1.0 \n", - "1 True 1.0 \n", - "2 True 1.0 " + " industry group \n", + "0 E-commerce test \n", + "1 Logistics test \n", + 
"2 E-commerce test " ] }, - "execution_count": 37, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "experiment_result, dict_of_datas = experiment.search_dist_uniform_sampling()\n", - "experiment_result.head(3)" + "data_ab = data.copy()\n", + "\n", + "half_data = int(data.shape[0]/2)\n", + "data_ab['group'] = ['test']*half_data + ['control']*half_data\n", + "data_ab.head(3)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "e1b57c6d", + "cell_type": "markdown", + "id": "db1bcefa", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "### 3.1 Full AB-test\n", + "\n", + "Full (basic) version of test includes calculation of all available metrics, which are: \"diff in means\", \"diff in diff\" and \"cuped\"
\n", + "Pay attention, that for \"cuped\" and \"diff in diff\" metrics requred target before pilot." + ] }, { "cell_type": "code", - "execution_count": null, - "id": "1269e282", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "3c8f6bf5", + "execution_count": 17, + "id": "4108a137", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'size': {'test': 2685, 'control': 2685},\n", + " 'difference': {'ate': 0.9805090006207325,\n", + " 'cuped': 0.9764245308837189,\n", + " 'diff_in_diff': 0.39224084419458904},\n", + " 'p_value': {'t_test': 0.20533212744131019,\n", + " 'mann_whitney': 0.08089945933651932}}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Metrics" + "model = ABTest()\n", + "results = model.execute(\n", + " data=data_ab, \n", + " target_field='post_spends', \n", + " target_field_before='pre_spends', \n", + " group_field='group'\n", + ")\n", + "results" ] }, { "cell_type": "markdown", - "id": "8226aef5", + "id": "9ae681ac", "metadata": {}, "source": [ - "## Data" + "To see results in more convenient way `show_beautiful_result` can be used" ] }, { "cell_type": "code", - "execution_count": 244, - "id": "566fb273", + "execution_count": 18, + "id": "bcca83f9", "metadata": {}, "outputs": [ { @@ -1325,533 +1290,293 @@ " \n", " \n", " \n", - " pre_spends\n", - " post_spends\n", - " age\n", - " gender_F\n", - " gender_M\n", - " industry_E-commerce\n", - " industry_Logistics\n", - " group\n", + " size\n", " \n", " \n", " \n", " \n", - " 0\n", - " 501.5\n", - " 424.333333\n", - " 31\n", - " 0\n", - " 1\n", - " 0\n", - " 1\n", - " test\n", - " \n", - " \n", - " 1\n", - " 522.5\n", - " 416.222222\n", - " 64\n", - " 0\n", - " 1\n", - " 1\n", - " 0\n", - " test\n", - " \n", - " \n", - " 2\n", - " 472.0\n", - " 423.777778\n", - " 43\n", - " 0\n", - " 1\n", - " 1\n", - " 0\n", - " test\n", - " \n", - " \n", - " 3\n", - " 508.5\n", - " 424.222222\n", 
- " 36\n", - " 1\n", - " 0\n", - " 1\n", - " 0\n", - " test\n", - " \n", - " \n", - " 4\n", - " 497.0\n", - " 421.777778\n", - " 26\n", - " 0\n", - " 1\n", - " 0\n", - " 1\n", - " test\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", + " test\n", + " 2685\n", " \n", " \n", - " 5365\n", - " 467.0\n", - " 431.555556\n", - " 48\n", - " 0\n", - " 1\n", - " 0\n", - " 1\n", - " control\n", + " control\n", + " 2685\n", " \n", - " \n", - " 5366\n", - " 501.5\n", - " 423.222222\n", - " 55\n", - " 1\n", - " 0\n", - " 0\n", - " 1\n", - " control\n", + " \n", + "\n", + "" + ], + "text/plain": [ + " size\n", + "test 2685\n", + "control 2685" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
difference
5367482.5421.888889231010controlate0.98
5368491.5424.000000440110controlcuped0.98
5369500.5430.888889561010controldiff_in_diff0.39
\n", - "

5370 rows × 8 columns

\n", "
" ], "text/plain": [ - " pre_spends post_spends age gender_F gender_M industry_E-commerce \\\n", - "0 501.5 424.333333 31 0 1 0 \n", - "1 522.5 416.222222 64 0 1 1 \n", - "2 472.0 423.777778 43 0 1 1 \n", - "3 508.5 424.222222 36 1 0 1 \n", - "4 497.0 421.777778 26 0 1 0 \n", - "... ... ... ... ... ... ... \n", - "5365 467.0 431.555556 48 0 1 0 \n", - "5366 501.5 423.222222 55 1 0 0 \n", - "5367 482.5 421.888889 23 1 0 1 \n", - "5368 491.5 424.000000 44 0 1 1 \n", - "5369 500.5 430.888889 56 1 0 1 \n", - "\n", - " industry_Logistics group \n", - "0 1 test \n", - "1 0 test \n", - "2 0 test \n", - "3 0 test \n", - "4 1 test \n", - "... ... ... \n", - "5365 1 control \n", - "5366 1 control \n", - "5367 0 control \n", - "5368 0 control \n", - "5369 0 control \n", - "\n", - "[5370 rows x 8 columns]" + " difference\n", + "ate 0.98\n", + "cuped 0.98\n", + "diff_in_diff 0.39" ] }, - "execution_count": 244, "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "prep = AATest(data=dict_of_datas[0], target_fields='pre_spends', info_cols=info_cols, group_cols='group')\n", - "data = prep.data\n", - "data" - ] - }, - { - "cell_type": "markdown", - "id": "b70c9f3d", - "metadata": {}, - "source": [ - "## Diff in means" - ] - }, - { - "cell_type": "markdown", - "id": "8cab0659", - "metadata": {}, - "source": [ - "$$\\widehat {ATE}^{simple} = \\bar Y_{t=1, d=1} - \\bar Y_{t=1, d=0}$$" - ] - }, - { - "cell_type": "code", - "execution_count": 300, - "id": "90506e7e", - "metadata": {}, - "outputs": [], - "source": [ - "def diff_in_means(data:pd.DataFrame, target_col: str, groups_col: str): \n", - " \"\"\"Counts difference in means of test and conrol group in target.\n", - " \n", - " Args:\n", - " data: input dataframe\n", - " target_col: name of target column\n", - " groups_col: name of column with separation on test and control group\n", - " should contain values ('test', 'control')\n", - " \n", - " Returns:\n", - " difm: differemnce in means between test and 
control groups\n", - " \n", - " \"\"\"\n", - " \n", - " mean_test = np.mean(data[data[groups_col]=='test'][target_col]) \n", - " mean_control = np.mean(data[data[groups_col]=='control'][target_col])\n", - " difm = mean_test - mean_control\n", - " \n", - " return difm" - ] - }, - { - "cell_type": "code", - "execution_count": 301, - "id": "47c09f67", - "metadata": {}, - "outputs": [ + "output_type": "display_data" + }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
p_value
t_test0.21
mann_whitney0.08
\n", + "
" + ], "text/plain": [ - "0.0" + " p_value\n", + "t_test 0.21\n", + "mann_whitney 0.08" ] }, - "execution_count": 301, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "diff_in_means(data, 'post_spends', 'group')" - ] - }, - { - "cell_type": "markdown", - "id": "9afc5278", - "metadata": {}, - "source": [ - "## Cuped" - ] - }, - { - "cell_type": "markdown", - "id": "329ea2ae", - "metadata": {}, - "source": [ - "$$\\hat Y^{cuped}_{1} = \\bar Y_1 - \\theta \\bar X + \\theta \\mathbb E [X]$$\n", - "\n", - "$$\n", - "\\begin{aligned}\n", - "\\text{Var} \\left( \\hat Y^{cuped}_{1} \\right) &= \\text{Var} \\left( \\bar Y_1 - \\theta \\bar X + \\theta \\mathbb E [X] \\right) = \\newline\n", - "&= \\text{Var} \\left( Y_1 - \\theta X \\right) / n = \\newline\n", - "&= \\Big( \\text{Var} (Y_1) + \\theta^2 \\text{Var} (X) - 2 \\theta \\text{Cov} (X,Y) \\Big) / n\n", - "\\end{aligned}$$\n", - "\n", - "$$\n", - "\\theta^* = \\frac{\\text{Cov} (X,Y)}{\\text{Var} (X)}\n", - "$$\n", - "\n", - "$$\n", - "\\text{Var} \\left( \\hat Y^{cuped}_{1} \\right) = \\text{Var} (\\bar Y) (1 - \\rho^2), \\text{where}\\;\\rho=corr(X,Y)\n", - "$$\n", - "\n", - "$$\n", - "\\begin{aligned} \n", - "\\widehat {ATE}^{cuped} &= \\hat Y^{cuped}_{1} (D=1) - \\hat Y^{cuped}_{1}(D=0) = \\newline &= \\big( \\bar Y_1 - \\theta \\bar X + \\theta \\mathbb E [X] \\ \\big| \\ D = 1 \\big) - \\big( \\bar Y_1 - \\theta \\bar X + \\theta \\mathbb E [X] \\ \\big| \\ D = 0 \\big) = \\newline &= \\big( \\bar Y_1 - \\theta \\bar X \\ \\big| \\ D = 1 \\big) - \\big( \\bar Y_1 - \\theta \\bar X \\ \\big| \\ D = 0 \\big) \\end{aligned}\n", - "$$\n", - "\n", - "$$\n", - "\\hat Y_{cuped,1} = \\bar Y_1 - \\theta \\bar X\n", - "$$" - ] - }, - { - "cell_type": "code", - "execution_count": 302, - "id": "bb64a84e", - "metadata": {}, - "outputs": [], - "source": [ - "from collections import namedtuple\n", - "ExperimentComparisonResults = namedtuple('ExperimentComparisonResults', 
\n", - " ['pvalue', 'effect', 'ci_length', 'left_bound', 'right_bound'])" + "model.show_beautiful_result()" ] }, { "cell_type": "markdown", - "id": "4afb0b8c", - "metadata": {}, - "source": [ - "#### Absolute" - ] - }, - { - "cell_type": "code", - "execution_count": 303, - "id": "bf1a2375", + "id": "9832d12b", "metadata": {}, - "outputs": [], "source": [ - "import scipy.stats as ss\n", - "from typing import Union\n", - "\n", - "def absolute_ttest(\n", - " control: Union[pd.DataFrame, pd.Series], \n", - " test: Union[pd.DataFrame, pd.Series], \n", - " control_before: Union[pd.DataFrame, pd.Series], \n", - " test_before: Union[pd.DataFrame, pd.Series]\n", - "):\n", - " \"\"\"Counts CUPED (Controlled-Experiment using Pre-Experiment Data) in absolute values.\n", - " \n", - " Args:\n", - " control: target data after pilot in control group\n", - " test: target data after pilot in test group\n", - " \n", - " Returns:\n", - " result: named tuple with pvalue, effect, ci_length, left_bound and right_bound\n", - " \"\"\"\n", - " theta = (np.cov(control, control_before)[0, 1] + np.cov(test, test_before)[0, 1]) \\\n", - " / (np.var(control_before) + np.var(test_before))\n", - "\n", - " control = control - theta * control_before\n", - " test = test - theta * test_before\n", - " \n", - " mean_control = np.mean(control)\n", - " mean_test = np.mean(test)\n", - " var_mean_control = np.var(control) / len(control)\n", - " var_mean_test = np.var(test) / len(test)\n", - " \n", - " difference_mean = mean_test - mean_control\n", - " difference_mean_var = var_mean_control + var_mean_test\n", - " difference_distribution = ss.norm(loc=difference_mean, scale=np.sqrt(difference_mean_var))\n", - "\n", - " left_bound, right_bound = difference_distribution.ppf([0.025, 0.975])\n", - " ci_length = (right_bound - left_bound)\n", - " pvalue = 2 * min(difference_distribution.cdf(0), difference_distribution.sf(0))\n", - " effect = difference_mean\n", - " result = ExperimentComparisonResults(pvalue, 
effect, ci_length, left_bound, right_bound)\n", - "\n", - " return result\n", - "\n", - "# https://github.com/DimaLunin/AB_lifehacks/blob/8fbbcc6a82440bdb593ac670994f1f8c7a5ef2f3/CUPED.ipynb#L57" + "### 3.2 Simple AB-test\n", + "To estimate effect without target data before pilot `calc_difference_method='ate'` can be used - effect will be estimated with \"diff in means\" method" ] }, { "cell_type": "code", - "execution_count": 304, - "id": "ac70f2bd", + "execution_count": 19, + "id": "ec6847ce", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
size
test2685
control2685
\n", + "
" + ], "text/plain": [ - "ExperimentComparisonResults(pvalue=1.0, effect=0.0, ci_length=2.973441563457471, left_bound=-1.4867207817287356, right_bound=1.4867207817287353)" + " size\n", + "test 2685\n", + "control 2685" ] }, - "execution_count": 304, "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "absolute_ttest(\n", - " data[data['group']=='control']['post_spends'],\n", - " data[data['group']=='test']['post_spends'],\n", - " data[data['group']=='control']['pre_spends'],\n", - " data[data['group']=='test']['pre_spends']\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "050ead63", - "metadata": {}, - "source": [ - "#### Relative" - ] - }, - { - "cell_type": "code", - "execution_count": 305, - "id": "d852dca8", - "metadata": {}, - "outputs": [], - "source": [ - "def relative_cuped(\n", - " control: Union[pd.DataFrame, pd.Series], \n", - " test: Union[pd.DataFrame, pd.Series], \n", - " control_before: Union[pd.DataFrame, pd.Series], \n", - " test_before: Union[pd.DataFrame, pd.Series]\n", - "):\n", - " theta = (np.cov(control, control_before)[0, 1] + np.cov(test, test_before)[0, 1]) /\\\n", - " (np.var(control_before) + np.var(test_before))\n", - "\n", - " control_cup = control - theta * control_before\n", - " test_cup = test - theta * test_before\n", - "\n", - " mean_den = np.mean(control)\n", - " mean_num = np.mean(test_cup) - np.mean(control_cup)\n", - " var_mean_den = np.var(control) / len(control)\n", - " var_mean_num = np.var(test_cup) / len(test_cup) + np.var(control_cup) / len(control_cup)\n", - "\n", - " cov = -np.cov(control_cup, control)[0, 1] / len(control)\n", - "\n", - " relative_mu = mean_num / mean_den\n", - " relative_var = var_mean_num / (mean_den ** 2) + var_mean_den * ((mean_num ** 2) / (mean_den ** 4))\\\n", - " - 2 * (mean_num / (mean_den ** 3)) * cov\n", - " \n", - " relative_distribution = ss.norm(loc=relative_mu, scale=np.sqrt(relative_var))\n", - " left_bound, right_bound = relative_distribution.ppf([0.025, 
0.975])\n", - " \n", - " ci_length = (right_bound - left_bound)\n", - " pvalue = 2 * min(relative_distribution.cdf(0), relative_distribution.sf(0))\n", - " effect = relative_mu\n", - " return ExperimentComparisonResults(pvalue, effect, ci_length, left_bound, right_bound)" - ] - }, - { - "cell_type": "code", - "execution_count": 306, - "id": "91ae9bfe", - "metadata": {}, - "outputs": [ + "output_type": "display_data" + }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
difference
ate0.98
\n", + "
" + ], "text/plain": [ - "ExperimentComparisonResults(pvalue=1.0, effect=0.0, ci_length=0.006949761458391225, left_bound=-0.0034748807291956124, right_bound=0.003474880729195612)" + " difference\n", + "ate 0.98" ] }, - "execution_count": 306, "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "relative_cuped(\n", - " data[data['group']=='control']['post_spends'],\n", - " data[data['group']=='test']['post_spends'],\n", - " data[data['group']=='control']['pre_spends'],\n", - " data[data['group']=='test']['pre_spends']\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "8d6c9119", - "metadata": {}, - "source": [ - "#### Diff in diff" - ] - }, - { - "cell_type": "markdown", - "id": "005d82a1", - "metadata": {}, - "source": [ - "$$\\widehat {ATE}^{DiD} = \\big( \\bar Y_{t=1, d=1} - \\bar Y_{t=1, d=0} \\big) - \\big( \\bar Y_{t=0, d=1} - \\bar Y_{t=0, d=0} \\big)$$" - ] - }, - { - "cell_type": "code", - "execution_count": 307, - "id": "f41e9626", - "metadata": {}, - "outputs": [], - "source": [ - "def diff_in_diff(data, target_col, target_col_before, groups_col):\n", - " \"\"\"Counts Difference in Difference.\n", - " \n", - " Metric uses pre-post analisys and counts differece in means in data before and after pilot:\n", - " ATE = (y_test_after - y_control_after) - (y_test_before - y_control_before)\n", - " \n", - " Args:\n", - " data: input data\n", - " target_col: column name of target after pilot\n", - " target_col_before: column name of target before pilot\n", - " groups_col: name of column with separation on test and control group\n", - " should contain values ('test', 'control')\n", - " \n", - " Returns:\n", - " did: value of difference in difference\n", - " \"\"\"\n", - " mean_test = np.mean(data[data[groups_col]=='test'][target_col]) \n", - " mean_control = np.mean(data[data[groups_col]=='control'][target_col])\n", - " \n", - " mean_test_before = np.mean(data[data[groups_col]=='test'][target_col_before]) \n", - " mean_control_before 
= np.mean(data[data[groups_col]=='control'][target_col_before])\n", - " did = (mean_test - mean_control) - (mean_test_before-mean_control_before)\n", - " \n", - " return did" - ] - }, - { - "cell_type": "code", - "execution_count": 308, - "id": "3df6d620", - "metadata": {}, - "outputs": [ + "output_type": "display_data" + }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
p_value
t_test0.21
mann_whitney0.08
\n", + "
" + ], "text/plain": [ - "0.0" + " p_value\n", + "t_test 0.21\n", + "mann_whitney 0.08" ] }, - "execution_count": 308, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "diff_in_diff(data, 'post_spends', 'pre_spends', 'group')" + "model = ABTest(calc_difference_method='ate')\n", + "model.execute(data=data_ab, target_field='post_spends', group_field='group')\n", + "\n", + "model.show_beautiful_result()" ] }, { "cell_type": "code", "execution_count": null, - "id": "5cd8835c", + "id": "6d77a694", "metadata": {}, "outputs": [], "source": [] diff --git a/lightautoml/addons/hypex/ABTesting/ab_tester.py b/lightautoml/addons/hypex/ABTesting/ab_tester.py index 7aa3bc04..9e6eb082 100644 --- a/lightautoml/addons/hypex/ABTesting/ab_tester.py +++ b/lightautoml/addons/hypex/ABTesting/ab_tester.py @@ -1,9 +1,9 @@ import warnings +from IPython.display import display from collections import namedtuple -from abc import ABC, abstractmethod from pathlib import Path from sklearn.utils import shuffle -from typing import Iterable, Union, Optional, List, Dict, Any +from typing import Iterable, Union, Optional, Dict, Any from tqdm.auto import tqdm @@ -11,20 +11,10 @@ import numpy as np from scipy.stats import ttest_ind, ks_2samp, mannwhitneyu -RANDOM_STATE = 52 - -ExperimentComparisonResults_cuped = namedtuple( - 'CUPED', - ['pvalue', 'effect', 'ci_length', 'left_bound', 'right_bound'] -) -ExperimentComparisonResults_cuped_abs = namedtuple( - 'CUPED_abs', - ['pvalue', 'effect', 'ci_length', 'left_bound', 'right_bound'] -) - def merge_groups( - test_group: Union[Iterable[pd.DataFrame], pd.DataFrame], control_group: Union[Iterable[pd.DataFrame], pd.DataFrame], + test_group: Union[Iterable[pd.DataFrame], pd.DataFrame], + control_group: Union[Iterable[pd.DataFrame], pd.DataFrame], ): """Merges test and control groups in one DataFrame and creates column "group". 
@@ -37,11 +27,6 @@ def merge_groups( Returns: merged_data: Concatted DataFrame """ - # if not isinstance(test_group, pd.DataFrame): - # test_group = pd.concat(test_group, ignore_index=True) - # if not isinstance(control_group, pd.DataFrame): - # control_group = pd.concat(control_group, ignore_index=True) - test_group.loc[:, "group"] = "test" control_group.loc[:, "group"] = "control" @@ -50,106 +35,15 @@ def merge_groups( return merged_data -def cuped( - test_data: pd.DataFrame, - control_data: pd.DataFrame, - target_field: str, - target_field_before: str -): - """Counts CUPED (Controlled-Experiment using Pre-Experiment Data) in absolute values. - - Metric uses pre-post analysis of target, uses to minimize variance of effect: - ATE = mean(test_cuped) - mean(control_cuped) - , where - test_cuped = target__test - theta * target_before__test - control_cuped = target__control - theta * target_before__control - , where - theta = (cov_test + cov_control) / (var_test + var_control) - , where - cov_test = cov(target__test, target_before__test) - cov_control = cov(target__control, target_before__control) - var_test = var(target_before__test) - var_control = var(target_before__control) - - Args: - test_data: input data of test group - Should include target before and after pilot - control_data: input data of control group - Should include target before and after pilot - target_field: column name of target after pilot - target_field_before: column name of target before pilot - - Returns: - result: named tuple with pvalue, effect, ci_length, left_bound and right_bound - """ - control = control_data[target_field] - control_before = control_data[target_field_before] - test = test_data[target_field] - test_before = test_data[target_field_before] - - theta = (np.cov(control, control_before)[0, 1] + np.cov(test, test_before)[0, 1]) \ - / (np.var(control_before) + np.var(test_before)) - - control = control - theta * control_before - test = test - theta * test_before - - mean_control 
= np.mean(control) - mean_test = np.mean(test) - var_mean_control = np.var(control) / len(control) - var_mean_test = np.var(test) / len(test) - - difference_mean = mean_test - mean_control - difference_mean_var = var_mean_control + var_mean_test - difference_distribution = ss.norm(loc=difference_mean, scale=np.sqrt(difference_mean_var)) - - left_bound, right_bound = difference_distribution.ppf([0.025, 0.975]) - ci_length = (right_bound - left_bound) - pvalue = 2 * min(difference_distribution.cdf(0), difference_distribution.sf(0)) - effect = difference_mean - result = ExperimentComparisonResults_cuped(pvalue, effect, ci_length, left_bound, right_bound) - - return result - - -def diff_in_diff( - test_data: pd.DataFrame, - control_data: pd.DataFrame, - target_field: str, - target_field_before: str -): - """Counts Difference in Difference. - - Metric uses pre-post analysis and counts difference in means in data before and after pilot: - ATE = (y_test_after - y_control_after) - (y_test_before - y_control_before) - - Args: - test_data: input data of test group - control_data: input data of control group - target_field: column name of target after pilot - target_field_before: column name of target before pilot - - Returns: - did: value of difference in difference - """ - mean_test = np.mean(test_data[target_field]) - mean_control = np.mean(control_data[target_field]) - - mean_test_before = np.mean(test_data[target_field_before]) - mean_control_before = np.mean(control_data[target_field_before]) - did = (mean_test - mean_control) - (mean_test_before - mean_control_before) - - return did - - class AATest: def __init__( - self, - data: pd.DataFrame, - target_fields: Union[Iterable[str], str], - info_cols: Union[Iterable[str], str] = None, - group_cols: Union[str, Iterable[str]] = None, - quant_field: str = None, - mode: str = "simple" + self, + data: pd.DataFrame, + target_fields: Union[Iterable[str], str], + info_cols: Union[Iterable[str], str] = None, + group_cols: 
Union[str, Iterable[str]] = None, + quant_field: str = None, + mode: str = "simple" ): """ @@ -190,7 +84,7 @@ def _preprocessing_data(self): init_cols = data.columns dont_binarize_cols = ( # collects names of columns that shouldn't be binarized - self.group_cols+[self.quant_field] + self.group_cols + [self.quant_field] if (self.group_cols is not None) and (self.quant_field is not None) else self.group_cols if self.group_cols is not None @@ -208,10 +102,12 @@ def _preprocessing_data(self): # fix if dummy_na is const=0 dummies_cols = set(data.columns) - set(init_cols) const_columns = [col for col in dummies_cols if data[col].nunique() <= 1] # choose constant_columns + + # drop constant dummy columns and info columns cols_to_drop = const_columns + (self.info_cols if self.info_cols is not None else []) self.data = data.drop(columns=cols_to_drop) - def __simple_mode(self, data: pd.DataFrame, random_state: int = RANDOM_STATE): + def __simple_mode(self, data: pd.DataFrame, random_state: int = None): """Separates data on A and B samples within simple mode. Separation performed to divide groups of equal sizes - equal amount of records @@ -240,7 +136,7 @@ def __simple_mode(self, data: pd.DataFrame, random_state: int = RANDOM_STATE): return result - def split(self, random_state: int = RANDOM_STATE) -> Dict: + def split(self, random_state: int = None) -> Dict: """Divides sample on two groups. 
Args: @@ -309,7 +205,7 @@ def _postprep_data(self, spit_indexes: Dict = None): return data - def sampling_metrics(self, alpha: float = 0.05, random_state: int = RANDOM_STATE): + def sampling_metrics(self, alpha: float = 0.05, random_state: int = None): """ Args: @@ -339,7 +235,7 @@ def sampling_metrics(self, alpha: float = 0.05, random_state: int = RANDOM_STATE t_result[f"{tf} a mean"] = ta.mean() t_result[f"{tf} b mean"] = tb.mean() t_result[f"{tf} ab delta %"] = (1 - t_result[f"{tf} a mean"] / t_result[f"{tf} b mean"]) * 100 - t_result[f"{tf} t_test p_value"] = ttest_ind(ta, tb).pvalue + t_result[f"{tf} t_test p_value"] = ttest_ind(ta, tb, nan_policy='omit').pvalue t_result[f"{tf} ks_test p_value"] = ks_2samp(ta, tb).pvalue t_result[f"{tf} t_test passed"] = t_result[f"{tf} t_test p_value"] > alpha t_result[f"{tf} ks_test passed"] = t_result[f"{tf} ks_test p_value"] > alpha @@ -351,13 +247,13 @@ def sampling_metrics(self, alpha: float = 0.05, random_state: int = RANDOM_STATE return result def search_dist_uniform_sampling( - self, - alpha: float = 0.05, - iterations: int = 10, - file_name: Union[Path, str] = None, - write_mode: str = "full", - write_step: int = None, - pbar: bool = True, + self, + alpha: float = 0.05, + iterations: int = 10, + file_name: Union[Path, str] = None, + write_mode: str = "full", + write_step: int = None, + pbar: bool = True, ) -> Optional[tuple[pd.DataFrame, dict[Any, dict]]]: """Chooses random_state for finding homogeneous distribution. @@ -426,9 +322,9 @@ def search_dist_uniform_sampling( class ABTest: def __init__( - self, - calc_difference_method: str = "all", - calc_p_value_method: str = "all", + self, + calc_difference_method: str = "all", + calc_p_value_method: str = "all", ): """Initializes the ABTest class. 
@@ -442,19 +338,21 @@ def __init__( 'cuped' - Controlled-Experiment using Pre-Experiment Data value, performs pre-post analysis (required values of target before pilot) calc_p_value_method: - The method used to calculate the p-value. Defaults to 'all'. + The method used to calculate the p-value. Defaults to 'all' """ self.calc_difference_method = calc_difference_method self.calc_p_value_method = calc_p_value_method + self.results = None @staticmethod def split_ab(data: pd.DataFrame, group_field: str) -> Dict[str, pd.DataFrame]: - """ - Splits a pandas DataFrame into two separate dataframes based on a specified group field. + """Splits a pandas DataFrame into two separate dataframes based on a specified group field. - Parameters: - data: The input dataframe to be split. - group_field: The column name representing the group field. + Args: + data: + The input dataframe to be split + group_field: + The column name representing the group field Returns: splitted_data: @@ -467,6 +365,91 @@ def split_ab(data: pd.DataFrame, group_field: str) -> Dict[str, pd.DataFrame]: } return splitted_data + @staticmethod + def cuped( + test_data: pd.DataFrame, + control_data: pd.DataFrame, + target_field: str, + target_field_before: str + ): + """Counts CUPED (Controlled-Experiment using Pre-Experiment Data) in absolute values. 
+ + Metric uses pre-post analysis of target, uses to minimize variance of effect: + ATE = mean(test_cuped) - mean(control_cuped) + , where + test_cuped = target__test - theta * target_before__test + control_cuped = target__control - theta * target_before__control + , where + theta = (cov_test + cov_control) / (var_test + var_control) + , where + cov_test = cov(target__test, target_before__test) + cov_control = cov(target__control, target_before__control) + var_test = var(target_before__test) + var_control = var(target_before__control) + + Args: + test_data: + Input data of test group + Should include target before and after pilot + control_data: + Input data of control group + Should include target before and after pilot + target_field: + Column name of target after pilot + target_field_before: + Column name of target before pilot + + Returns: + result: + Named tuple with pvalue, effect, ci_length, left_bound and right_bound + """ + control = control_data[target_field] + control_before = control_data[target_field_before] + test = test_data[target_field] + test_before = test_data[target_field_before] + + theta = ((np.cov(control, control_before)[0, 1] + np.cov(test, test_before)[0, 1]) + / (np.var(control_before) + np.var(test_before))) + + control_cuped = control - theta * control_before + test_cuped = test - theta * test_before + + mean_control = np.mean(control_cuped) + mean_test = np.mean(test_cuped) + + difference_mean = mean_test - mean_control + + return difference_mean + + @staticmethod + def diff_in_diff( + test_data: pd.DataFrame, + control_data: pd.DataFrame, + target_field: str, + target_field_before: str + ): + """Counts Difference in Difference. 
+ + Metric uses pre-post analysis and counts difference in means in data before and after pilot: + ATE = (y_test_after - y_control_after) - (y_test_before - y_control_before) + + Args: + test_data: input data of test group + control_data: input data of control group + target_field: column name of target after pilot + target_field_before: column name of target before pilot + + Returns: + did: value of difference in difference + """ + mean_test = np.mean(test_data[target_field]) + mean_control = np.mean(control_data[target_field]) + + mean_test_before = np.mean(test_data[target_field_before]) + mean_control_before = np.mean(control_data[target_field_before]) + did = (mean_test - mean_control) - (mean_test_before - mean_control_before) + + return did def calc_difference( self, @@ -474,38 +457,45 @@ def calc_difference( target_field: str, target_field_before: str = None ) -> Dict[str, float]: - """ - Calculates the difference between the target field values of the 'test' and 'control' dataframes. + """Calculates the difference between the target field values of the 'test' and 'control' dataframes. - Parameters: - splitted_data: A dictionary containing the 'test' and 'control' dataframes - target_field: The name of the target field contains data after pilot - target_field_before: The name of the target field contains data before pilot + Args: + splitted_data: + A dictionary containing the 'test' and 'control' dataframes + target_field: + The name of the target field contains data after pilot + target_field_before: + The name of the target field contains data before pilot Returns: - result: A dictionary containing the difference between the target field - values of the 'test' and 'control' dataframes. 
+ result: + A dictionary containing the difference between the target field + values of the 'test' and 'control' dataframes """ result = {} if self.calc_difference_method in {"all", "diff_in_diff", "cuped"}: if target_field_before is None: raise ValueError( - "For calculation metrics 'cuped' or 'diff_in_diff' field 'target_field_before' is required" + "For calculation metrics 'cuped' or 'diff_in_diff' field 'target_field_before' is required.\n" + "Metric 'ate'(=diff-in-means) can be used without 'target_field_before'" ) + if self.calc_difference_method in {"all", "ate"}: result["ate"] = ( splitted_data["test"][target_field].values - splitted_data["control"][target_field].values ).mean() + if self.calc_difference_method in {"all", "cuped"}: - result['cuped'] = cuped( + result['cuped'] = self.cuped( test_data=splitted_data["test"], control_data=splitted_data["control"], target_field=target_field, target_field_before=target_field_before ) + if self.calc_difference_method in {"all", "diff_in_diff"}: - result['diff_in_diff'] = diff_in_diff( + result['diff_in_diff'] = self.diff_in_diff( test_data=splitted_data["test"], control_data=splitted_data["control"], target_field=target_field, @@ -515,16 +505,20 @@ def calc_difference( return result def calc_p_value( - self, splitted_data: Dict[str, pd.DataFrame], target_field: str + self, splitted_data: Dict[str, pd.DataFrame], target_field: str ) -> Dict[str, float]: - """ - Calculates the p-value for a given data set. + """Calculates the p-value for a given data set. Args: - splitted_data (Dict[str, pd.DataFrame]): A dictionary containing the split data, where the keys are 'test' and 'control' and the values are pandas DataFrames. - target_field (str): The name of the target field. 
+ splitted_data: + A dictionary containing the split data, where the keys are 'test' and 'control' + and the values are pandas DataFrames + target_field: + The name of the target field Returns: - Dict[str, float]: A dictionary containing the calculated p-values, where the keys are 't_test' and 'mann_whitney' and the values are the corresponding p-values. + result: + A dictionary containing the calculated p-values, where the keys are 't_test' and 'mann_whitney' + and the values are the corresponding p-values """ result = {} if self.calc_p_value_method in {"all", "t_test"}: @@ -532,36 +526,47 @@ def calc_p_value( splitted_data["test"][target_field], splitted_data["control"][target_field], ).pvalue + if self.calc_p_value_method in {"all", "mann_whitney"}: result["mann_whitney"] = mannwhitneyu( splitted_data["test"][target_field], splitted_data["control"][target_field], ).pvalue + return result def execute( - self, data: pd.DataFrame, target_field: str, group_field: str + self, data: pd.DataFrame, target_field: str, group_field: str, target_field_before: str = None ) -> Dict[str, Dict[str, float]]: - """ - Executes the function by splitting the input data based on the group field and calculating the size, difference, and p-value. + """Splits the input data based on the group field and calculates the size, difference, and p-value. Parameters: - data (pd.DataFrame): The input data as a pandas DataFrame. - target_field (str): The target field to be analyzed. - group_field (str): The field used to split the data into groups. + data: The input data as a pandas DataFrame. + target_field: The target field to be analyzed. + group_field: The field used to split the data into groups. Returns: - Dict[str, Dict[str, float]]: A dictionary containing the size, difference, and p-value of the split data. - - 'size': A dictionary with the sizes of the test and control groups. - - 'difference': A dictionary with the calculated differences between the groups. 
- - 'p_value': A dictionary with the calculated p-values for each group. + results: + A dictionary containing the size, difference, and p-value of the split data + 'size': A dictionary with the sizes of the test and control groups + 'difference': A dictionary with the calculated differences between the groups + 'p_value': A dictionary with the calculated p-values for each group """ splitted_data = self.split_ab(data, group_field) - return { + + results = { "size": { "test": len(splitted_data["test"]), "control": len(splitted_data["control"]), }, - "difference": self.calc_difference(splitted_data, target_field), + "difference": self.calc_difference(splitted_data, target_field, target_field_before), "p_value": self.calc_p_value(splitted_data, target_field), } + + self.results = results + + return results + + def show_beautiful_result(self): + for k in self.results.keys(): + display(pd.DataFrame(self.results[k], index=[k]).T) diff --git a/lightautoml/addons/hypex/tests/test_ab.py b/lightautoml/addons/hypex/tests/test_ab.py index 766631db..f588ca1d 100644 --- a/lightautoml/addons/hypex/tests/test_ab.py +++ b/lightautoml/addons/hypex/tests/test_ab.py @@ -1,58 +1,69 @@ -import pytest -import pandas as pd from lightautoml.addons.hypex.ABTesting.ab_tester import ABTest - - -@pytest.fixture -def ab_test(): - return ABTest() - - -@pytest.fixture -def data(): - return pd.DataFrame( - {"group": ["test", "test", "control", "control"], "value": [1, 2, 3, 4]} +from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data + + +# def test_split_ab(): +# data = create_test_data() +# half_data = int(data.shape[0] / 2) +# data['group'] = ['test'] * half_data + ['control'] * half_data +# +# group_field = 'group' +# +# model = ABTest() +# splitted_data = model.split_ab(data, group_field) +# +# assert isinstance(splitted_data, dict), "result of split_ab is not dict" +# assert len(splitted_data) == 2, "split_ab contains not of 2 values" +# assert 
list(splitted_data.keys()) == ['test', 'control'], "changed keys in result of split_ab" +# +# +# def test_calc_difference(): +# data = create_test_data() +# half_data = int(data.shape[0] / 2) +# data['group'] = ['test'] * half_data + ['control'] * half_data +# +# group_field = 'group' +# target_field = 'post_spends' +# +# model = ABTest() +# splitted_data = model.split_ab(data, group_field) +# differences = model.calc_difference(splitted_data, target_field) +# +# assert isinstance(differences, dict), "result of calc_difference is not dict" + + +def test_calc_p_value(): + data = create_test_data() + half_data = int(data.shape[0] / 2) + data['group'] = ['test'] * half_data + ['control'] * half_data + + group_field = 'group' + target_field = 'post_spends' + + model = ABTest() + splitted_data = model.split_ab(data, group_field) + pvalues = model.calc_p_value(splitted_data, target_field) + + assert isinstance(pvalues, dict), "result of calc_p_value is not dict" + + +def test_execute(): + data = create_test_data() + half_data = int(data.shape[0] / 2) + data['group'] = ['test'] * half_data + ['control'] * half_data + + target_field = 'post_spends' + target_field_before = 'pre_spends' + group_field = 'group' + + model = ABTest() + result = model.execute( + data=data, + target_field=target_field, + target_field_before=target_field_before, + group_field=group_field ) - -@pytest.fixture -def target_field(): - return "value" - - -@pytest.fixture -def group_field(): - return "group" - - -def test_split_ab(ab_test, data, group_field): - expected_result = { - "test": pd.DataFrame({"group": ["test", "test"], "value": [1, 2]}), - "control": pd.DataFrame({"group": ["control", "control"], "value": [3, 4]}), - } - result = ab_test.split_ab(data, group_field) - assert result == expected_result - - -def test_calc_difference(ab_test, data, group_field, target_field): - splitted_data = ab_test.split_ab(data, group_field) - expected_result = {"ate": -1.0} - result = 
ab_test.calc_difference(splitted_data, target_field) - assert result == expected_result - - -def test_calc_p_value(ab_test, data, group_field, target_field): - splitted_data = ab_test.split_ab(data, group_field) - expected_result = {"t_test": 0.5714285714285714, "mann_whitney": 0.3333333333333333} - result = ab_test.calc_p_value(splitted_data, target_field) - assert result == expected_result - - -def test_execute(ab_test, data, group_field, target_field): - expected_result = { - "size": {"test": 2, "control": 2}, - "difference": {"ate": -1.0}, - "p_value": {"t_test": 0.5714285714285714, "mann_whitney": 0.3333333333333333}, - } - result = ab_test.execute(data, target_field, group_field) - assert result == expected_result + assert isinstance(result, dict), "result of func execution is not dict" + assert len(result) == 3, "result of execution is changed, len of dict was 3" + assert list(result.keys()) == ['size', 'difference', 'p_value'] diff --git a/lightautoml/addons/hypex/utils/tutorial_data_creation.py b/lightautoml/addons/hypex/utils/tutorial_data_creation.py index 781627f8..ddf6ae09 100644 --- a/lightautoml/addons/hypex/utils/tutorial_data_creation.py +++ b/lightautoml/addons/hypex/utils/tutorial_data_creation.py @@ -90,6 +90,9 @@ def create_test_data( """ if rs is not None: np.random.seed(rs) + + if (nan_cols is not None) and isinstance(nan_cols, str): + nan_cols = [nan_cols] # Simulating dataset with known effect size num_months = 12 From ceda4455e045dd8e869793451c627843da8ae64f Mon Sep 17 00:00:00 2001 From: 20810012 Date: Wed, 27 Sep 2023 16:39:23 +0300 Subject: [PATCH 18/20] Tutorial corrected --- Tutorial_13_ABtesting.ipynb | 1257 ++--------------------------------- 1 file changed, 49 insertions(+), 1208 deletions(-) diff --git a/Tutorial_13_ABtesting.ipynb b/Tutorial_13_ABtesting.ipynb index fb568b9e..23e55e37 100644 --- a/Tutorial_13_ABtesting.ipynb +++ b/Tutorial_13_ABtesting.ipynb @@ -1,25 +1,11 @@ { "cells": [ - { - "cell_type": "markdown", - "id": 
"e296125b", - "metadata": {}, - "source": [ - "1. Туториал AB-теста\n", - " - execute c all\n", - " - execute с ate\n", - " \n", - "2. Сделать функцию, которая краиво в датафреймах выводит метрики\n", - "3. Тесты сделать\n", - "4. " - ] - }, { "cell_type": "markdown", "id": "64e2de80", "metadata": {}, "source": [ - "# How to perform AA test\n", + "# How to perform AA and AB tests\n", "*AB-test is shown below*" ] }, @@ -83,210 +69,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "7b655d2d", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Length of na_step is less than length of columns. Used last value several times\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idsignup_monthtreatpre_spendspost_spendsagegenderindustry
0000488.00414.44NaNME-commerce
1300501.50424.3331.00NaNLogistics
21000522.50416.2264.00ME-commerce
31200472.00423.7843.00ME-commerce
41300508.50424.2236.00FE-commerce
...........................
5365999100482.50421.8923.00FE-commerce
5366999200491.50424.0044.00ME-commerce
5367999400486.00423.7827.00FLogistics
5368999600500.50430.8956.00FE-commerce
5369999731473.00534.1156.00MLogistics
\n", - "

5370 rows × 8 columns

\n", - "
" - ], - "text/plain": [ - " user_id signup_month treat pre_spends post_spends age gender \\\n", - "0 0 0 0 488.00 414.44 NaN M \n", - "1 3 0 0 501.50 424.33 31.00 NaN \n", - "2 10 0 0 522.50 416.22 64.00 M \n", - "3 12 0 0 472.00 423.78 43.00 M \n", - "4 13 0 0 508.50 424.22 36.00 F \n", - "... ... ... ... ... ... ... ... \n", - "5365 9991 0 0 482.50 421.89 23.00 F \n", - "5366 9992 0 0 491.50 424.00 44.00 M \n", - "5367 9994 0 0 486.00 423.78 27.00 F \n", - "5368 9996 0 0 500.50 430.89 56.00 F \n", - "5369 9997 3 1 473.00 534.11 56.00 M \n", - "\n", - " industry \n", - "0 E-commerce \n", - "1 Logistics \n", - "2 E-commerce \n", - "3 E-commerce \n", - "4 E-commerce \n", - "... ... \n", - "5365 E-commerce \n", - "5366 E-commerce \n", - "5367 Logistics \n", - "5368 E-commerce \n", - "5369 Logistics \n", - "\n", - "[5370 rows x 8 columns]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "data = create_test_data(rs=52, na_step=10, nan_cols=['age', 'gender'])\n", "data" @@ -311,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "bc8e4ac0", "metadata": {}, "outputs": [], @@ -332,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "701d20c0", "metadata": {}, "outputs": [], @@ -342,32 +128,17 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "560634e5", + "execution_count": null, + "id": "a7ee57bf", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b3dbdfb331e644d783741a09861635f2", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/10 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
random_statepost_spends a meanpost_spends b meanpost_spends ab delta %post_spends t_test p_valuepost_spends ks_test p_valuepost_spends t_test passedpost_spends ks_test passedpre_spends a meanpre_spends b meanpre_spends ab delta %pre_spends t_test p_valuepre_spends ks_test p_valuepre_spends t_test passedpre_spends ks_test passedmean_tests_score
00427.85427.850.001.001.00TrueTrue484.63484.630.001.001.00TrueTrue1.00
11427.67427.670.001.001.00TrueTrue484.81484.810.001.001.00TrueTrue1.00
22428.38428.380.001.001.00TrueTrue484.76484.760.001.001.00TrueTrue1.00
\n", - "" - ], - "text/plain": [ - " random_state post_spends a mean post_spends b mean \\\n", - "0 0 427.85 427.85 \n", - "1 1 427.67 427.67 \n", - "2 2 428.38 428.38 \n", - "\n", - " post_spends ab delta % post_spends t_test p_value \\\n", - "0 0.00 1.00 \n", - "1 0.00 1.00 \n", - "2 0.00 1.00 \n", - "\n", - " post_spends ks_test p_value post_spends t_test passed \\\n", - "0 1.00 True \n", - "1 1.00 True \n", - "2 1.00 True \n", - "\n", - " post_spends ks_test passed pre_spends a mean pre_spends b mean \\\n", - "0 True 484.63 484.63 \n", - "1 True 484.81 484.81 \n", - "2 True 484.76 484.76 \n", - "\n", - " pre_spends ab delta % pre_spends t_test p_value \\\n", - "0 0.00 1.00 \n", - "1 0.00 1.00 \n", - "2 0.00 1.00 \n", - "\n", - " pre_spends ks_test p_value pre_spends t_test passed \\\n", - "0 1.00 True \n", - "1 1.00 True \n", - "2 1.00 True \n", - "\n", - " pre_spends ks_test passed mean_tests_score \n", - "0 True 1.00 \n", - "1 True 1.00 \n", - "2 True 1.00 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "experiment_result.head(3)" ] }, { "cell_type": "markdown", - "id": "e40adf5e", + "id": "76897df9", "metadata": {}, "source": [ "`dict_of_datas` is a dictionary with random_states as keys and dataframes as values.
\n", @@ -540,102 +168,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "cac4e650", "metadata": { "scrolled": false }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idsignup_monthtreatpre_spendspost_spendsagegenderindustrygroup
0300501.50424.3331.00NaNLogisticstest
11000522.50416.2264.00ME-commercetest
21200472.00423.7843.00ME-commercetest
\n", - "
" - ], - "text/plain": [ - " user_id signup_month treat pre_spends post_spends age gender \\\n", - "0 3 0 0 501.50 424.33 31.00 NaN \n", - "1 10 0 0 522.50 416.22 64.00 M \n", - "2 12 0 0 472.00 423.78 43.00 M \n", - "\n", - " industry group \n", - "0 Logistics test \n", - "1 E-commerce test \n", - "2 E-commerce test " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dict_of_datas[0].head(3)" ] @@ -651,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "01265e9e", "metadata": {}, "outputs": [], @@ -661,7 +199,7 @@ }, { "cell_type": "markdown", - "id": "62adf6e2", + "id": "016ef764", "metadata": {}, "source": [ "To perform single experiment you can use `sampling_metrics()`" @@ -669,8 +207,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "ce5cf887", + "execution_count": null, + "id": "f1ed6dc9", "metadata": {}, "outputs": [], "source": [ @@ -680,7 +218,7 @@ }, { "cell_type": "markdown", - "id": "128c5b68", + "id": "7592fb09", "metadata": {}, "source": [ "The results contains the same info as in multisampling, but on one experiment" @@ -688,253 +226,22 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "bad5e42e", "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'random_state': 11,\n", - " 'post_spends a mean': 427.78932340161515,\n", - " 'post_spends b mean': 427.78932340161515,\n", - " 'post_spends ab delta %': 0.0,\n", - " 'post_spends t_test p_value': 1.0,\n", - " 'post_spends ks_test p_value': 1.0,\n", - " 'post_spends t_test passed': True,\n", - " 'post_spends ks_test passed': True,\n", - " 'pre_spends a mean': 484.9912476722533,\n", - " 'pre_spends b mean': 484.9912476722533,\n", - " 'pre_spends ab delta %': 0.0,\n", - " 'pre_spends t_test p_value': 1.0,\n", - " 'pre_spends ks_test p_value': 1.0,\n", - " 'pre_spends t_test passed': True,\n", - " 'pre_spends 
ks_test passed': True,\n", - " 'mean_tests_score': 1.0}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "metrics" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "a9c3c513", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idsignup_monthtreatpre_spendspost_spendsagegenderindustrygroup
0300501.50424.3331.00NaNLogisticstest
11400497.00421.7826.00MLogisticstest
22100489.00433.1130.00ME-commercetest
32831479.50527.8920.00NaNE-commercetest
42900505.00414.3330.00ME-commercetest
..............................
5365998800501.50423.2255.00FLogisticscontrol
5366999000490.00426.0018.00ME-commercecontrol
5367999100482.50421.8923.00FE-commercecontrol
5368999200491.50424.0044.00ME-commercecontrol
5369999600500.50430.8956.00FE-commercecontrol
\n", - "

5370 rows × 9 columns

\n", - "
" - ], - "text/plain": [ - " user_id signup_month treat pre_spends post_spends age gender \\\n", - "0 3 0 0 501.50 424.33 31.00 NaN \n", - "1 14 0 0 497.00 421.78 26.00 M \n", - "2 21 0 0 489.00 433.11 30.00 M \n", - "3 28 3 1 479.50 527.89 20.00 NaN \n", - "4 29 0 0 505.00 414.33 30.00 M \n", - "... ... ... ... ... ... ... ... \n", - "5365 9988 0 0 501.50 423.22 55.00 F \n", - "5366 9990 0 0 490.00 426.00 18.00 M \n", - "5367 9991 0 0 482.50 421.89 23.00 F \n", - "5368 9992 0 0 491.50 424.00 44.00 M \n", - "5369 9996 0 0 500.50 430.89 56.00 F \n", - "\n", - " industry group \n", - "0 Logistics test \n", - "1 Logistics test \n", - "2 E-commerce test \n", - "3 E-commerce test \n", - "4 E-commerce test \n", - "... ... ... \n", - "5365 Logistics control \n", - "5366 E-commerce control \n", - "5367 E-commerce control \n", - "5368 E-commerce control \n", - "5369 E-commerce control \n", - "\n", - "[5370 rows x 9 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dict_of_datas[random_state]" ] @@ -949,7 +256,7 @@ }, { "cell_type": "markdown", - "id": "8cb96834", + "id": "55ba8a60", "metadata": {}, "source": [ "To perform experiment that separates samples by groups `group_col` can be used" @@ -957,7 +264,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "2fba205a", "metadata": {}, "outputs": [], @@ -970,8 +277,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "id": "6c42a3c3", + "execution_count": null, + "id": "a38ff403", "metadata": {}, "outputs": [], "source": [ @@ -980,32 +287,17 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "6155253f", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bf931be558b64216b9ca76eb3eb28d24", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/10 [00:00\n", - "\n", - "\n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_id
industrygroup
E-commercecontrol1351
test1351
Logisticscontrol1333
test1333
\n", - "" - ], - "text/plain": [ - " user_id\n", - "industry group \n", - "E-commerce control 1351\n", - " test 1351\n", - "Logistics control 1333\n", - " test 1333" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dict_of_datas[0].groupby(['industry', 'group'])[['user_id']].count()" ] @@ -1100,7 +325,7 @@ }, { "cell_type": "markdown", - "id": "eecf2909", + "id": "c36b0596", "metadata": {}, "source": [ "### 3.0 Data\n", @@ -1109,100 +334,10 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "50e64f07", + "execution_count": null, + "id": "65cbecda", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idsignup_monthtreatpre_spendspost_spendsagegenderindustrygroup
0000488.00414.44NaNME-commercetest
1300501.50424.3331.00NaNLogisticstest
21000522.50416.2264.00ME-commercetest
\n", - "
" - ], - "text/plain": [ - " user_id signup_month treat pre_spends post_spends age gender \\\n", - "0 0 0 0 488.00 414.44 NaN M \n", - "1 3 0 0 501.50 424.33 31.00 NaN \n", - "2 10 0 0 522.50 416.22 64.00 M \n", - "\n", - " industry group \n", - "0 E-commerce test \n", - "1 Logistics test \n", - "2 E-commerce test " - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "data_ab = data.copy()\n", "\n", @@ -1213,7 +348,7 @@ }, { "cell_type": "markdown", - "id": "db1bcefa", + "id": "ee88dfc9", "metadata": {}, "source": [ "### 3.1 Full AB-test\n", @@ -1224,26 +359,10 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "4108a137", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'size': {'test': 2685, 'control': 2685},\n", - " 'difference': {'ate': 0.9805090006207325,\n", - " 'cuped': 0.9764245308837189,\n", - " 'diff_in_diff': 0.39224084419458904},\n", - " 'p_value': {'t_test': 0.20533212744131019,\n", - " 'mann_whitney': 0.08089945933651932}}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "model = ABTest()\n", "results = model.execute(\n", @@ -1257,7 +376,7 @@ }, { "cell_type": "markdown", - "id": "9ae681ac", + "id": "f1e9bcd4", "metadata": {}, "source": [ "To see results in more convenient way `show_beautiful_result` can be used" @@ -1265,161 +384,17 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "bcca83f9", + "execution_count": null, + "id": "2e5ca229", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
size
test2685
control2685
\n", - "
" - ], - "text/plain": [ - " size\n", - "test 2685\n", - "control 2685" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
difference
ate0.98
cuped0.98
diff_in_diff0.39
\n", - "
" - ], - "text/plain": [ - " difference\n", - "ate 0.98\n", - "cuped 0.98\n", - "diff_in_diff 0.39" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
p_value
t_test0.21
mann_whitney0.08
\n", - "
" - ], - "text/plain": [ - " p_value\n", - "t_test 0.21\n", - "mann_whitney 0.08" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "model.show_beautiful_result()" ] }, { "cell_type": "markdown", - "id": "9832d12b", + "id": "cd2d5d81", "metadata": {}, "source": [ "### 3.2 Simple AB-test\n", @@ -1428,144 +403,10 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "ec6847ce", + "execution_count": null, + "id": "b1296c67", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
size
test2685
control2685
\n", - "
" - ], - "text/plain": [ - " size\n", - "test 2685\n", - "control 2685" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
difference
ate0.98
\n", - "
" - ], - "text/plain": [ - " difference\n", - "ate 0.98" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
p_value
t_test0.21
mann_whitney0.08
\n", - "
" - ], - "text/plain": [ - " p_value\n", - "t_test 0.21\n", - "mann_whitney 0.08" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "model = ABTest(calc_difference_method='ate')\n", "model.execute(data=data_ab, target_field='post_spends', group_field='group')\n", @@ -1576,7 +417,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6d77a694", + "id": "43482b50", "metadata": {}, "outputs": [], "source": [] From 58ff259fab0a1a48274c1ede4523761b799aff3a Mon Sep 17 00:00:00 2001 From: 20810012 Date: Fri, 29 Sep 2023 12:49:24 +0300 Subject: [PATCH 19/20] Tutorial corrected --- Tutorial_13_ABtesting.ipynb | 1241 +++++++++++++++++++++++++++++++++-- 1 file changed, 1193 insertions(+), 48 deletions(-) diff --git a/Tutorial_13_ABtesting.ipynb b/Tutorial_13_ABtesting.ipynb index 23e55e37..ac7a20de 100644 --- a/Tutorial_13_ABtesting.ipynb +++ b/Tutorial_13_ABtesting.ipynb @@ -69,10 +69,210 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "7b655d2d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Length of na_step is less than length of columns. Used last value several times\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idsignup_monthtreatpre_spendspost_spendsagegenderindustry
0000488.00414.44NaNME-commerce
1300501.50424.3331.00NaNLogistics
21000522.50416.2264.00ME-commerce
31200472.00423.7843.00ME-commerce
41300508.50424.2236.00FE-commerce
...........................
5365999100482.50421.8923.00FE-commerce
5366999200491.50424.0044.00ME-commerce
5367999400486.00423.7827.00FLogistics
5368999600500.50430.8956.00FE-commerce
5369999731473.00534.1156.00MLogistics
\n", + "

5370 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " user_id signup_month treat pre_spends post_spends age gender \\\n", + "0 0 0 0 488.00 414.44 NaN M \n", + "1 3 0 0 501.50 424.33 31.00 NaN \n", + "2 10 0 0 522.50 416.22 64.00 M \n", + "3 12 0 0 472.00 423.78 43.00 M \n", + "4 13 0 0 508.50 424.22 36.00 F \n", + "... ... ... ... ... ... ... ... \n", + "5365 9991 0 0 482.50 421.89 23.00 F \n", + "5366 9992 0 0 491.50 424.00 44.00 M \n", + "5367 9994 0 0 486.00 423.78 27.00 F \n", + "5368 9996 0 0 500.50 430.89 56.00 F \n", + "5369 9997 3 1 473.00 534.11 56.00 M \n", + "\n", + " industry \n", + "0 E-commerce \n", + "1 Logistics \n", + "2 E-commerce \n", + "3 E-commerce \n", + "4 E-commerce \n", + "... ... \n", + "5365 E-commerce \n", + "5366 E-commerce \n", + "5367 Logistics \n", + "5368 E-commerce \n", + "5369 Logistics \n", + "\n", + "[5370 rows x 8 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data = create_test_data(rs=52, na_step=10, nan_cols=['age', 'gender'])\n", "data" @@ -97,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "bc8e4ac0", "metadata": {}, "outputs": [], @@ -118,7 +318,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "701d20c0", "metadata": {}, "outputs": [], @@ -128,17 +328,32 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "a7ee57bf", + "execution_count": 5, + "id": "a3d70bf6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6d5784cfedd64b44acc349ebd8e55a81", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/10 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
random_statepost_spends a meanpost_spends b meanpost_spends ab delta %post_spends t_test p_valuepost_spends ks_test p_valuepost_spends t_test passedpost_spends ks_test passedpre_spends a meanpre_spends b meanpre_spends ab delta %pre_spends t_test p_valuepre_spends ks_test p_valuepre_spends t_test passedpre_spends ks_test passedmean_tests_score
00427.85427.850.001.001.00TrueTrue484.63484.630.001.001.00TrueTrue1.00
11427.67427.670.001.001.00TrueTrue484.81484.810.001.001.00TrueTrue1.00
22428.38428.380.001.001.00TrueTrue484.76484.760.001.001.00TrueTrue1.00
\n", + "" + ], + "text/plain": [ + " random_state post_spends a mean post_spends b mean \\\n", + "0 0 427.85 427.85 \n", + "1 1 427.67 427.67 \n", + "2 2 428.38 428.38 \n", + "\n", + " post_spends ab delta % post_spends t_test p_value \\\n", + "0 0.00 1.00 \n", + "1 0.00 1.00 \n", + "2 0.00 1.00 \n", + "\n", + " post_spends ks_test p_value post_spends t_test passed \\\n", + "0 1.00 True \n", + "1 1.00 True \n", + "2 1.00 True \n", + "\n", + " post_spends ks_test passed pre_spends a mean pre_spends b mean \\\n", + "0 True 484.63 484.63 \n", + "1 True 484.81 484.81 \n", + "2 True 484.76 484.76 \n", + "\n", + " pre_spends ab delta % pre_spends t_test p_value \\\n", + "0 0.00 1.00 \n", + "1 0.00 1.00 \n", + "2 0.00 1.00 \n", + "\n", + " pre_spends ks_test p_value pre_spends t_test passed \\\n", + "0 1.00 True \n", + "1 1.00 True \n", + "2 1.00 True \n", + "\n", + " pre_spends ks_test passed mean_tests_score \n", + "0 True 1.00 \n", + "1 True 1.00 \n", + "2 True 1.00 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "experiment_result.head(3)" ] }, { "cell_type": "markdown", - "id": "76897df9", + "id": "d9f415c2", "metadata": {}, "source": [ "`dict_of_datas` is a dictionary with random_states as keys and dataframes as values.
\n", @@ -168,12 +526,102 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "cac4e650", "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idsignup_monthtreatpre_spendspost_spendsagegenderindustrygroup
0300501.50424.3331.00NaNLogisticstest
11000522.50416.2264.00ME-commercetest
21200472.00423.7843.00ME-commercetest
\n", + "
" + ], + "text/plain": [ + " user_id signup_month treat pre_spends post_spends age gender \\\n", + "0 3 0 0 501.50 424.33 31.00 NaN \n", + "1 10 0 0 522.50 416.22 64.00 M \n", + "2 12 0 0 472.00 423.78 43.00 M \n", + "\n", + " industry group \n", + "0 Logistics test \n", + "1 E-commerce test \n", + "2 E-commerce test " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dict_of_datas[0].head(3)" ] @@ -189,7 +637,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "01265e9e", "metadata": {}, "outputs": [], @@ -199,7 +647,7 @@ }, { "cell_type": "markdown", - "id": "016ef764", + "id": "c4a1cd70", "metadata": {}, "source": [ "To perform single experiment you can use `sampling_metrics()`" @@ -207,8 +655,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "f1ed6dc9", + "execution_count": 9, + "id": "6f1a8cf6", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +666,7 @@ }, { "cell_type": "markdown", - "id": "7592fb09", + "id": "4971e2e8", "metadata": {}, "source": [ "The results contains the same info as in multisampling, but on one experiment" @@ -226,22 +674,253 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "bad5e42e", "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'random_state': 11,\n", + " 'post_spends a mean': 427.78932340161515,\n", + " 'post_spends b mean': 427.78932340161515,\n", + " 'post_spends ab delta %': 0.0,\n", + " 'post_spends t_test p_value': 1.0,\n", + " 'post_spends ks_test p_value': 1.0,\n", + " 'post_spends t_test passed': True,\n", + " 'post_spends ks_test passed': True,\n", + " 'pre_spends a mean': 484.9912476722533,\n", + " 'pre_spends b mean': 484.9912476722533,\n", + " 'pre_spends ab delta %': 0.0,\n", + " 'pre_spends t_test p_value': 1.0,\n", + " 'pre_spends ks_test p_value': 1.0,\n", + " 'pre_spends t_test passed': True,\n", + " 'pre_spends 
ks_test passed': True,\n", + " 'mean_tests_score': 1.0}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "metrics" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "a9c3c513", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idsignup_monthtreatpre_spendspost_spendsagegenderindustrygroup
0300501.50424.3331.00NaNLogisticstest
11400497.00421.7826.00MLogisticstest
22100489.00433.1130.00ME-commercetest
32831479.50527.8920.00NaNE-commercetest
42900505.00414.3330.00ME-commercetest
..............................
5365998800501.50423.2255.00FLogisticscontrol
5366999000490.00426.0018.00ME-commercecontrol
5367999100482.50421.8923.00FE-commercecontrol
5368999200491.50424.0044.00ME-commercecontrol
5369999600500.50430.8956.00FE-commercecontrol
\n", + "

5370 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " user_id signup_month treat pre_spends post_spends age gender \\\n", + "0 3 0 0 501.50 424.33 31.00 NaN \n", + "1 14 0 0 497.00 421.78 26.00 M \n", + "2 21 0 0 489.00 433.11 30.00 M \n", + "3 28 3 1 479.50 527.89 20.00 NaN \n", + "4 29 0 0 505.00 414.33 30.00 M \n", + "... ... ... ... ... ... ... ... \n", + "5365 9988 0 0 501.50 423.22 55.00 F \n", + "5366 9990 0 0 490.00 426.00 18.00 M \n", + "5367 9991 0 0 482.50 421.89 23.00 F \n", + "5368 9992 0 0 491.50 424.00 44.00 M \n", + "5369 9996 0 0 500.50 430.89 56.00 F \n", + "\n", + " industry group \n", + "0 Logistics test \n", + "1 Logistics test \n", + "2 E-commerce test \n", + "3 E-commerce test \n", + "4 E-commerce test \n", + "... ... ... \n", + "5365 Logistics control \n", + "5366 E-commerce control \n", + "5367 E-commerce control \n", + "5368 E-commerce control \n", + "5369 E-commerce control \n", + "\n", + "[5370 rows x 9 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dict_of_datas[random_state]" ] @@ -256,7 +935,7 @@ }, { "cell_type": "markdown", - "id": "55ba8a60", + "id": "e3a32245", "metadata": {}, "source": [ "To perform experiment that separates samples by groups `group_col` can be used" @@ -264,7 +943,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "2fba205a", "metadata": {}, "outputs": [], @@ -277,8 +956,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "a38ff403", + "execution_count": 13, + "id": "b5896bf8", "metadata": {}, "outputs": [], "source": [ @@ -287,17 +966,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "6155253f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c47831b3290c4ccc82973cf18d023597", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/10 [00:00\n", + "\n", + "\n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_id
industrygroup
E-commercecontrol1351
test1351
Logisticscontrol1333
test1333
\n", + "" + ], + "text/plain": [ + " user_id\n", + "industry group \n", + "E-commerce control 1351\n", + " test 1351\n", + "Logistics control 1333\n", + " test 1333" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dict_of_datas[0].groupby(['industry', 'group'])[['user_id']].count()" ] @@ -325,7 +1086,7 @@ }, { "cell_type": "markdown", - "id": "c36b0596", + "id": "0bb6fece", "metadata": {}, "source": [ "### 3.0 Data\n", @@ -334,10 +1095,100 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "65cbecda", + "execution_count": 16, + "id": "6f5a8a1f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idsignup_monthtreatpre_spendspost_spendsagegenderindustrygroup
0000488.00414.44NaNME-commercetest
1300501.50424.3331.00NaNLogisticstest
21000522.50416.2264.00ME-commercetest
\n", + "
" + ], + "text/plain": [ + " user_id signup_month treat pre_spends post_spends age gender \\\n", + "0 0 0 0 488.00 414.44 NaN M \n", + "1 3 0 0 501.50 424.33 31.00 NaN \n", + "2 10 0 0 522.50 416.22 64.00 M \n", + "\n", + " industry group \n", + "0 E-commerce test \n", + "1 Logistics test \n", + "2 E-commerce test " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data_ab = data.copy()\n", "\n", @@ -348,7 +1199,7 @@ }, { "cell_type": "markdown", - "id": "ee88dfc9", + "id": "690ceec5", "metadata": {}, "source": [ "### 3.1 Full AB-test\n", @@ -359,10 +1210,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "4108a137", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'size': {'test': 2685, 'control': 2685},\n", + " 'difference': {'ate': 0.9805090006207325,\n", + " 'cuped': 0.9764245308837189,\n", + " 'diff_in_diff': 0.39224084419458904},\n", + " 'p_value': {'t_test': 0.20533212744131019,\n", + " 'mann_whitney': 0.08089945933651932}}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "model = ABTest()\n", "results = model.execute(\n", @@ -376,7 +1243,7 @@ }, { "cell_type": "markdown", - "id": "f1e9bcd4", + "id": "05487531", "metadata": {}, "source": [ "To see results in more convenient way `show_beautiful_result` can be used" @@ -384,17 +1251,161 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "2e5ca229", + "execution_count": 18, + "id": "9dd905e8", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
size
test2685
control2685
\n", + "
" + ], + "text/plain": [ + " size\n", + "test 2685\n", + "control 2685" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
difference
ate0.98
cuped0.98
diff_in_diff0.39
\n", + "
" + ], + "text/plain": [ + " difference\n", + "ate 0.98\n", + "cuped 0.98\n", + "diff_in_diff 0.39" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
p_value
t_test0.21
mann_whitney0.08
\n", + "
" + ], + "text/plain": [ + " p_value\n", + "t_test 0.21\n", + "mann_whitney 0.08" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "model.show_beautiful_result()" ] }, { "cell_type": "markdown", - "id": "cd2d5d81", + "id": "ea252142", "metadata": {}, "source": [ "### 3.2 Simple AB-test\n", @@ -403,10 +1414,144 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "b1296c67", + "execution_count": 19, + "id": "0ab77779", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
size
test2685
control2685
\n", + "
" + ], + "text/plain": [ + " size\n", + "test 2685\n", + "control 2685" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
difference
ate0.98
\n", + "
" + ], + "text/plain": [ + " difference\n", + "ate 0.98" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
p_value
t_test0.21
mann_whitney0.08
\n", + "
" + ], + "text/plain": [ + " p_value\n", + "t_test 0.21\n", + "mann_whitney 0.08" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "model = ABTest(calc_difference_method='ate')\n", "model.execute(data=data_ab, target_field='post_spends', group_field='group')\n", @@ -417,7 +1562,7 @@ { "cell_type": "code", "execution_count": null, - "id": "43482b50", + "id": "6193f7f0", "metadata": {}, "outputs": [], "source": [] From 197a952a99d0404ef842378efa25851849f6a6a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A2=D0=B8=D1=85=D0=BE=D0=BC=D0=B8=D1=80=D0=BE=D0=B2=20?= =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=90=D0=BB=D0=B5?= =?UTF-8?q?=D0=BA=D1=81=D0=B5=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 4 Oct 2023 10:13:38 +0300 Subject: [PATCH 20/20] Updated whl --- dist/lightautoml-0.3.8b1-py3-none-any.whl | Bin 391997 -> 401051 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/dist/lightautoml-0.3.8b1-py3-none-any.whl b/dist/lightautoml-0.3.8b1-py3-none-any.whl index 4f1be7056785d4331039ff60162165efb75b7de4..bb9f39d28c05cc1d0f63678b109dac17f9f42068 100644 GIT binary patch delta 29795 zcmV)3K+C_q_7|JI7_h7ildWY8lh9xt2|+?sWpi|CZfBFO*BlLcJT-IQ6953ASO5So zlhI%te?dZ2Wpi|CZf7rHVqbJ+b97~LE^v9>TwjmdMv{M@Ptm7fG2~jKI05cu?i8HY zj)6QRXC&U;2i-uR$d*KyA{jR6iQ@BjzpCp0>?Wm|IKbTrK|Er2S5;S6|EjvWsPfb{ zUC$n~t}g3?xY>8jiT&{I+#j2IRg|KwvNJ1Xe<$@-_Q{*7;(1>-wb1Q#cH%`pv=x_K z+N?iT-Sk7xyKGnSCHs2~443RbTV$M7 zOZKMB`z3o-pKsK_UyIW!8~R3%#RoSx>Pwr|MJ5>h*B1I&J)GJzKB`+KU0$~5mFTnH zf5_JNT=d(rE|%6?!dQ)t{Ga9+%WZ_!SwK6 zPGF-av}udgo2<{?cA&b4CG)=;0(n#ST~j@aMtV_sY(ae&s~e*E+IvX!f3gzHq1m$qa#;|Gh2pbdAG2!ULJfY^9mIW9 zMJNH%=rfSFm*P#SLihFNA{qS5`=P5H9<(^cwtyk`AbXu>u&jlhu-4O!FVL!L^3DBH zPxFDT)$DEpd177l?4?kS4A2Nmm6IIF#`0VGG+{-{L)~;7&5!@Ke%}o|mBc!lf1yhg z%rLJqA=s-|?}4D4I~wK|Mwk6|YmE{=f>-;c_e3hkH6g!UuE};^@~RMlC5pZSaq0iw zsIuNSTQH%y5ulEr!2bM!qycXvbX_uUczooqLsly}?~#7A)2S)AZe0l2>lK%)np&$t zgD_uwqcsZip5W()x*d9Io6%k{f9fqhXOCrn)V5DMCKNds`7=M1uww!Bit=MwaE1s~ 
zaoEu86>Y-5sV5Px__>vnzh!rdfMC0`0tP^oD$5a#IViPa*{K=oK1v2QV#o8QD+EjO zs^m2w=znW`!Cfk!rHD03p13%>tjg+~7bAg0`8!8CYOIxPKU4pe*{`_in(M6`zA&^ zW^KxpjL}&EzVdh|dZmLRWQuCtaOg?~F2a&1z`1UBWepNXv*;P#i1qN)84T z$$GV$>RW;WO`z~pF}9r-Y%^Z?2j4O7Su!3}3c?%oRw;o{5*Ux$#iK{&HzGJ@IKwZb zby@y7;X-=e)6`kle|f>I#X-Nw3P!s+rUU^Ef`_t}ge;@znl@1@ybfumOKGgzI#V%M zgf0=H_$x8ix(R7Ph*R->??iA*)|H22@5OYkqOP4qAf%(xefb-yOLagc!Cn;o`la0r zl9b*Cg7=BLYf^B(_{{7oDJ0)Q34fA9!+*iqc$85Jf7A?od05p$UH&@oBxV1; zX5T!}&O0^@7zfcrLkc$SR}5FnbWkv&x#K25AlDBZAuF-%n=K7qk7RzKHkS1s+6J8` z)~?ikcu0Lrb`8O^D3`Yj(pwp7Z4ur{fMqP`W*tnbIc@P23L$C_OJ_c)z0U_FzsML}Y99egc`YA~Y z7Ut7J=Lc}u;vURoX-|AY2GINTZ)a;o2Q}jC?tMqrZ+Jf;?EjNz!QuaX7umC0CjMFP%ngk(s z_Z?;4e>^aHqYi5J?AbpyYUj^xb)OAlEwJJT-XRvD7bBUj+}F5g-91c^sW}p%P|^I? zb%ChQ?2qpJsl%cIOXXGQ+wMG$6MdsY&%0f+13E%AK0GgKzI4DqzXpAig+6O9mL><1 z#pfMkID87UTb3ao!4xFOQMlK`nl&p%zZsQu(cCyzvkSR@*f1X;+ zf0TXab?^;mViVJ3)QNmSNF&_is0dc|t(}2WefHzC!Qp~b#8_{F-%O<^+5HysA z$~{FZ(IXTNla3yO4RLH9*&rbDz}U@Ze+MaEDBze=sML>>!%F8u8()ZMY$EI-Y&trb zkA37>NfF`&Wx?c}WM0m@r;A29U$HNs2V?9whxI5oOm}0hI>zg4jaE^h*`fPJFdDD?k-dMwMf3Wz|!qpYT>F!r2Vy6R$wnuqFileo6Tu}B3y{Bz` zH#Hz)*;njVvUYDI!R{-?3@9+YX)RH*!C5O`4<|Le|*F4rlt6L zl3&XH%D#E|a+X!UC5&vhbipfS)E1IpEYR{Zs{ZEuoX6z;`kJ}Io~w~lSw z;0RZiI*TpwCAsTXT9uS!e;n%&x+r)AV9~VTkZrdg|E=X+i4i6AnAnw{FK28B&!IEe z51gVfjcZ6;s`rRFGd~E&QVXi`(r)dWmwu0Z31KW=nduS@tn-?w>3Hmb=GH5{!0Y=( zte5C{>$+3FPrJ*yd$wyRRG?cGYNqUzt`OUh_uBZ$bwayW^;!P)fAH|&-;mO6gz$U; z;oQEIbp^-f)Es!tn*meAFo$wCAh+2hm!rtQCH^#=c7Wa(8Jb=M?6k5>EvgAfY!f>1 za5)3|Um~cm%N&Yt>JTcbK@r(UuB=qB1defZe{u)sEc5iStSY=m#jeGY?T0#7<}Fm8 zbjoM~_CI1Y3T&y*e?)x?S{N3SkZ3SLX1Cr%2}6@YHC*XVK8ggj*+|@wUniCZK4$<^W3}7<9c5De7g76hiHB)nV3Xwn^Ps zGg@MU6qe4?7^~LF^kGWH(;iz0;tS}cJqj96Vf2}bxgzpEQ*o`*^@=^2{ z!nBMy-R!QlR-0y(7{?Zw`lmjR)2x0LW54*PRnF}vOEQwPSjj!0tX%J{0UhkrHZrj5 zhjJwxW$2m?%VrZ^4<}R$_o7|A8t!MwU$_pSvV(Dn$VIsy_ zvUs;~%ruIWeS^acF4`FF2>2tBqaxWrW!^f3@_DD1Sb+UX7WMg0VnOElWSV2525?B) zIRwShE@V8{hnCfeB~;Ezq^rK!0^%eNzRB8-w;5nmf0$UNynD(1;&4OWFTWVeB|Gx0 
z;GO=4YF#g~#oWDwf;tzEqk~HXmA9ZT(u#f2`ug-S@_K!h&F&^A4Cg52^6;D(Szp z*fINkx;?dbL}10UYtyyI)*)3wrErH9eLYLbmdpv?U={ECzRo6TU)daEz zEFIbLq##ExebQ6Jg28$~yi(hnJf4F*e_`ZnQZln8i?^Ibd7v1NdW_x{GOtX{j?|r; z!8Yx$T(BqJ+{PjJ^W7|^V=ui9rIYked2Na|-*odN$&XUV7YzT@_mAvrmVoAT6aHzgCdahq_{d2q7Fg!|F>zJ1jAu#4|4*tuO07aL#}IRX@I(tq#MOuci|k(FBj?>nr?~Z z_0>{A&64Hiw&L{xuN~+;iyeSuLCTSDSE`7U-?(_eR6m6dmZ|Cy*_8lQFbKEeh^C}Q zd(@%h7VV7jHpE$$6&UGwpU< zreQ^iKXfhzaSx3DmqL*@f8N2?e!DubkOyj_6+42LvMy%=^2TvSa;A0J$}V^8td~YU z!MHF0jui>p+DEqw9B_q!FX6A6RSu$hmx}Um37Gs5+^mBaa@XowN3A-r>2Y z`O9pd5Sw9k!fe#>T+P9qK;#bvY)|0x1){h)-CeQB0lgt!ccs2XtExu{1HVRe_>pD!G z$c5%n23oG_ZA2KlPu6MJ)qmCT_iPr*iBrxzStHTAvc4l9oq{a2zH&A{j}hfHTPgnp zrqVw=}_(mn5d{45HrWGB2Hzr{6P?u?>f4U0sITMarmzuI#byKq$ zNoV25_`Y>i31JrBb#vn}k^GzC<0xY7ewpv8Ok^E-=Bx2p`tx?{gc`jaW3)LPnE(G& z>O`5LdruJwF0KtI!XV;#ae=zMp^}V@9mJry*uE;I3lhBT<;5GizF~BIj9OgWkDgMCx>&~g(yV-AMbw}0Q;cdb_fSBhNvS@Xhw4Y!wJ9jphAJd5DGO%-+4^-(>5`lW1^ao zZZg74V-R%}mL9@d0uTS2WYOEl$8110XYkHD8)yxr?5?}1VW*Ehwq@cxlEV*xP3>z-wlS_!1kHawkyTjP)~pQ)jk~=xKMZI z-9G(&DTmzxEt%f}-rKr;aONwhAD#EApFbDtRd;)F-d{#{G`@9lWXg-|kW5uLPKum| z<+-$WW0H(*=EZe^cREcs5=gWAKq1AL?8zA6c6WH;T&(W{YgXAM##_{i_ncmWyLYdh zP~G#*5_5lSBqxz>E)@)I@p{E0Yo3!B%hlv~(kT{hu>_UYwGK%3o;L7%4&=mJU}DFS zgFS%Bd1r6TtablZV0&Q}2G+6N-TN|RYk3M7|A}aK>$kyYor)7@!x3GdB@E?EL#Bn@g5tV%lxIT>2a-(-FOt$Wx zaDnO+ST_mR9+1Rdyfwac>?A3d-svtV0Vs(%l8D|)xs1AfjhyrIUKjnN>WplR8zgex zWOcEm3-0jwWH$ImE-a#C23=bp_Llzfji=VhRZ<}S7iQ)>t7d;iWL?AFwaI?S9m$`d-eLixX`yQKdF!!(op#2u^{a8GjJmBcrXrhnj!k z_c35VSxW?EwEi>_eR8L zba$t;%WZ~Pn#^;iFIXx{QL&VLA#V$BBlPkP#b(2>H@xO@he2;!S|+`>+n~WVS2;)9 z@OiLSlcspX@rE>1G+x*ExIBd6 z$A_#)=G6_a)$a%>6Y@yISNQZ*#0!PE0Z0(936R>wX3TaP!ITh@d)O&jhquH-jffp<|fF%uJCwh%TB z$M8!J*7dx=S%B)ujLS&E!t-3M&0=@~+ck}lw6ic_s#q~{P3Oo=d!%3sFv6?kJU4gt ztJ%w3&4%c-(E9Gk-Eu!J-X^Da`5E+jbfxTa&a&;egGqs20c?4A8eV@bRtu(YwRc8a z&+qc~dyd5bSM8Mh+d-7Pdo-yYd zUFYNr*_i}^=`<-Awn!Vd5p`pEL~Va#(B)#peyI)|!4n#?H^nkibw(C!b!^4Wmb9>D z-)kF5x;oJsr5&o8R*-+5g@C^ywC5tHk$Z_dDGc2WHg@#NJWq@!owwJQ!BPk$NeQlf 
zvl;jbDSLWZ&3SXRp^s=RmQ_<&8ysBa(kQyZi)o!^H3je5-7qlaZBN!ouAV!Xo~r%Z zcYqYU3Ea8`TwNk+ZkJZc>zMs`WA3(wCFW6|KB*-yT`FaIcea0?I=8X8_hM1n9p>XD z_!q@BpKl!9T4%PG?FaPNqIIh~@%gN}rY-wm^5sSEiQ2LP`Wvl5AJ%&N7+*HUoGt-9 z;)j2y8;=pdKb+{dxF<$c zjAH9y2rlN$F`IvuUjG30O){6N-ySVgAjgy*P$wcmvsH0?#WzhflovH0Ij1RAZq;lRasb- z#g@vpIob$$w`W!7ygnH`9Sq~Np{uRU)xfZN*<`zGDT{xm_BE~(++-{_HoCLC+~nw% zsiESEDxv8Y#L%sfA6n4NlPfxdonw&bm_@t8U2+Bl!~9U&TtNvIVCZD_)zHNrK4eeWLwydysVJ<9 z!l~$fZ>N84AT`8?JfgEpEA z@JP_>>IUGP(CI~RO!I1XqFidMEUe06M`dK7?BJ1Xiz@cSAsmWbT@&}$wd|u={BNw5 zWxKS@u^Iat^bf3K`pgANfUibbN9kIP)LQbNlp)DSxsQ&}=?EPa3QajZEk3qNw1+|71Sc7aVXw@E@%Zsj z{F2maL8lb1IYlz)ntD94V7?fM5u6;)RKE!d=bLWu>dj%3sgGshhm%w|R(CIN!Hl8{p9pMB<5No_F?= z0+RZM>c%q=-LKd6MbV#1K^IO$kG@wbV-U?sev9A|kESn|JjLU(_$+%?41X@aTadv( z&{>1kTof3JP}q%JSXDXS;Fqdkz|&e3IK-HB3y0xfc1I70yTGd@XYJQR^_1GH^qPN9 z4lp-5YF(hG<7gCQ)&Q;0iDys(JI0@55N5^Ic3gQ=%dC=L`05GY=x}Jwmw4$ko{Isd)kbp-;XwAI9sd6U@NepQ+;+VL3!P7>tv)8K>160xHFV2Px!OV&bk4xat&=tmD`D!jL zXKikISGs4uSSc@ZrGafL;f6x_Jtq{+Xjwt&6m;3>JMm=5IJg-yeppS_L2iG%y&tm= z4vUJ?*_6}Jhl+z%;9lx$Dg*brRwU<34)uUB)5K1yXyuM{!2^3nTVZxXfpilbz9@;7^}Q;V0&JHin`7l(Y%gc zfox+obBqlEZeEq&kvS@>3uKq29N6d;Dg-G>KqY)JM-=r;B|omTT-McE9jL?)d7wO0 z0>7x3ujF?ipyJDI@j9~I_=ohk!dyi_f=1h_?h-oYjymU(8a{5Hfu8fBljoh z)FvWI{#5zHqo8;Mt}A~&-DL=KgI8}8-GAHY%58BabG~4}ak^=fphQE188DNb?QwMc zq%V%o)pkhskYhsK9GOv>(reNQ#_7Z{nniia(N4ERr0oR!7_8gvuSo+Aykp_8D6x(ELcP)h>@ zmx03t8nf^#$_5Iihy!u<7XSdpTbJQP0uz&};y?6Z4PK-SZv{D9RUCFMd|*Yu#@ z?|+jD&d$?jdCzOMEK(sx{eHifuQyfQu%f!V%ga0cbCWMW7QCl_mfQ8_iKT*-8v~J+ z88pCuo2=Jc)zzARH9uwRQMzp^reLr^y&I*)T~+7J{aTDxX)eTktw>R@C)lXSYj-KD zdCfOPy5wtKHnJ~}t2?WH7uA&oQb+_Z__C>LF)Gq^I$!a$+17lnn>B_rczGh%-DRr`Fo_b(i;7m+$B2M>ogc zO|ODJO3M0wy^Evs>0H9&YZ8XM1WMColRxmegdg2E>jHr1H{V@NkKSDa=pp<%xjdui zKlZMUF5X?9&#!NeZl?J1?H_tK(`z93^xyFJ!RT%8=Jb4e`8|&Ej|T_6AEsB|Twde5 zx4rMLr}Lw;Z!ZzmZNIo%^k+Sy{rdRwYI;3CzC8PX{`>;T+tk%&E|yizMZfp$^kRAi z<5}`=>MieGOuD+e#5V_ai0hL6+PV>R&;H_*v`q69t>dRgabp;6WL@SNa9dFL4E$QDsZTP4DpofB56?_iT 
z?*LwZqcE+%1Hmc)c+k`BAnp0`xGGoqUD9LlcP~N2BykW82Sa+fs$jb{lm6|iBo(lF zYd#R{KiI3JOwD7#A9x{u&&Ke3&4ozs;8$Pb0CD82H6VGFjb5!sul|+2`fmK{eEjNK z0T6>fRvKv)Cy-zprA&NsTBx+gB`lwKW=3e=2- zs>Lav$=4sz`}ZNFu=AS6Kxz@q`PijgZ?J$Ctz94a>E< zOXW0|)fOIL+^+e$>zS9C9#>DOt*j-(1H$xXcne%&;%NjPf1YkggoHWlsIx^r63I^=bbmU5ZS6VE$ zg|u3WJQeOMG$lxx#*(dA^4C1$_&}4`qFck*X8*v*+04sT<+U4E0!^HM`OydXj+5YXkY39WWU5&_^mo6fx3M6rFtK_6rEU+@L_Dh1ZNk%$Kl1&|Cu> z>Uo~^hwL>l7dR(g&&*JugYHw*8=&+-Z-N|yeqH7pU^41^(jLevk^>K zOxkK{(Ef4S0RC`B$asN&BG{0%N$Xs|G9}oYGSmR*HZQ@xD0M#AF>h^^yykV4aU5UC zOM+59WUv;fyeE>EL%W(2^E5Opo9Kt3ozcWRc@$ZzGm#_5JoRJ%#Kw&rLVNhs4m4YG z= zJIrNeEl6H&w#^**J%+ey^bQ_QNWB;c!|O51fNfW+L#;5$i>lv(?lVV zU63PxJP@kxoR*79o2S+oXZacezsIWnNHECgCHNrh?CAW+9|&_$Zcl5`T1d~+Px*Sg zM$uiMuEQXaKz)pd0X}K{TG%cO76OpTk-8G%jq;N~Kzzi1^l)gi*|~OL1|Qj`u9o16 zgSk)dLLR^Mx>_q6;ijNC4ZUlD56DJl)6YbI`zmxk{lRoT&&3-z-0eN2U#SFAU2_$m zlC_370z)R1K2CJHd0@uJ;W6pw;*44S*1+$j#ie1<;4^LTJ=B;vbbIY`4LTRgLyu~1 zxfKwoGTT=?v;6{}qAk7PS)=q?nAW)d2$bXWM(~}G&i(O%5x)8|*eup-*|A}4A_kCu z$ha{9^|?yHuYRhw(DVdi2V^sC2L?{TjQen&?Bm7Sz8XB zm~v!|QZ1-<4}F!ZT&@=Ozrq&cm;fdVknzR#j>07mkX*91j?cxW0QzB%-(kEB=7q9r zz<2aK@-pL}T%XOlyeqrgwS^qVl>xYafF>*N#bhJv#{rD};SQqwrT7r~<%FF2WZ+|0 zZ*hEsWktSKqJ9t9M0RBy>>ImP%`+d5n^LhSH6DnYQ83TpTuq`wKSs$V&(M-gKYc*$ z0!_3t6wu-uW~9h(@xkLJ7kb5C1gw$96B#3){MW`10)TlAL8R%LXe;NxRu&3>UaKen zwJT%h*$0Ul2mp_^hfCH{@U(iZR=*o=Jx9qwmmTUn=;|q8Or&@X0kEe_5!h?+fy2Yj zN$7X_;M%2}>+JfHtC%^YK?ZHHcc8}RPM*Slx!L))~ z&O*7f7GUBUjX-)x<)XOILof$_RF)Ictp1Dn1~!lMnjrEZod(LuW~i3(bZG7*66Z}d z&$jFJQ-T=$Kv^9o2`};Zkk)y+DC9vB^|Vb+@T?N*zTU)~mo!6hjEX)8kYczU%366V zT{LuGKOppf{A?t@jk?2Cwhj?P!D9u9h+ zeac`3CXA#mQAj2MGHaTU(0$O^G!gP&8_7+EC&45sITtExOzLHo*P@xAOJ~QbY4IHl zjtoXbbWh21!p(?y{GXJHmk9+t`wqkxYFwj|X0F`C^sB$ZGOq9yl3!GkyhxY^9`;U^ zw~|2>I)x#6lQIDqVi)>p(C+f>wn!P zaJv3(!Vbo5GUwnPC#${<04~xC>1Yd_cmcYO6yndk#w77C7USpsP-*`Or9mRh-^GIw zVh7p}2ECThM1|HT-WC3kyWwVNOJ!_-g{jnViyHJE21m1YOnJ9|RU7=8$+t%W{RhLq z2+3{m(m4d14H<-yVsaQ^`STok*`S6$JH~vmZU{@zehckTZq%fBiAQv;$gthgiES7K 
z{Vmi_dnA9dFWj5^>8HTALq&q8#%+6u#uoI5HH&@M=}R}8;z?B~(t-}vl;>3MG-SV5 zKOrU&z8OA(t~n-u9(0CE74+ohtYhbKi91XzHO6JFsxDcvV-U%H4#Fj}jSI%!qJ1Q% zbd$=uAqS_diH&-Y3U6Kt9c{Q|CmwTlsGk!J~|7*iyR_FR{2RGF)72d=1qSqH1LlzW-kK1K>ysX7Wo{%I0&se~_V3eS_9mKQ9}j zyLpYVUS3*8_wpJ)+dhNP-gw) zZsK5cIACwS_UdfB&1EWf779tJ+MxDNzf$ zqsZBRj^v@Z;O11@aY~=d;pi&n%+@-@xYuEAW6n@li2LJnxemW$+tYvE=f)v^j3CuL zU8fq4B%D&S*u9kGNASjnUbcK7Qv~eZp||b~c!QwS$U?$oM?fMGiYt0t(CMBYzUfzz^@` zY*daG|CjjgH5Fnbrq)Pp$52$;nw+{&kdQe+C=x8pNZOmh?Dlb{o?GZmfocqWh>zwq z0{31X8tuYMpnx`Ah(t}{jT&sg{_M67*zY4AQ=65e?FQ4<$uEy%+4^}9VY-e>a7NpI zb>eiFgeC)dbj4L@sD<1^rR1<3#kR-`#d9yVNx}B%jRa_a&x%Kf@P%|&VzrY2ziPhm zD7XFhcg%OzO5M6su*Nr5yi1PjPFt(1@>Q7Ooa}%#q&o|y9I7ioiQVC&VxzSD_zY5A zqaB7T7=9WJmFmNGQPJ^(yvUK)E?yIV#84rVz5Subp+xvI#?u+pK@D+`8kdgI1-L(S zL4<0mo{NHxFvo5Ck91l#U3cN_^`I5JP4S4D0!Sa zaZh=D2y>o0@sD~#C_2>DF!aNRQ0KZ6_qf-G2#&8Ou~TlphdR2>@_Mr4jN9*jV~&p} z{>ie1vYRtowM|VtuW3&qp7)@f(*0lILJ>{bYCk8+aS0rzm}H62^;b!@nq;eIav| zq*_Y4i{KGzlex0k?&#p1YUQGT&lGG%Le@*py&K#r!h+SHvUO(5`=}u}v?BzVrgATQ z?^9QuMGDw%l+KnBhP)Fvmh;;PNsFungPst95ncRfozo*Lu87>%%C2KX2kXeLjS} zYSSTBMQ9OCsUdCw zVx|=%j7Pu0RdCsdo{Cg|CF-)aIofQiku$6(Of?`&8{!_|2Ln^*n2VKxZ9 z8Z&HupliQUN6-TD)lk+u(rv&6m&r-*GGw^2vXG}^MRk!6HzhyqzJ4EhL4QQWmgeUB z4gpZ`NTLO|3!`r-SU0}edKX1qu{Kv4@SHFQ-&BkC2DUm^zPCz$F+Jl5U#Xk4&RuJj zIu344s0+C}yx^$a3Um7Yfq@>SA0|D`^+NAfVRX;=9TmCidSJ8!Wa`@;=7ymExe)LS_-cjzsdtSK(7~H>4!hdi#5x$V6@6F(FZ$Vh zt1N~l2|}gd3D6IJiZsFW&CCq%d9&mQ{BGQdHzp-IB`OAv_{2I(THCuc_IEhIXijD( zaqVHK!pXr>R419iD37g9-!i}}H*E`}twVy>`c9nh$mz}9 z>U%9$&ny;y-Bp)uYHnl>pQ{1QsdrjGsk1i97trN*Fi>eZx8maXeELHWgQy|U9@Mi2 z)0zOsMFTRV&g}1NNS{UH8waQ4_?M4kb~}^amhrW;&;Hr(=9Vg}G`OWtijBrfntIGY zH^b8so(}1$fv2~hZLd@aUN^4HAq@^LQ8c`iVqN|B!iZM??HAeVKPSTVWFZc{f3U0nY|#5$30q5|eX|vj=xo&%V87FI`^vhK zZP9+Q9EIr+57GaN&TlF@vAev=VDh@I>ZE^KQu)==`U+_B`|WkFVWBy@fR|J18KtcQ z-S61ZLz)+OAwai_am*!)Ic|ZQ){jg&w0Qe}q;2qTkJL>!AD>O(`0m?F@;HS41-TqO zfVtKfI53eck#SB z`ug~5@)uJiNav}J=LGZFV4`7m+gGh3{~4T`9U7)Z`u=6bFLR8Zsj$IYo}P!~xV8-2Fhv&Pxw_p4hpoWfPZ9HdCYz 
z+*Y`A`b4O=D%bkkDe_E;O3evWrRn8=r;$6#5Fm7tUkws23S`S8-h&rdvuY4NQx;FO z#k12C3EPTzN>kGS@DQ)5lf}%S0~3uIVy(wTe}Eqh2u{}2472T8S5K}K1OGzx?sBe% zqeYcw$+8e#3_XMQSYoW0L4qe4(7{A`D_b$0KAXtnd?i$Q^S{eS9Gx&fU*xD44j?Uc znWAj}C8*;|pu||M1aTPF;C}$Kt4bLb3g<5xzkvb(09_7~0cIbQ?O+RkVPP(CdF@oq zYuhjszx%Hc${{ucQwqBbeCRd?JM1!c%Ysp@=U7aZG(DNN|NWliICj>Ktgx~bB0`Yg z$M1dWr+R07U``k*kkQY{to62MTJ5_ahG2VL3Ms9@vU_lFU*5d?3K*2xm*Tb#bOq1U zh;nliRcHK%pfr|&G+qgRT}u%}-FOf|Su+8pZ%F9LAj>idElb}*)JSF;^5SjA=tKin zJ@UytR&TD0WTH%K>&9v_;{$@PmE@y>vQLqbRM%Ho682U?G#$&ylWYo9#lOu_6olVH zs9ObEVzuSYLyZoMM809%S<2<`xtpD)Y_OVcNj0YG0XHcU)>1Wpfzn9nlnBB%9mxoC zFsik_uTfARXcGBcMm1lHu^NF?a9|3R!5~Znc?#iV`GKN*^rDA?a{Ua!tAMciP78` zXWx!ZI&*lwN+{TW*+ENo3k-e?LbH`^8QeSYss|HJb3blDtWTLkW79wx|1>ET(p0|# zJ&EM9K-*BYi-6<|0c5nHu{&=Ys*V4q=kr8qy7V7g{snN*(Q0fsNl&~rn1x&MsM{+> zwa#Oja1MQl%Gvw`SZJMqODcEhX;C8dBpjpBbW7>2%MT`h)pou2c5v%marD|Orp4cn z!q|+^L0pqq+O$kjgmE8;V4B%d85M{F#uvxCXJ^zaJl;S4D}W1P=A{0!T?SllORjM{C&$PX_~W+~LjMNzpY-U0ubcy3lN zN~sq23lUx;i10#$e|r8o5&i^FO9KQH000080000X07s*kz%c><0N)6c0cIbQ?O+Rk zVPYN? 
z=*gLnUuNb%ja#iQAo%HQX>ZjJ4eZQT(y@Jpw(g~3JI zyZOE4!j#r|G^e6w)Y7`qNG(@#1chgR4D=8{2zda8Ed-0{1h+LkZi-LN?0-H6s#j^7 zBKpr%v~>*P(r)~*G-ul3{^J2Yz^+I@`}IkyzFQw$@P6%K?)WZYym#hCNvoB}J>(%` z0-$a=3g#e@H+e6Lp>XzLTvL&uEsJa{G=!B?v);N*$;AYu#-N)TH(VM^WrLf4jI{;E z8(VB)rP1`lf@)#Jx+IXwhKN;;6DV^|0aXP%hnbT76sUV?Ksf3SzERPmnUlC{GeebM z@x^S4K#Viq!gZ+b9nG1_6Eg6DPBVC2`pWm8?nOgIQ#0PSsL>DMe_M~>vs9nZUY(r^ z8;_Eizp+6>Thq+Ulr2+6?c|d){R%r}z8nduv z-EYep$2aOTjwkrW-m0L_WJ)Wk(ud`cSZuXmyCx)MSIIf}Z|=fS`_d{pw7qO#=q23X z=`zf2lOZX5^dB=umL9+STc6^c9tGd2fyW%^FO>n2-zJ9tK!2NDKP&zzF6;V0egd<$ zV2m9N$scN+?gjt=L=^x4IFr#}9Frhs2A3qr1sQ+MSX*z~HWYsMuOL)lky=NMTMQe# zxQ#$x%d3n}&%Z)kdXAnq{-utY9V4 z%xnb}I-L>!E2yDMP&NMzUM>}_*?d?NbgK<(mQIUxuhp6hvzZh84?4Y(A|esny{z;J z4X6S%_}|mavZbX;lH`(C716S0jORvtf$Mf*46^`iY@Hk|Hs#DJl*ui@3Zh9fxzm5tRW&>3%O& z0Vh{=AfR9hNdbRdeFaoh-S;-kFmx*2B_NU_-O?f5BB4@}0@5%v2vQQ4E~Sx@4(aah z4(aav2J!v>-uGjzS@%BA*?XTI=N>MsbI!a*bN>*jML$67=<`&GySKp0Wc2Yjl_&8T z+^x`|7LsP|12T>ukLaQPr1Iu;Jh?X-U+1ZX3=l*5;$uv#SBm7gE&Ap9 zf}S8kat|k2XOlwO`xv88V=LEQk}#g;OYu{rzo^l~x~u-YKvn|{XyuG|K#kRZGpVY0 z&|=&FB%xu$z$G5uL}8BO9b4zZ#-k2wydG*B9So~8_4Uv#f+DQkf~zxR_dQg+$4L8J zJn!sinDaX*i)N;IY5H#lifhKFZY}G+I4JTqO>T+0EGYAP@OVzgJ!1r&)Ee3zD*Kdu z_uOZ!@}98E*_<$dMlAK;MlY=lF0gGKwHk|&e|?c2V1qTONPa_G5@dE`q>HzMgYmgz zf2s)$OOZWrlq$qlx#+b+R4xL>QS#hrLIYL)(Nt~xVU?eEa$>JLdEaxDmt2cCgy)YD zs~YG6;Xf_Ba^T*z6`*-dMl_wxlH8CI@=bPkJT2WE(myT^0h=@uujP1HB{v5U6L0&p*T9Y*e!GOS{XjH z?EZo=Y&FHgJVuefq8DDnIR?JYCNd5le)kNne6BqHX~^g(+-@u6`j;v{+mI(~F|$Vd zlY$hcXYNxX*B_wlW@y9Jer6$Ri5eGbV4Q{05Yf-Jlh(w1i*~yx?3p%Rc67J~WE1U)*rXyL`p= zC~NPyL;DQCZzfu|N3J#&cES-7Csm!KX(01r?p@ywqHvEYxXHzeKjye=D!wY5C zsn)$yGxCz{)Af?|aNBLKv%*SKd-W8FXDbu+)pJI))?Sr?N4OI%?b;FckFI}e8Ay7p zbGzh)=uC%FzvrWG=V==44cv?v%?Vgc`1C?fBct%!p{$ESq^^e%Prw=u8Vpv5zs zY2?gF)Y?X`>54vARAmae%QMQc z{!K>0#JjbEReESt;>GoG`;#m0FRV-?F6P1CKT>|LWCd?O^G-Sn1f{EfIuj*)v^ZQ? 
z99(JWV<2a0+X)$ajc{`4#%kUHwmQ}!PeogFO|qcY`5ts39z%#X*W;ZCk6#>t*l#cv z*UM4~9XI%4@?P_*(S563m3Jnw+CKvYJ#rguV0x9xn%<=T83%UoSrlA#1) zU-LLWX@BQF)!WJw^rZfQv{z^Hrz>A(c2CM~grvzMpI-=I&#o4Q_sl}9KgleXusbI? z*|@|JGERK>Sll|@oZZEEu~1`M!`XQYfB)L`)51FhjA}W5%isfsg9}8B^zX>A}l0^_R%b}B$VG>6{2Ap2b+ ziW>5D>iyNhJU@-{cZTW4HZ3NL4{N>S2~sHLdoL4zA=sK!+NQY6tvW`!E~QF zjOTnrryv&a|7Az@p1JB&9cTIO^O~y7hQYo{XFU%6qN=vHN@#>zx4;X;(u0QjibLfk zhQ`jHG06R;VH4f!)3KC3bMf?(O0i?M!rz3|#bR(6uptaj{!OKS}f+CH+EZ zu7}vHLL55js6sQe@Li}rn-$8AtP+0ccGNP`pc;?6RZxBPiCpqP#v;CSu(=}!jX|5fq<_n%RJmDKNdCMJ^GB*0{iIp2xhK;oVwB&bd^@inN2aA>& z{XVl+9^mBE^uK-ghM=m2`my=*gsMX{src`7!ts;^C})NfaYi3T)jeN8Nes7x{oR<` zM@}hBCi~pbU$aH_EEb=RWk>KPzZ>r#L$7-l^Gfk0$8qJH5uWyN;B(EiCvR*#7fr0} z{KOATjjD{*1v++EwD2%(^GIkD!iH50B6y;#G9t}y>VAYwl*ru9sK-TeksM@GzDO&x zeLuvGa=}CGm`1{?8T4HW3o55=G5J^ubzs!LJ$Zg*ftD$#X7HoRj-2+)EcW*msSl-K zOe64;MYQ!!Qg!<{K?3VhOyKJT$sUIw676H9HIDq}uwb5k=`Qfa_9D^H@t za%_Jz_rpGo)I|7otzaLbVP1E4XWwkjJ|tkvnE#1V=Em*IyC~k2gY;Ct`Ji4C3r&j! zVOqq>4WY`RJgk9!qAGHxyJ@C+r-prZHk22 zVs9#v7K$$p$jhMVrPChH`bI)vPj!Y8<2S3Dyc&8UoS>e%e%_Snd3J4M8NSQm3ANRf zWMFi1-5FZCWhL!S*nH>Mot)v?NkSGO>S0EuLq?gaM}GFb&VQ&^{{X!ZwNNk7KuW3O zjxlB78p>v|xk$DN(L!8w^R`d=A@itYJk1~MOx?J|lYpN-ja+yvmY_#tgCx4nKN6PG zw%DH_EUwtl9^8UgqNkhgYbjRj{Dm5EUcqPPU0(i1w|Zx=8cy5hm_QqbS&O==m@7LH-x+h}zi7NE_G^R*wZ zxHeOyF*zQ6IQ!+@`2^0DW6WxZojTFyW!WzadntjTEv*5Qr0P8bLi(Kf$8~Wau91S0 zrQVw$=WFEzZ=}MRG;=4}8M}dP<^`p~qcw2f$8bvMol5{$%L=4;4^Y&SJ^}%H}^9^Ng zy34g_4fE>bJwJ6Xc*X+mK%Vg*W*kw1e&lW>;V-T64609z|kFaMOkuS-dC@u`~Zj}9?hVyjk zwdFdF{``a>Lb0?ERf6SNC5g0%100gjXT;R1ec}AEBK7Qj*wM;tTsL7n2TS>+uF#(N zOEWrRA!JSc>g~Fvm(E}HdAP5lUTBUvi6$k4ygBfVIL zj!4bvQwV>gmii!p`3m9p4cI-rYL}kF4`C-n-#14-#363yurDY^Xd9dKqL3%SjOk}K zg^jXnKMswdpfn&^kWymL?^Wr^0e_MLx>cBqnzq$kxO3b+m}c ztF@L!u5($;ad}{_R`~-%(y?zJ8!$4y!1r{!83njUE_-0RU2k06KE5UVp!~=sCTeuX zOQ|BUzQDM!3t=?kG>%Wo`_t37Co#v}#^g&RNN&cJg6tE%IC|baVh3Pfnu4Nq11KL( zRHey^7%_bkxrmPg?w%TpRn<<-m;IJ~^QcRarY{EdfiL=Eg`E2Q7Q*wfeyoKtd@|KO zXDd!$$t8;2RwBG;I~UVee!Z%C(a<)Ev 
z&5v>!w?PpAmNVbad%ecI>YETIoPtsv>C|K1m^`ZUZM$q-Q#X)VN2PH<7rV+S7cogO z4;Sk$U3(<;v5UtRDFWB~?s0eJO=kb~5ZcY)nKyHD9c|JfeEfd$D7kIwD4e>&)MxM) z$_(BzWR$8$@rzbUgJy+#4unA-k%Ca{K94v{-PiAhBG|-?FfHdHH^>=nF6hvNZ$_dK z6fa8Fe6HVMl4rdbZHO$#Q^IZakOTP+UjGggxs&*qSf^6oV$PDM)cA}C{T<3%SsXq+ zaW>3M9%4j9v_fPtK7PWiIhHpX)*Y4$m#?fF~DU7Cmp_Ic5;VADrXHT^3j8vZ7QUoEiNa(l76xI4N}cg8VLstp=SP+(s*u_C zTN+MC;Q>)TJv%j`;>X^Q1o(W8u&Mm%8MAm1E?5$SM?WZyMs4)&E?k|bOg#^7zoYNm zqCb|CXKEY_ThvO+1wYeMhGv5I-t7!Yh4Fd;(ZrAe?YttS&aYEtT#Npk?c+HBhxA<39pQvT#~adt3KW}Tv5hwC>EnUv}Ax97>CW%1$U;5H=r%%f^9t0;c6V^%1*5x ze`#Z<@M4Ue!dD%O_q9?u8e>i&O8y0d|b}BMifVk+NsW7Nzo?wSqr|5GZ9<9 zPR#v4QY~nP!|h2m4pe4y0eQiXEyRm}$i`u1R>^TW^5kF43dvoTXdoEsKe zO6zfia@do`^qNJuKiAE?*;5dGv(#l9>ijmE7Q+MK&OH74z$h4cMdiYJV9x{3KyQbL z`Q7wv>F1lKoC;#&r%Vpu1601qWFpTM!)8qCj=cf>7@yN)GqS>AfgpcUV-ZVjT8h|) z=Lr^!{hCdNEy>kpr9aC$kT03-H7;aUAg{iSWhSBu9%LIgD6{M1Ft1Y zD%5jM1GzKyH6Gz0i`LT+Rjf1I-1e%KE`32CBkq-DSs4FPok@S|R$sj>s&%R^U0P0W zetBb%{1uCMd-~*3mgD;3U=%FOM{7Of>yISFVfVEd6LI1NG_;vAm|%i5A{SDWm?2Vw znB)1fdrDIm{1MgnB}Mzx~pY zv1&Ni6>vu+d|HdXtY-3QsZ~{52n!?;a*&3>9_|e9(FXMrH;DPEC$R2EfD^q09)CM_ z+{D%k+u@0J&FynuMR}*6e`xNZ?5AEcT$ff}CvVGf(D7ty*j_W2BH4m#GbpcO*pE9D zp-zk)gjApLIp*M%?ZJ|_;J_<`wYMBWJTGm1p=>k7+*LVJ0am?q*r1aWr>T-_I+c1w z77Eh(zGmpiso@%dVDIFWMx?!3y0yzJ3#K$PmKl@VTa4|MmZSZchCqG3D1y*eAnUQs zpQcV6NsQ%dgjroP)v$fk*yD?X>SX#$FL}Im-WJoiw#Zua>{5J){{HJzwB{lLTUG)K zB~hYsX`OwN3RO>|KlRWC-)Ypoq@`_EYHd(lNi!An@WOU7U*Yv41l$cDR!8fB^Xtj(q%p+!rz5}10tS)hm#&PUHeun-ekMVzl ztMZ|gTEfPWOjUU(zTe8Tz5SKp+kMS%ZpT(6bNsX6-^7iwwYz@1&#PoTV|sVVgYtgv zh!3+|5IRE>w^d;1sNsO^Tj%GFgI<|T!Xw^tJhT(1*O4JO4!34&<_?N;bCP_X5cnTIY zsr>uKYT}t;m<}*$^PNS%js0_{X! 
zP;1sZFs8aOd2~RVJU6y2|E;m-n1NHku~k{fZG_z~#!zs51|GK;wXKs>nNoA_(&(%D@UEhn-b5{5#_w2Y z*NNoP3%yY7&a($2L^y^Qa?~@F(0RP*eJknv32VC%X-F^n^}-9T<;uk%Y?N?N8){n3 zt8Y-zeA&Ha-G$i^a0Xsfuqi*1%(J@h9Oh8?{L?3V>~Hj^Ux8O~;@!n?28^&n_WPkox}oO~FrDTf!- z{_N|4ZFLrvna%Icw*{WUQ)T__NiL$kBa_Mup}!cmEG!JCRy@9D}iRY<{eVFOk=QrWq`E$f>)(k>d*CxUoF8%SBbghf;WP8`DNvr3K zoHMDeJ+;P0He~ib(4v=fmQbQRvR-tB)?+4g1t&102nQ6&&YmGJsIA^?a}sE^(se+t z1tXn)ETi5v47*zlzrgEYo>P|@-N?RC? zNT!4}-ugAOKib8BB0aUD+>k=E@g9=93d z#u+0u<6h1Md#u+~Oyf7Zn~SV2&yKM`Q1QDmwpL`9zVPICp5gd?vw#;Vg*ZbX0VC^QHPKF&{ zHFFZtlQ>_9yeE6{?kA^Sh)Q^EV`)+3xc6`)hxGVMQ7b7+X>ai0A;A) zEKn1-%O@Q2N)n(erX7aU*X-=6T8rKdarFoAMHxbj>Be6v%^zG7l5a!iuq_Qru9;Uyq_N*q}NDJ_xd;GU%{LLwfc(tAt8m{#%j zY53W4{QsZDcr(>V%?CRP73ek%V5tu_sA}(KJ3G!8r~Se?96VWk;9A%5bg>#CGNbN^ImCn*OjCm;PN%xi_7*t{QO8zMZGM zXol{GJk_5Jsu;-&fUZ=C>V%_)~Nnt3AzF$rP{sg?5nKp#`R$NR{p#CBJTq zm#=G#TIrlZE3$GIZ?#k0=2y00QXJ6|VB ze*HjoksVk*5f0f(2ZmDQUm2G(YU+%DfW)-T=#vx&2*|O^TWN=_O$x7Mh)GjN>aS!6Qh>2Q)*s4 z6RrLIc`iNUbINXlz>VeXIf-!IR3^m@U!cc&oQY-`Y(v>}QD<-`Q_2}5O8;*e2c3Q0 zdF*lwREmMFEd=aV%(k@oi{W`)XwJC4byc85>19uu2WIE=( z^l?8WHg}!&WB0vNp4vw5jU&&9={9>yJ$vkW&OYPF@TI=gU6?)cDKX;SC;i6NP_gdb z9ua%u*07imfph~iA9vCaFLF^i&q_W}CH?Obl;_tHLwCO9>ATL*-JrQB!ZVGn=PhJb)}{`JBI{-hthkg4>|>F3`uMA|OMZe*2qsA_aK~ z-&5&msqO@(g(YMYB*tr9V@gdmVI2en=S~S>Gd;nkr7g-{J|2SBghW<$48E(L0j%7b z_KEP%hmTd4qZ*@L)O&p`BtKYe*b@K^V%ml}hE;As7fa%MtC!eDQl;s^rW!8cJ}P=h z9vOHcGgSL|Cql}?tf@EWb0pt#nQsonqK(St3rP$~PX?t{(1k%;VfJoQa28<%EP;YV0%I>ODjyPN1^cY(|)+f^eRs zNro2cV!Hxg>(l4z7MAe(sr{F3jVbQ48-n3Gm+=MW3WVXy-jWkNICb|Z)cGEr7U3RE zTFx4C^D^q(jr0c7Fv|_-!7j8>aCUAGsh!__13^Ywa(i2osCa-c?oIrw(NR*U&gs~W zZNZHQrm)(%IfI+n-NnXlvW|M8t0pwYqux5GUeN9OVQu&66k9^(*?{bzaCX^e1T!h7 zRqY@q8`ao5gFMd#wWs}5=Cl)1MCw|Lgjy-5^ke2S#@R0IOI<+RI6j}!~PVYUvvdz zy@D4lP*vpj1xEHGbCJ~{dYLUsRY*TPH&B3cdj*l5Z_eigiSAL?&6jHk=6WpTj+v#0 zL!G7eX8p#A!aZU#xn>yXGhQX{o+hlkhf3Y5O#aFs=OV$fmT4}k2F872fdH8S?{YyU z_NqcjLnTW4r4m!hQgmwBAKx7JgV)Yu?6+^WY0SP@n^jc@#VTtF1hC~Dvsc8V`Fn7T zIoa(OW!H>oLoNN(5ayOv>VII`74#x3 
zAn)h51?Zy6o$|$=mQ0nd`c)&=VN8JSzJ0B$Z}<>_2XE5vp*lp42u{A``On?IIb$mP zHt3BAA-#41M(Vyq7p-K7d#WpmeN#}Pgp3sxJb`yI1S(t73>5}(y!aHMLGFI!H)&4L zS42qxOH&}?Nb;bqWLpcnp}j`T9-a8^d^9aO^k9cd1&I_^ zGiNpDFJrN3a>VTy47T*CD{Rue-lRy-WOX0G+AbmTKbw}j!VB)jk0EbHHd{zU3ptfO zzly7H#!o-`E{!3kj!pNff-a;58g^4cEt$@GSZ3eJ!LFPck469o&K1ia3h-gw%<`u2 z)Q#n|B@$xcQLRV1wZ&zx-fV`VPB%t=5!519qt~+4H>#Z8{9>a%DE_OX)J@SIX~vS8 z-~w|DEz|Jo8l_I-ZTm~=pBk?yL~ZkFLR3ZwEQmPGwr zpsXrPQal&N`IY?J+W^&dXid0t^o>XFEGITYU#1CfuxO#=Y79qp=3vmCv#b{g^v(q9 zEFkRN?RZ0IOhx*<$xlr5Z865OlZ)@Ym7m^cjs8ZCe|*~Zl|7cdPZh;`M599FTC=C< zxcrei*2iH@{krh24{*IFR7`B&DLJuRNKg;VeSN%ZF-ZY8 z$77UQYt}co7BwBR&^3spE!Vu7m>~K+D9lw2*S{OnV^;D`{C4Yn?INzA1J=zClp&X1hVu&y~${4|}<)y>&ZRQy?_LzI`iht-RV zAZ^cgqN7^-3)!Bi*a_n11q+bw;ILsOUnlSt71J-za-*J{IxH=WvFh@#JQ$rr9F^|HDLo(q{6TuP{Jyl-z1+{^Hnc~;EutjXcK#;CYaFl(U{w%cBN<0EO=uknhq zssh|UVsM$)Ss1!g&wUG1uKSxcdL$15S}|do!xoyyRg_Fa=+De$c|9f;GD_MjIYz`G5~K8+ zxfXbMSAwT!acb*h*+|u4o!792?G7ZUrh|e`sdIy%w}B zdKwqT?c#N}?N0touc=&UMo_x;?3ZUZh@aL{S!&|IMmYMGCf;BwH9z=Qr`Xb}akHkk zv3W|sD+WPp@da-4RbGaD>^lv7mkFsgzlKDfntl40W{`3o=qB}iQ*X?Q9xJVz=zKam zasAv8(ntNWEaDhNmecj;KFi)WAb5#Yjwnsu03D|R5BnpF;z=CB!`>;kjsgmwI~QPT z3^5H~a);3?&OebnP|C7*e%Sz>E)-|fe&78QPEguVCz<5i`SYHdA1Tw!VIobOR2|7f?FNL^|Dn*spNF_NyL13hONKzqlY5c)@%Z|CwAIVm(L9WzTi{N|E znB9o>iUlplCj~AM_}Z6`x2e-BX(BWwPQTo^bc$2ic@Qbvz^_LnDgwnw$JpkYIaui@8fb{TG70?RX1V=^Z3@ei2MT!KHk$bz62 z#?*zMwu#PZHU#Q(Vo%bRk8C+6-#{VOX|<%llh8Ul+{|^*N+e7aEkhh3`h}uoNbu&TRC6QQOW8BJ27{FF%CmG{kdf+b(|*-XS#ht=_jgB z%C~3gf3?Z!+{A)_j@YhJXwTxOj4b?dU3s0&uM8W$!~55La@www5hKBXU}L?D^@y>n zsj$Z9#+mopS%C`g1ajsh;f76J`V?NSm$-;S{bPUnbQA^*z?~x+=-UU&$`DUKLmZ ztat4aOb&fMAL3Y62_wBCWfv%)nkek1ZP~=*>!Hg8xXx&dO}Yzuy7}bq)AbJ0-B1{1 zJ~v-<+mR2-FE$(bFw9SrvlzxgNTvw7(emstjiWK%z919#)Sph{J~v4o?VGVWtG~qp zem6q)fw?vaIy?)o(gb1r-^_1i>c#-(CVZxTpVd!0OWc*4=xc*8f&eHCQ$@}gp#k-05bVe^qkl^;{%Hm5 z+F`L<`oknyfbn*iv;SH7`;T$h^}82S?7hAFMZjVpo0LEIK^+h>(*Lc6f`eoFkH%3t zOo|x&->ahluL%fYS`C~;s^G5^r&mq_Vs1sIvwXDAlx8g@8VCQukxa$t4;pz@|JJab`ag-;DkMOs3zj98&3`x`w+r$dB!3RD 
zcSA@c3&+v^R4=k?{NE-b{xrV=c~Sy{J+MJ4+yg-b61yS1Aa{;`F$<62QGuIon0PsW3W)E4aDvYN0Bta@5r|ZSSrQ(A;e(Qh(A0d5Y!K0 z0h@jMgG7Preh4G@ckv&@4j4qhbUqBgs1FT)sg41NIQXpZFA^DqNP~lC|DxJKh$t9$ z`!5RVhWYnu2&Vk~@-Ia;1VxIgV!zoB6)exV_eDUfl*gCO}Bgb&1(_5f_gV8h`$1M{VPB+ z4dDXsvx5OIJTNI>GYuPd-p~J{mTB0qZ-QjE(Wn zbnxA){^eh9NU6cdK+V0?uvy#N(( z@mGKm^iutSxt@ivgN#fbfYKa97_@AD4|G9|)(<51JnRWYcYgpD^RP)8F5m%l&%;U% zAMyaqEWkLx^gOI@nFZMVBNO#N`!7IvK$bE0fDq{U1jc-*`RTpP^gU?%#j?ioFh)-S+o8Db7fXn!z?xB?rqO5H&HDg+NuT!E!^^)Eg2mu826GmE71mkV)dT%}6~YGIfNB_+L?t{SeZ&;i2Z-_ ze_VoW!QvFBx~I)S)ie*ySO^HnK;st7p(NV>WAGmoWVT@n$4?&U@NL+bx?=(cw;`|K z`+>AwSRxo@5YPbO9au3OJ^-S7u!KQ(U@4|>c>pfj2-v{beY1w|JwpPJ?ZP_J@_PW* zyD(X@pa%doLx2Ed_hV;h7uG9R!~>182jK$cMBM{95M0vzl^n?5gZ;w(nDQWTa}OSU zdjJaiu-ZS(xd*sF3=DuzauEQ*AOy09;d;Igvt6AB;2uB}K|iVhqXUREC?0lZ;Q{0o zNURySy~j!efb=0m1`M7+0OTqVNP(h5*prICfdJ@rAP_&yVjF!3|9-NI_7H&cJ_L3^ z{RkFq&Sbjc>H&D3!dmE*9{_Af;{&-o zkbh4kAGIIa$j@M|C0XBtSD+LJAoUC)144DYzv2RSXRsNg(D?x|4SrV`td!mQ$J&PGwT0-=@j|V5oaHr|LrHHPJO*+n zkXoWAqLP84!LfTKO$2VN407-?6+967K^&<26Wbvkg*u~l*p=#aug*NbdslbwJd5Zb z+8O}+px<_X2nR5RDEUxmMsone3BUEALV>~%3N=UHc}(_`F)^M_qMDIzGQz815Oo!7 z9)h6Zk$#f`di(g84akxV-gzhftbvr>b+;<)RKM8(RGm%=AG3cd*;~zo_>!l2bo6-m z4vL%0bjgp~cl7(+V7LuzpMh+1IfO!O#}3S3DZFjU>i$jX0ikiiKM&L8W!A!;HOW2>hM{neG;t*m00x4`6cM*&8!! zHNO?uUYLb}b!>O{77W>1p6$hdBHHtMf`JG*n7G`2;+^(Tk@ixJcgMYy?;W)59JxCD z?XH*vU&to1^|5974l=zR#2t$QL|pgOVDO|7-Z z9nW(!IHN@5NgqPJ2mqi$a#=TyAcMgOQeBOBwkhnzQAT`cL^I($Bv z4gQe}izt~F*Vc!H+EM4mlzY;mj-VcOv+R4(doOB5h+2<- zI-mXywI7Svn$PeCI{#dgo`{+&vPO>8R6qOb3#Yya*EdF++UWB-X?ejT%XG;pc>H)W z9NQgCwg}bOh3st6uxf@uh;7(pVQjKKP!2Yt9~;q!O&0SttTFP)!n;Sp{F(x5-o<>! 
z*}Q6azut( zuJGMezMQkTA9bp}m-FS~MB7GGsg2oi*kcWXQ($Pu9|U*4Xe;ocX83&!n40W=0>s~) zNS{PcHu~th>zw{|=-X=;+sQrreqW>Sm25ZtZq<7j;J;c6Y**R6z3>^`$SLh|n>Usw z^PK5xlZsMQEM;HFJHXqxySy2(*)Z%4FR$ET(A$=lN$>48Xt2!%&C#|j&%I9m*SvE< zRL7?suiI?dBi25fd+sEDlWrP+%WN(k$V7e86mK}*dxna}>lz=IhcNv3kQK?iy5Y5Y zUg1s8_^kRH1mMfce<+11jFT#WAF%?>m}*6*b*>$gR_* zZYs$;;CEWuH;TkPcbA`f=l}5Vp1w6+BPSR89VD>TI@Z)3SgZo#2~4!cwaS_t{Kpf; zwAqu$I}f?(aPPEi+4h{Qpy7_Z`Zk5(fU(Qzx?zum`!GkBdsioRxI_0{(}DxBLQK|u zoF%-Ww&s?(wan_chkxsT@kVpYnh>XH!(GOe{KhNru74(G;`UZs2pflE`27a!(p}&z zKy_rsWh7zYd9KzLFuYdnnnp<4S(q?YtQfhbb7ZDHQm_RW;Z<^;o16I6>}4)oLv&hb zeK+53x%C$Bh|_!b40=7fQg%6K*>>EQq(H9#wmcFIuNJEX(|6T>JEN`VcX_8h$;&gC zXJ0&_p;n9IBV~!?IjS_z>81J>09{b7Q8`g{c;Yj+Wy9%%f*QOQXM0LCp2Vlie;qgj4asdFp8TkX<^O2*EW)Lb)q#& zJ5)8TAUz8Kf1782&-F|r_iA=h7`pLm?C6zwo)}F!Z!aZ-r4UGx5?uSfGVm2r_Vluv z^X6(ppT<}$tERBFEx5|1QFMhD(>l#+3f{H54PeUKo~-j(JvS>oRlBn904aFCw{^F- zx*F8n8Lg7nG5hhx+*l1u%%eViQcGUCD$4ZkY$$c^SaWZGyP~u^%*RXcFN$kE-#EIp z&TKE+59qH&>sEK-^I3IG8}Y&9%j??{wGjpMH(G-}to5cazHEv)UF~_q5C2X#9wUH% zIMHu$r;&P;tzO7**c`tNauH`rIwEK#kz1sfxZ9|Jus+1jwwB$&N+f+tK#^IZ<=T*uU|fL&Pb}>Ebr+Cz4p3TOZd&} zT41_AFVc%hb^Kk0f!sssQkyj$pkVoN)63!-T8{pUltH$T5val+PEtPF^v=Q>g z&Z^FNeKL4D7{+Ns7gn1Ke_{2q$#&OL7ESF|Tqn4Hby#j!bZ2?FbB?Ph3E>1otbzojex{^FK5TRYRu!ZwUPpP-p1 zS9AtD#~{(+iFV(*(14?1wFcR+cqrD1@;}zt8QHk^kcUqs>M=HD% z8Zxu9r?5TsNWPcEwL(V552g<~>f^o?atpYSVu3kmk3xy^HCVE zup7Css&c-;?@z&ir?n_>h%xIH4#U6fjvf$qffq*3+HZpDDYaMWHJ=<{ZgkXtx)x8z z(J08Q0a~FG&!7Z$j6cU9%!;e+xbps$StY;Q)f2wa;n12d@hYEE=uHdv85l>{8YMMy z{(Hmg3Is@LtBE7?Cb75uw1|=5VNeDv2O20-yA3TEAojkY!pjsZB;=N(345dt2_NXd zi_b=H6Zz9t+wRMhWA>KP(+U%Rv+djsqPjb%KT@~ON>aU0if-MsZ1m?B$zsG_B@4@J z%CEFG>~r+RF>&*Pr;T7|uU9DssG`qboDCU*nH3oxm%yQ+i;2PV)m&Q6+T7i)bkBT; zQeMwW1KU=@4TbV6O(>etvVzhn=(5pw;>nP4a5H55u$rpl*?1>EW*;1X78RwlDW{>& z4+pKlz0}uK2JUsONY0lW>H%ZO&6ooE-c^mSLd_6%U&Fno3K$oASQ`A+m*S{QX9-^8 z$G;p6or-odO{Q|E_PH{D|Eeo|X1?1dgQF18i-}Wh)@1%i?oZIEO+=LZsq%+M zLGcJ&LVUW*5atGduihrQ|F+SU+u};*e8GU@bkin5iG~IY=^i}IACos`uXS#vy8L|8H|CRUDQRTkcP9PEH*Dg|MDeC2&$ 
z3fsgur-3IAc#O?<4Fw+4!#UbGD~$s*=q8>$M`%MQK?`*Q5B?vsQb6bi4gCvS8@?9+ z0F+t)03(x8cpQ^sM;U*P%aZo)5}P@?PS#SkDtc^5@|_b!#RZ9hgj*1xG2qJbW>b}) zn17V+p7&rtQCD$QK1AhOgEKunjh>!^yB(54>TkG8JMn91aJ?dQ&$o zE9?9FqPo{VH^u5>$p`vpwOwzXSSnbxF%W5$K?D4^$p(YGsn>t3{V7{d(rsHa1%nN$ zZ<3bxbyKttYca{wLWsp$k)mKvuu+lM9a7bchHuJr#n-%QMIMRkO*G# zRa-Y=Ql{&4k@K|OHhiI*HHI_jJdu6Zs;h%QYh?8Sb7-LS-YVg^8V`b{o?%S=J@;hRWL?L*>HbwadbXkNOXlUyK4YFgkL9@XY~As!PU{lyUX*%_07@E96R6sesD9t27*uj z4Syd@-VSa~&*zsv;4J@ma4`7G{Oa4wYwUYF_~ClKI6C|85>ed_%lqZXs&2R#4!)aT%&%ZROa4u><%4T{qno3v@8&l|?)CRTCDHIt z1DJntdT|5nIKG^poFJu>KT_-Y<-7Uf{ru{9eqk_C`{{+#d@wn9E1Qpg^qRl^YA`@u z0fIc!N@H@*+cRoQh7V60{_}7&KtZibz5wZW0Iz>hnAYEeV3hzo8t8VA_F{EhS9x)t z3>f@9NDwhg97N;Mm^$+swp%+J-o8pw0jsy>Bfjt3))aWs_ z0%3o!L)V+4XqU9SQ zC&0fq?24z^n59)(KK;V+1yZL|=`JSs25O8?Mas^K`-k=jH1heG0}G7)#G4jrNeK_^ zTlNTwmZ=>k80he{s#~IC0}7R}9Y4TAh>w3yEaUi=wJ1pCF!Og>F&M%s{wPO{>1S4m z7PmM%d&`zM67ch%wgqSf&?-0;LY=1%X$f;K3t>h~9Nae#(2kDy*yz5E-nLX6qtLjhf&W;9e?l}~K>gaoa5+Z3y@GtqJ@nSqk~si=R; zlo*910_;Pb)gTj3xS-kyxr1?jIbG4RLaVz}zUHdl!UN3P_rC6W7FDL_)h|?WSk5JC zQJjXSFw3@rc@kQ?Y`ZBzmx5*;0c&t9F~{ZMi5>>jBVyQFZBv)*zNxpHv5TL~`2)&j z1Djg{FrY%dG|xafGT;sh2JF&w;!1y4TCTRGv|7s|6>b-r5~NIH$#Rzbwa7R=&`WI6 zq7kDWY;vu{lz}cB!#_9p&+(XT-he1=LE4C_|Nbx3BN=TgQt;3zdNBU91|d^#e7HV;M^D}cgw0V2O$YiuxO?&j)XZ4pVz5-(vX#BXszqLB`D=ecv}L(6 zNSTB7C`Y%T;VWKz(n~IpjGbpDKqAWwb`VqM6!<6Cd75vl)%3%Emblx||5}%mG|ND7 zPDmY3lo6aL)uQb&8a+h)K`Ng4&*^$oa;tKV>-8F#1p91-zT+E)PKniND1>?s49uW! 
zzWIjz10!d%sPfutH?9PlIP-sF2=K>sxm{Nb6}A)v9Lv%rZo=0F@-;hPFzKO>REj8K zq^Br4+2xK42)^8)K%}MDj6^I}utCsV0~?w}kqyV}H82-ACtlx~sXzxkq^LJQ>4V+` zIR^c@DmK7m)c2%4kX0lFxQ`GuS}|}vCf=O1)zqZJRvKaA}g z&CHWWk+nKAIdjZYPewp&+{huchfnQ5vn9vQzB6;c)-IrpCwlgofS1#=3Y_jeZ*Y-> zmGVA5N|eF2N=Di{kh^~)<2hS^&snsKMPi1-yX4k_6xC+iE|A|-h^r>=;NgVSi-9n_ zo}vuccC|Xx3NLw4SG?1?Xh9ulh#>@Z9OPtZ!OZ%^vFIh8vJwndr~47jM+@qi)Lz&d z1T<5&tVzaCKs!5N_q+m^yw(1=0SkO!JHUwzl&CDSqR}r9k12l~Zi4d&%$cI4KyO&O z?zX>Lv=6}WOlSxb8MrWcUAhnyE!cjrkjq+d^C~JLumGT078vV_9>$6kmyUKFkpq^X z%%&_a>lA^bae%vOU|wJ+fI05PHK@@cW*%?hLraF`bf!wEZj zT~yuxcRDA~a0Y**Klcx?A&AR#O4=ccPz-Wx=VqBUk*<+7e6VFtVVf4P zpTX0X-kNmErB6Ar=I*Gucbs&|Z?lEHcKkcVKI=4i{dxm;2Hi*lw;~JR^;%uf8Kmq* z#$oZ-XsxxchXx4!8oAj;j0CABIw=2wP8JwC=p+ZyC2N1e-tjyozXu#$qgp&1R=f03 zix!NaFy!(u;)a+6Y*xys2boZQ1cXeelhJPgzb%!f3{7;KenjU9Ej60xm30e6cW!SI zt>`*4s}323NERsm2%MsBG7uq5k5wyJIl+x#G0r0d2NDTbDrJf$ooT+s2#n|(V~m0H zu>?XRYe;_|(oa%u1$jV(5o0d!31Ire(Zo1=?DK1FXb)dc+2O$#Z@N%xcZE~NVfezp zQ*scUtQClfp~+=H!`C%Xh%{413j-wdUdnY1lIb>&X`^;|pcV+d$GpsI+>3zuKx86`|Wh2}a6sMtgE${)^$ZYzV zXwQE`=hL4|=kr{=al_r+L%K^PkeY_8_>`USo7 zFD))jiw2)*gYTin%%MAKmut|wSRQ&*bE~a@K$Y3C;+gFi_!M2~1^=F_Q zr#FM|g!Jx@7tHYWSHWhnUdxURV-qoeM8FJ1;6^K-a^w8h#{Ddx*;_WaiL17 zP}+gR)_GFn5R|Nef$hwT=0ne@qGfG4bYjYpHA=Oh+CB7Du5!6L*#8P!h+_hnEI`JW z+j|O^JV0{Ex;nlPn-b`UJ${ezHkcR6t^wcC^C+r}|Lpo~*5%FZZr2ua8dnD30-ArU zycd&AtRDw3_J=!(@|WUM=*}5A^~uP`u6l8PgC!^5Dp9{jY$k^?4)%@Rs^&W%kDF4l zCp8|3n^7>&;k%kdhklHbOP--6nST0!+69_vXDFb>H_S+p-{OPEO)iXzcLc1F#uFJM zpZsoP2m!!6hal2)&9s&CyOo84ZuNiUce^rXo_&z0fdKGmd$?pB1y8HjY4wNU)^n5` z4B4U1gQ1=R#zczO5CD6+6oI`4A2>YhoP~a;53XIxh0d<8xQdxW8f4HG>jO2maPky> z=VoVb#UB^$wu>UCL{PL7`Mu%U!pVBw-i2~!Ex^n*8iDkX%0+RZhu|GhSCnPSBre)|k!{!Orvx$hfwDSG5?{bRVqRWxtE)=pujbjBwEV>{A9SFkvKhi9#|9kXh4&gzkgRriqaM+DL9P zJP9UADY#HsV^S}9(TH}2E}fmLp~ZJFI5HR!(LE*42{$9+@qbb(UM3Xq>^Tswki?X! 
zznn6N@U@R$L6W>kaz-BZUj3HK2ns#G5WUHd01WX49o-I+7Cv9j?hFU)F@O#aKhaBuFXp8`h?6$u^@ca0xfR?s8X1`b^h zFWqd)CsmF}3p!S>yrANyG5f9h2?2=kZSD~aEilWV(^0B=r!ap98#~WN+-YKYFfI#I zb$yDRgGlxh4la>hTrl?*{d4j_H>oTVa&XE5*r>ax@D`QOp@mC!;xT83Dhkj{Hu5yt z<-0S$U7w)NG~)VnmSOosjusxAm=gz`NYn-e-(Ke$)TxbKD=M#&1x^^O3WC- zSwPr_v5gTnQ!Ia^O34gPDv7{ihX)8>%BMXD0@*_ql`#kcQwCn(8!S!hM!@cJIeP)w z@wP>R3k+C40eOB)8E0mzPxlvOI-iYwJRlBqMvB$Qr#P%r3!R_wH=&WnQm zFE;AfajlL*GXAMhg_wF%q84;#k#9SZC*XpcBW!=iDSa+sqf3@ETi6ieUXQhnIYYhz zwZX)X&m}hefn87kd7m4n_%VW1k93`CJb!TN%3_yLk{`hvAA70sk<3fFaY%qDaEQI` zNm;LM5AP-g{y*efibn?6i6ib*{*$g>fSQbi1Chn{xuI(jb_Vkb%j6?}63NI9%q8tA zm5P7=OMJ(e>ZK7=XQr-8C+clO?pP=S$O?~1cejss>a9CiOu1+e^L7lSIbdYE6p13{ zjlwr#e{$PL?6(oGrOTAbc7u81WbsgAIJF0Y~)lQ9gCWh>GE1a(YSbxmri9l z_AY~-htj&wSO;fP2Q@hQYF;`N7GU4$x&u`RJ?G{hV>XODv<}{(qvf2#K@gh=JAHrA z*X7L}e?A69C!ZcDwfL~MKstyBd?SFQCr=8}rHX?+;wM9o}}%7?HQM|y-Z(5>bgOt4#emT+btLq%`u0kpC=4L&3u z{!6LHrLp)NHHjdu-nE-UYbbv^ERvEkS7aRI@=_WKn7fdZKv4ElZpB(ljlRMb#!eBP z)*U3L35TNCL(F307+v(C0-1=W)DX7-aigusMyX&fvGpvqF zH6Tk{;vV1!1Je|kY^B@CI~9r=9M>uuGi-jKYrn!h&;qh+EUWkEZrgv7%j}zXwJ%&* zS(ehXqPk?o7O-^3`eWpE^9hv>ntR+k1VF(fi5A>0jIO!2ZhX7-u06V9Z3^`9Aglo2 zREzcowmMh7x5}YB;|O1=d!f$FVU-&V?f|Ikr8~UfsND*a)_z7@&oT_Ndgij6cSkO| z=lp?+Ty@zkItGh36j*;^(L0D-t8NI1V0WprOBvp`^XMBh^(6{(-_8H{1$YKL&(S~i zF2w;l=&2U4tIfTlgE7nLGZA=g%_bCOX)j3-Dg{q~{-Q_|Oy4ui@YXU*j==B6op@t% zesiK?J8T(rYfBzBSm$R8H^HD%i4eTc5hll|Klf7BZ1yKCw*KQk?}hs!tq1*vvp1ul z-23n~9ZB&}>=VBBF{K|U9@}~#Ju$A~)gG~~{(E6WtN-?kZ1tZL;d-(Vhu%Nf)qghV zeXd5UBhkLuib!;pcLmrV^xVF(u4GrVUo1yqI>diN^#7vstBOwSW~wrnylLtt8J<>D zU$nBm0-F3`cikIUXwEL6b50|pwDq9-4Lkaj7A2Oq>vl1XxnwcNEpXF@k(UlF-acs? 
z{M#dS)6d6eQ#ii+_L4jfp?^UxM^9ib=>-l$iU+5bdx3khk>BMSSPIRShKF+IK%W5&uHj zua})G+|OO?tm@zM#LitUo49ncnIiqfZ3#E04_9DdR$%( zM|k}`I9XF~m~Gd(JTO-b{4*7F%Xc*#E$cK(R;B1;=o$254Xt7Z37%v?4-@4bW5sm( z2q2I1l~5J!|1KYKcEbF86`wkQwA5vavi+B!i7$Z?W3>{*VOWFz0kdOY85Xm=dVU=Z zcc}v{-U9#tP6_}3IFnI$9Fxv}29s%+8Gn^l+m72d5PjEI3{)VY60d3}2oe}^fCBA4 zwM~)aO%McH8rei7QYEQo)nNaMIsn5FLOC_I5VWE8>KDCwr)D$24oY6eyQQI zpauFh#kgiV6O>xjv}wgOHY7?6&l6pcEBmlGnHg zGD7QrDv8lP0Q!D{kD_iXZiPI7vtrP~7h%uvS*kA*uq-jBfW*J(<(-dONqaWV{)=ER zV#G1CN z9rY6*YXr|0dhU;6XB72|jYoZ$C3~<%NuHV_{*{8g*Tf3*%kKL+jocSHtxOc*-aEjC z=w2Y2?XID%Z7f4v8Cu1D5!^&5_@(dWbMm@yXX)^7@d(dhx8l;f9`+~`!GGuQ4y_uq#2oT{?^4NiQ_;G$M^Fl_yJM8(3ps?xk}i&pnb6JX(+3(Xh&3Rw@ldAUfz= z@~Sh!^sWV&e>s52#Nc4adzfpJ5D>^^WU-+0Un?~4Z%sd^23gW>N`4IOJyah4y7`qZ?RAgbJ zsAu0$d3R0aapiVKy}+-LT6ZR7fe$RYsIH~?`(jAl(e@i-ECjFu4=El>ax?q31%T-N1qM!Y)PQqA` zn=oDJI<2*x6+<)lG`LzINl2{u`lSCG5t8=ZN{ab@y7vj4Xob7Q*(SxD0` zx?yeEuy5>*cIav!54N12HSWrASD|~>>=zB=;vlWca`TUWL&48bhF>QGk1=h~LHXi2 z-1Exd;6@`AiTuVEy})k2z7D%>wtjaV!cVPLTJXw?hS7hsaG#L_gU+Uh&ZYr}&ZYu~ z&ZYy0&ZY#n&ZY&F9SZ20#ik}6002Bdmu$oZD}Q^(mZR9VZU26YsDn3+*T4&eFam*~ z!$5eCZ~}e(BOe)H`j7ADO+NbmAKkhe zvH-^L|I@K2`CjqnjmV*(vJtHb5!I*;BI&G=d<|AS$LrE>7B}1x6!JgS6I>8QZ8N+> zJ%0nsAe?l_FxZA9A!i*LI8-JQEDy6~BZh9`emgY+~oK{+3ROQ8fC>2(sNA}F+U%KVTT|HO#LIaruF-{tAa4D<7WIp zn>Xnb?t%px&;`587Yn6o2h! 
zl#P6X%peRn;i%qQ#aYtZeH#wZu~5fE=i&nGP7An$5r`yz58lT?RL6hR(Vy7YiKs4} zc00j07y=mcNSpdC!EZwyM`q5)Y&5QDgiHlP?qCA_W^snNio>z(f4G*P*W8OAT zGtuxtxCHW9^vg5l%R%X09lem_7JsFYyUpm*l6!WFnT_i1ARK%f6|ZbnPzQbz2jkSo z{-0kzaUut5MfPnB7ySc$kcAPXGlhiRm0yIZ%FJ$=o$Ag?wy}8xj{GCeb{eZZ?uXA= zepxH1Y4?lZ_F(B?dgTZ>01i2tn4>_IJ#oY$2CFX-t@-oD29w}FVH>(Q=zr^=`IXI6 zYzcNwrJF8`-qQogmb`k(`Qz~pi0ZhPN>iQX;v_HO4n}e4_iST4jBhgzy`&}x6})5T zA|KGU!ZjhAVV3kT_v^vjDui&K zt`Du@4=AO(-{62UC~TB3U`}FVPbE9>-0o zcmuci|ISc97`h?%yP%1JXA+}=#^%8mb!f}z3L{PeL*mrfpeq&{$ndb-L98~$nwR!au2{Y8z(Dk>lAEC@#4^extZ72@HR?z~GzboNG&oT|v zwfpA9DE5vdZ??$j`hR%*oWVN>sPaR$uJSC6`*!bg=jI@`9N$1^=YrkY{?vrk5mBlT z2hKYPew(0QNIs3$^=%jr&(opcliyzgHlZC2K#{AH;JPVscy{5XgCvqTVk-^hX&US9 zC<=UAfnJy@czIqFGlQ4AIf#72X-)V z4Mc9EN1ecVh<_2&myY@zf&49+%hhgC)l%qf-NE;vIdxT@jQ>ya%1`JQJ+_30Ac2!3 zRqt8pf-*PoJ~keG@F{)d)Y~C0_|01V?A+QNj-teOqu#PrP;DioiNA!MDD6SIY;F|H zXzt_&HrNdDN*=kap7RkpY4!0oXn?Bl{0_m0_nPl7ihq}cSk8N)p)HrqS|2LIZoSSJ zIpU|pMFHI{4Vg0Mq%!CzN&Fg&Wssp~DR4cI8ycyCuC|Vq)?^sz8^&ru|sNWJ)lA4!@2^r@yOCsk?rnz~@gyl7D!MS;bBC1j+*@CuRMW2uO{&Y276X z4T}(!+vV<-)H$hb_fVvEMbh9$@V_R?A0J!X5Al2DV!$atxw$)VAVquUfW4)hdTn5q z!V)i(E}%OZxe3TVHtZmR{0{xa9JFst>v);wQ8JY%2O8TZ`OpHE*Q9b@F3tzT>m0x$l)+Lsh(XXMgY%*x)T33xfTZI$ z8bOKsjznJL=N-q-pPuGn_9f^RJkRyYkb&L3&$ljvA_%D#;$V@T%__Bs_)`XNB@uxT z^gEJqi;gPj-%Ld-Psm`e@q8=<{E}0bOn>K3_lw8wg+;6*qI3MM#k}t=J^i4;zO~m^ zn&CempFa2@{QYqg8b0{U%*p0Dm|)wwE`uz7=x{`)mcVk7L+uh~0tzi_UvnkUr#lKJ zzBS@oV(2eo=$UwxY_=mBGOx#lqXEJ*TA3<^?7=DS;GfTR2oaJT$al@PLQykoecQ=VU9;}RPL@+ji3Z|4Ohh%hd_!t8A zuZMs<&HaP_vkv=Jdaa*_0d~HqD!}zxPC=kHcr*cSa3DL1VE{6XdKlW^facc4= z^^>Wpo(F-V=J?Qt1jzjmKjAE_}<0OPm{Eome){^BJ#UR}r{fNSu(z-; zf&n)Q6NiX|UTz}O3xzClEnqF@E+CL8fM+Ckh$Oy|?sZ#Lgu>s0Hl5!^{?DJD(37dD zq6nhu=>P(R2%_?d+kcl*0p1c4hZyg8K&-xU^8P;TU=SqXx9ELa#n0w#puJ>zjy@ka z-|wz2MEFT|yr^TTEY67&D7X%lqp&XACP2`aJ()j2hxq3IZrCf~jM^=C;P|*#F@Hj> z{EALdQ8WHYC^~RexCND$X2OjR`3t`{%Bzo)S6Oh}6-)F(3x5uEZXJ#fjhU_>{UZ@J$x=2Ok%u7oc^`ceE^=99zz*;P; zY&RkC2J`dJ=bgJ7V)^7f5qgrlc~ZZUm4|X%->^4FQO}U-OApP`iaOf>LF66e>&(wb 
zMUy_7W+^A4rhf-S@PTxf<27_!hjG1VMTiv00CR=V-KNpMAl8-tvEBV{>fx~yj%Uen z_p(;`>CI{cuof$HVo>-Nr;A4zh%wFz{hJS~{i4HF-Z5}AA67jun&MIw+ zBItk~%$Y2c+}h!z*MCXWUx7bM=3me~Z)FDxbeK@S`{{Vr?^#5e+-b6z3lNM>6Y$AR zJtIQq?kE&T-y%oVFPkhnGWvKHSd#=SEH}EgU>x}yOMtOo^Jx)*=f=Lq3ZUWc4x$kF z{gA7?*?$^*e<(lu{AWKid*7b=HpAgVM$Oi`>~Fot@=QwvA+fBP+mrsqOc`uzWI+-n z-op1m6ScLUv|AE>4q=tLiwNM>)&1nRXIRB1$O8hAN&U|C1?CVll?j=3+p*^z0g=dC z@UiW~%>Qta=FL+!2x#EyJb!+s+i)%B!Bm8D)S=SH2b!ryp(0_c1OJT>Su+V7EzgV==G%^`wB*^0#AY!F10WfJdPw%)JTgCsE-@fR^oAzeD9LaL{0O9iQg7N5hGm~s{$#IIi__Yh%i~tkHVU(D52g4Nl-ivDOjk{$Sad z?ss7D2w1hbySta-_yWG+PEGIxeXNi`d4CY#^^`!Vxg>s7?l)IO%{v}O8PbDPPlyHZNtfIxMu32>3jL#|E5oS+MaXHc~-(%ts&skfki zYMUoqhcfNZ%9W-Ko2yapMdNzlfiSR-zG2Ojs}TnpxJ;q0N?W+%;Mc0#irc!zc+P`i`AG-b%GpxQi*S9z@Zy1)j1jyK-`prD^8nTkR?N>M ztF&?+V8}wv+9KX2%Fca$2P4?a34iMw{4{Rj{^KaZ|MQp6MX$32bGp-479aS1RGP@r zH>y`-qdHg%F~zaB3J`H*%5sMg2=#YlKX+h$C5p{fBQJGc3)*3yoqJDhl-Pvbi|G5@ za4-glxoPw`5?s0wL4QlMRjqe4XVQi=+qSqwcsN^=-r~(3G|EEy(0tIlmhhXsU zQp=AWx!;^9Ff10GkALGd>$nn%@$``A)-?P0lvL#KfGR*08l>+ou-VUkSvS@hk}#TsTCTVc(!EHH4y`O)zi;yNw{_z${Jiekehiv0{-I}{ zUDa-i<>bH?FQv0`5>*nOPm|(y;HkMHY!{XRpjDR%Aj%z#Lw~>Xr@y%VRIk5Y-jK?8 z4VX*U7Nw^*8B~lw3h_ts(9txCaC$?*QFw-;)37xkBt-lz)h0da>BmW-|IHM~wKwe= zbS1WcT*`>=7r;t@IhJD(Iz`w+f$tV>qsLJKcLYlPZn<7%1Dkl`sZ=81(jSA+p<7qA zea6|Z?2YeaWq(2nd=!|riLd0Q|FEx)_HlP7?%(;=-jF6MZ1GKJlj%K;1Pzq#?95u+ z$|E|+(>YDIXNKgt50MYOLw)VgI(ya?&XK6Q>%(Kp3qgKV>Ox#&P=vUa1rK&E_3&f3 z?@$>BpyQ}hL#ZV3rYIXU27H<+^KdgT8zVNyJ6m+Df5op10Dzx$nQ2{@1$( zZ@FcdHmGMsH zg@1I!PC=a$3vH13x!qJFL<1<-sN2O z2ao4Hw%3N)viyfuNx@!iEF{cs#8RE)*#5Yr_;2R@*+suKwd0PBRM*p>A9L*%(KBD{ z_3Lpj(pT7rBY%$O!n*~kct$r_^BRxeQGZU=n1@xcrLXTjJwHo50mP}|w3HRF54v0> z88-;UGqNo={-$Kd2( zc|DGr>?DQ*u3H5G|Am!53E#U^2AoZIQ;<@M7mQSg`@I0la%01{KnCZC8XMb}F zY9GzRzu@fS^l$825Uh1R``n4rs`E{u<#TtklHy_QD^krGVBq$YlgRm`q&xDL%Jwbw zu&wd%mNbg$dv)1MMYW#J?73f)hKAk27hn&J{I5Z&3I%F%i}z6Jtz2bNcb)9k9#z4Nr@g2 za2Bj|bnT!-*czLaJXYtVYzt-ZOZE~!G$)fQLE5w&^tSwb&-vb^%3{!@L*K14Wl>TY zln_{)Q95Q?8!Y_-*f&JM4f*{`}O0}4T-&j4BNWi 
z3eb-`Ur*RRhlVR3lyiI(=+$tPY?uxF0Tybm5dI~e*tsr9!(uDrWAjyo64%NW^-WPqDvd80Eg_?2yC^C%t51Wd6s3~BS(TN(MucA}k;)e#0k_q^Zfhi1%qydk2=Ek3y{-S5ny z#g8}C)$6=bezw2$ZDzvv;XFr%n#--K>-MyGV`0$)w z5TY;eesW7+{+@P`lHr{J3a%H>PVTz$%H#K*?K|1&rI(t<+%EfBphWuAnlgs;@f6ee zDfo$=l82MHb}x9=^su@4!rdb39pktFVZ+ng%)*MIP5V5T$efg z=R8;HSATb9aVQ+s>N8N+17DkZ<@o4_Vh~=^jl{W|M9pxlEy5!E2w97gGMTjm0aT8 zE#TK%wx{CqU^u0ATJwWM+4-$Aiiz6KL-*Z%>3=RaVIPft^>O@w|G@ilJ-f35?lVje z11YYL&Btt7b$a`}s|Qj#?r3uD{=1Tq5`M)zbF-ilji=I|yUgtCFA4gIz|L11->zNj zo6MaH{*7u4bzeKaYqZ0U9B<|#oR%Mc=o7W0=3Jjik@EvK}!IL#Xaw^=_Yrv>( zK4NLwH@aSn`@@OQ=^Z^UxXFbMMParbb;1M7`Uc!I?{DnSIX=!~lQ$W$&buSn(MibO zUK?D$f1;-Um=O0Xs%Osl!-(;7e(|eITP4&k7BUTE(@Tc@L8gjAM$phC_11%&`y{Og z!%FZ!-j_{ISGDt+s<^BY_R<)#8GBRD*djQiz9BNiBKh3wItEk%% z@3SmBc4L&=@&2RjG}=VqAkflkSQ8i_QdmrvjG9QYHQrgQpcSPi+8NgI#EA0Uia~w4 zGT&K>cTV1*J-NR6^*?mK3DfkHhOI4w!=@@9rp?O5!{lDw_i!0}P&Tk*Iw`xUp2q-1<(>va~jYLpTF{${VgAaAfrk=;$<@w^jD>3X$|PdCdNg{>5deD3T0 zWJz%gACF^Gp=(Ta?&OTQu}#|TF!2wS@`df)MK<<|8F$ghQFq@@i-Zi%=9Vhw8zE~; z^Io1AoYMTB>w8=72!-n0tgdy@{p_4s5W~%VPCoG11MisZtYAuZw2ffCIg>6jU3KW= zky-oPTG!qzUfUJvT?QTJ8~1K~a=tb;)$x^B?nnwIT4mnJ($(m&&2 zsi~^8fS97$vgiR@?(+ABH;)D^ntZ8}d&SeSyv(ylF8}XyuM?i0LErC?G1#i-V0WP2 z@ZH&Ky0SE{fIT}rm(S&0I`{N#-l2~8%x^Qd7%rZvFzcxjUn?1)44{)r(YP`l$tEUlBPf^;q`u zst@Fnrqo7}?O92;R}eQeDkvo0P(6Ol@NjkI@6dgo`i%En&TA1!O>jpKdW zyN;T=x?3eq+St@kAG*}V*S2E(1_U@IN338jb(j^tIuXyEQi_QnL zmCq{rk9+0xJ6v!|jo$ORe@%ZB|ISNXm5%bZs~4Avk*3LfJt6c<+|{EeUYYHqUhBTO z>1|^BYsF7(`;OlvU*8tFva!Ut&oBIlA)~Hgc3^GsBW*!x@3@z@UrbfYI@ksDd%vvg zzIpp>)Mkc(fClqvs&B8HWr<&_=uFOJm4o`4&RtnckL?gUrQ}KNsw>%mY|4?wjCc~N zshZNxOhKhG`88;VLXASV_>}UCGr~1jGH2tJ@}9bl&W{&7S*H6%IdP5Y`6$KQcdFdp zH^p?%OyPFZVzujBq$@_NuZ-tr$$NFAnKY`-pAf#{oEH6H@2z9HOZWF3)Y7l17R?%K zI?$9A0hC^SmTyTp;p-@*_(@U0Kf1>?dM;N>`;lvDyIPdTgTu|#y48y>Nyo@bDI5j+VD8+Ze9^-9vyJl@INgPffR&#yDv0aGB+fqpSfrUg#4Gdu*K!SF| zei-`YPZORbAIb~<9>yq1fsqvcp99W>U!L8yaM#o8D2BPbVC00XT28$ z4v50CTL)lS*YH2hAZ`GY1K1!m4Mc#&gV=6_5y`r$fQ&)R0J)P0G>5Qd02#toBJlYf 
z!X6~8m~~wO9u8r2WOpswJ;}3w5uh;P?D@73CGKmJxWO(ari(oNUCCi$nn=$4FCSho z%fwcp*&-y6U__DuTZb_nG@Z&OWsozB(a;EaHi?6wVayCwR#+fDU^9a0qx&^TKy#2J z2GU0`U6h~Bp@9)h6IC6?&33a!AMn_Vv?OSnIAGNAfyF!a=z& zi5CoxVid4p6r-ZfE(=Bqh?#);$pjKb;O7bqg(Z2jJ!qhB6jMi&!`Y+>tiEA}s7L~b ziddAA{*%f{RA4p+Yb)L7NLRVx?N z?Awt(wu1@?pMXaqRRB;XF(r_zj&KGr2Az;#MV4%bnrI=c36lexwjtug0COf`lG?x6 zx-96Pgp(z@i9@>IF%$H)9fuMiAupXl(|34!-CS5#384K0E^C4}0YN`7L&Q0hfQBEK z7Lsy~fYdvP7*Lyn+oTB)isuxp^e7Dhw@MLt;((}>A-^91^Q&NJ3bQ~>vk^eAMPz^> z58Teq({Mbr+rViCR!^FS+iaQ*l0bPLDwp5>mSZTaFrVcp3p8dh4J7j!3#^fc^+a^> z5W)wVW?-kD{6{DvvzQTLF-U;507?NTW}*4|7@<(+;0dpsg*A1?e<|w_Z4_Lag_G_- zhpj?nghAdMW{ccoU*&+(Je;8Y;w(`P9G!=;JshZ-hj*<0#jIEfpv8krrL4fINaPYi zh_yiZQV!xYWQBIhf6#UZhX;z)v_p>Qz{_I5*nx6`74vW-_>f@Wc?-6BpM=54xN!FixQd72tY5(IgZ{Sfd7TY z&U0wBAY3soF7TBPR|V0F$r7Mk5YED9gis`fa1*2kC7@dcn%RBL5`y!mfDtlN2yRIl zKMTYGeI;2C?C*x_h)}piaXln8gn++AaUH}YoCVg%_Oq<37#I`9wGhQ9@Kqca1x8}< z+6DbVEy+EAWQL;wH)