From 3ca1a390528220a996cef750430a8502d2db5548 Mon Sep 17 00:00:00 2001 From: alanlujan91 Date: Mon, 4 Mar 2024 17:06:16 -0500 Subject: [PATCH 1/8] add kwarg weights to get_bootstrap_indices --- src/estimagic/inference/bootstrap_samples.py | 65 ++++++++++++++++---- 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/src/estimagic/inference/bootstrap_samples.py b/src/estimagic/inference/bootstrap_samples.py index 4163eef56..a4561126e 100644 --- a/src/estimagic/inference/bootstrap_samples.py +++ b/src/estimagic/inference/bootstrap_samples.py @@ -2,7 +2,13 @@ import pandas as pd -def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000): +def get_bootstrap_indices( + data, + rng, + weights=None, + cluster_by=None, + n_draws=1000, +): """Draw positional indices for the construction of bootstrap samples. Storing the positional indices instead of the full bootstrap samples saves a lot @@ -11,6 +17,7 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000): Args: data (pandas.DataFrame): original dataset. rng (numpy.random.Generator): A random number generator. + weights (str): column name of the variable with weights. cluster_by (str): column name of the variable to cluster by. n_draws (int): number of draws, only relevant if seeds is None. @@ -19,17 +26,45 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000): """ n_obs = len(data) - if cluster_by is None: - bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs))) + + if weights is None: + + if cluster_by is None: + bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs))) + else: + clusters = data[cluster_by].unique() + drawn_clusters = rng.choice( + clusters, size=(n_draws, len(clusters)), replace=True + ) + + bootstrap_indices = _convert_cluster_ids_to_indices( + data[cluster_by], drawn_clusters + ) + else: - clusters = data[cluster_by].unique() - drawn_clusters = rng.choice( - clusters, size=(n_draws, len(clusters)), replace=True - ) - bootstrap_indices = _convert_cluster_ids_to_indices( - data[cluster_by], drawn_clusters - ) + if cluster_by is None: + bootstrap_indices = list( + rng.choice( + n_obs, + size=(n_draws, n_obs), + replace=True, + p=data[weights] / data[weights].sum(), + ) + ) + else: + clusters = data.groupby(cluster_by)[weights].sum().reset_index() + + drawn_clusters = rng.choice( + clusters[cluster_by], + size=(n_draws, len(clusters)), + replace=True, + p=clusters[weights] / clusters[weights].sum(), + ) + + bootstrap_indices = _convert_cluster_ids_to_indices( + data[cluster_by], drawn_clusters + ) return bootstrap_indices @@ -48,7 +83,13 @@ def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters): return bootstrap_indices -def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000): +def get_bootstrap_samples( + data, + rng, + weights=None, + cluster_by=None, + n_draws=1000, +): """Draw bootstrap samples. If you have memory issues you should use get_bootstrap_indices instead and construct @@ -57,6 +98,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000): Args: data (pandas.DataFrame): original dataset. rng (numpy.random.Generator): A random number generator. + weights (str): weights for the observations. cluster_by (str): column name of the variable to cluster by. n_draws (int): number of draws, only relevant if seeds is None. @@ -67,6 +109,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000): indices = get_bootstrap_indices( data=data, rng=rng, + weights=weights, cluster_by=cluster_by, n_draws=n_draws, ) From 38bdd3e192c4fe8b3327495e0e9ab7e2aa8413a0 Mon Sep 17 00:00:00 2001 From: alanlujan91 Date: Mon, 4 Mar 2024 17:06:44 -0500 Subject: [PATCH 2/8] fix downstream --- src/estimagic/inference/bootstrap.py | 5 ++++- src/estimagic/inference/bootstrap_helpers.py | 10 +++++++++- src/estimagic/inference/bootstrap_outcomes.py | 5 ++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/estimagic/inference/bootstrap.py b/src/estimagic/inference/bootstrap.py index daf48151d..124a5a4c1 100644 --- a/src/estimagic/inference/bootstrap.py +++ b/src/estimagic/inference/bootstrap.py @@ -24,6 +24,7 @@ def bootstrap( existing_result=None, outcome_kwargs=None, n_draws=1_000, + weights=None, cluster_by=None, seed=None, n_cores=1, @@ -41,6 +42,7 @@ def bootstrap( n_draws (int): Number of bootstrap samples to draw. If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes is used. + weights (str): Column name of variable with weights or None. cluster_by (str): Column name of variable to cluster by or None. seed (Union[None, int, numpy.random.Generator]): If seed is None or int the numpy.random.default_rng is used seeded with seed. If seed is already a @@ -59,7 +61,7 @@ def bootstrap( """ if callable(outcome): - check_inputs(data=data, cluster_by=cluster_by) + check_inputs(data=data, weights=weights, cluster_by=cluster_by) if outcome_kwargs is not None: outcome = functools.partial(outcome, **outcome_kwargs) @@ -82,6 +84,7 @@ def bootstrap( new_outcomes = get_bootstrap_outcomes( data=data, outcome=outcome, + weights=weights, cluster_by=cluster_by, rng=rng, n_draws=n_draws - n_existing, diff --git a/src/estimagic/inference/bootstrap_helpers.py b/src/estimagic/inference/bootstrap_helpers.py index 4c619a2dc..6f43138e3 100644 --- a/src/estimagic/inference/bootstrap_helpers.py +++ b/src/estimagic/inference/bootstrap_helpers.py @@ -2,12 +2,18 @@ def check_inputs( - data=None, cluster_by=None, ci_method="percentile", ci_level=0.95, skipdata=False + data=None, + weights=None, + cluster_by=None, + ci_method="percentile", + ci_level=0.95, + skipdata=False, ): """Check validity of inputs. Args: data (pd.DataFrame): Dataset. + weights (str): Column name of variable with weights. cluster_by (str): Column name of variable to cluster by. ci_method (str): Method of choice for computing confidence intervals. The default is "percentile". @@ -21,6 +27,8 @@ def check_inputs( if not skipdata: if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series): raise TypeError("Data must be a pandas.DataFrame or pandas.Series.") + elif (weights is not None) and (weights not in data.columns.tolist()): + raise ValueError("Input 'weights' must be None or a column name of 'data'.") elif (cluster_by is not None) and (cluster_by not in data.columns.tolist()): raise ValueError( "Input 'cluster_by' must be None or a column name of 'data'." diff --git a/src/estimagic/inference/bootstrap_outcomes.py b/src/estimagic/inference/bootstrap_outcomes.py index 833c52d58..08815d72e 100644 --- a/src/estimagic/inference/bootstrap_outcomes.py +++ b/src/estimagic/inference/bootstrap_outcomes.py @@ -6,6 +6,7 @@ def get_bootstrap_outcomes( data, outcome, + weights=None, cluster_by=None, rng=None, n_draws=1000, @@ -19,6 +20,7 @@ def get_bootstrap_outcomes( data (pandas.DataFrame): original dataset. outcome (callable): function of the dataset calculating statistic of interest. Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.). + weights (str): column name of the variable with weights. cluster_by (str): column name of the variable to cluster by. rng (numpy.random.Generator): A random number generator. n_draws (int): number of bootstrap draws. @@ -34,12 +36,13 @@ def get_bootstrap_outcomes( estimates (list): List of pytrees of estimated bootstrap outcomes. """ - check_inputs(data=data, cluster_by=cluster_by) + check_inputs(data=data, weights=weights, cluster_by=cluster_by) batch_evaluator = process_batch_evaluator(batch_evaluator) indices = get_bootstrap_indices( data=data, rng=rng, + weights=weights, cluster_by=cluster_by, n_draws=n_draws, ) From 3d0467878db33403a5f46b79d4798a3ddae3c219 Mon Sep 17 00:00:00 2001 From: alanlujan91 Date: Mon, 4 Mar 2024 17:06:56 -0500 Subject: [PATCH 3/8] update get_moments_cov --- src/estimagic/estimation/msm_weighting.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/estimagic/estimation/msm_weighting.py b/src/estimagic/estimation/msm_weighting.py index f522357a0..9820ac45b 100644 --- a/src/estimagic/estimation/msm_weighting.py +++ b/src/estimagic/estimation/msm_weighting.py @@ -24,8 +24,8 @@ def get_moments_cov( moment_kwargs (dict): Additional keyword arguments for calculate_moments. bootstrap_kwargs (dict): Additional keyword arguments that govern the bootstrapping. Allowed arguments are "n_draws", "seed", "n_cores", - "batch_evaluator", "cluster" and "error_handling". For details see the - bootstrap function. + "batch_evaluator", "weights", "cluster_by" and "error_handling". + For details see the bootstrap function. Returns: pandas.DataFrame or numpy.ndarray: The covariance matrix of the moment @@ -39,7 +39,8 @@ def get_moments_cov( "n_draws", "seed", "batch_evaluator", - "cluster", + "weights", + "cluster_by", "error_handling", } problematic = set(bootstrap_kwargs).difference(valid_bs_kwargs) From 7029abaf8f50ae76523605428804e8eb60ae0510 Mon Sep 17 00:00:00 2001 From: alanlujan91 Date: Mon, 4 Mar 2024 17:07:14 -0500 Subject: [PATCH 4/8] add tests --- tests/inference/test_bootstrap_ci.py | 9 +++++++++ tests/inference/test_bootstrap_samples.py | 15 +++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/tests/inference/test_bootstrap_ci.py b/tests/inference/test_bootstrap_ci.py index 5f5644738..b5379688c 100644 --- a/tests/inference/test_bootstrap_ci.py +++ b/tests/inference/test_bootstrap_ci.py @@ -88,6 +88,15 @@ def test_check_inputs_data(): assert str(error.value) == expected_msg +def test_check_inputs_weights(setup): + weights = "this is not a column name of df" + expected = "Input 'weights' must be None or a column name of 'data'." + + with pytest.raises(ValueError) as error: + check_inputs(data=setup["df"], weights=weights) + assert str(error.value) == expected + + def test_check_inputs_cluster_by(setup): cluster_by = "this is not a column name of df" expected_msg = "Input 'cluster_by' must be None or a column name of 'data'." diff --git a/tests/inference/test_bootstrap_samples.py b/tests/inference/test_bootstrap_samples.py index d05a869e5..54da258bb 100644 --- a/tests/inference/test_bootstrap_samples.py +++ b/tests/inference/test_bootstrap_samples.py @@ -17,6 +17,7 @@ def data(): df = pd.DataFrame() df["id"] = np.arange(900) df["hh"] = [3, 1, 2, 0, 0, 2, 5, 4, 5] * 100 + df["wts"] = np.ones(900) return df @@ -32,6 +33,20 @@ def test_get_bootstrap_indices_radomization_works_with_clustering(data): assert set(res[0]) != set(res[1]) +def test_get_bootstrap_indices_randomization_works_with_weights(data): + rng = get_rng(seed=12345) + res = get_bootstrap_indices(data, weights="wts", n_draws=2, rng=rng) + assert set(res[0]) != set(res[1]) + + +def test_get_bootstrap_indices_randomization_works_with_weights_and_clustering(data): + rng = get_rng(seed=12345) + res = get_bootstrap_indices( + data, weights="wts", cluster_by="hh", n_draws=2, rng=rng + ) + assert set(res[0]) != set(res[1]) + + def test_clustering_leaves_households_intact(data): rng = get_rng(seed=12345) indices = get_bootstrap_indices(data, cluster_by="hh", n_draws=1, rng=rng)[0] From b8644dd35dc49af7ceb6cc18ae8e32ec772857c8 Mon Sep 17 00:00:00 2001 From: alanlujan91 Date: Wed, 6 Mar 2024 08:31:36 -0500 Subject: [PATCH 5/8] change weights to weight_by --- src/estimagic/estimation/msm_weighting.py | 4 +-- src/estimagic/inference/bootstrap.py | 8 +++--- src/estimagic/inference/bootstrap_helpers.py | 10 ++++--- src/estimagic/inference/bootstrap_outcomes.py | 8 +++--- src/estimagic/inference/bootstrap_samples.py | 27 +++++++++++-------- tests/inference/test_bootstrap_ci.py | 6 ++--- tests/inference/test_bootstrap_samples.py | 6 ++--- 7 files changed, 38 insertions(+), 31 deletions(-) diff --git a/src/estimagic/estimation/msm_weighting.py b/src/estimagic/estimation/msm_weighting.py index ab7ad611d..a83876216 100644 --- a/src/estimagic/estimation/msm_weighting.py +++ b/src/estimagic/estimation/msm_weighting.py @@ -24,7 +24,7 @@ def get_moments_cov( moment_kwargs (dict): Additional keyword arguments for calculate_moments. bootstrap_kwargs (dict): Additional keyword arguments that govern the bootstrapping. Allowed arguments are "n_draws", "seed", "n_cores", - "batch_evaluator", "weights", "cluster_by" and "error_handling". + "batch_evaluator", "weight_by", "cluster_by" and "error_handling". For details see the bootstrap function. Returns: @@ -39,7 +39,7 @@ def get_moments_cov( "n_draws", "seed", "batch_evaluator", - "weights", + "weight_by", "cluster_by", "error_handling", "existing_result", diff --git a/src/estimagic/inference/bootstrap.py b/src/estimagic/inference/bootstrap.py index 358a642fe..507d96eaf 100644 --- a/src/estimagic/inference/bootstrap.py +++ b/src/estimagic/inference/bootstrap.py @@ -24,7 +24,7 @@ def bootstrap( existing_result=None, outcome_kwargs=None, n_draws=1_000, - weights=None, + weight_by=None, cluster_by=None, seed=None, n_cores=1, @@ -42,7 +42,7 @@ def bootstrap( n_draws (int): Number of bootstrap samples to draw. If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes is used. - weights (str): Column name of variable with weights or None. + weight_by (str): Column name of variable with weights or None. cluster_by (str): Column name of variable to cluster by or None. seed (Union[None, int, numpy.random.Generator]): If seed is None or int the numpy.random.default_rng is used seeded with seed. If seed is already a @@ -61,7 +61,7 @@ def bootstrap( """ if callable(outcome): - check_inputs(data=data, weights=weights, cluster_by=cluster_by) + check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by) if outcome_kwargs is not None: outcome = functools.partial(outcome, **outcome_kwargs) @@ -84,7 +84,7 @@ def bootstrap( new_outcomes = get_bootstrap_outcomes( data=data, outcome=outcome, - weights=weights, + weight_by=weight_by, cluster_by=cluster_by, rng=rng, n_draws=n_draws - n_existing, diff --git a/src/estimagic/inference/bootstrap_helpers.py b/src/estimagic/inference/bootstrap_helpers.py index 6f43138e3..7e72bac82 100644 --- a/src/estimagic/inference/bootstrap_helpers.py +++ b/src/estimagic/inference/bootstrap_helpers.py @@ -3,7 +3,7 @@ def check_inputs( data=None, - weights=None, + weight_by=None, cluster_by=None, ci_method="percentile", ci_level=0.95, @@ -13,7 +13,7 @@ def check_inputs( Args: data (pd.DataFrame): Dataset. - weights (str): Column name of variable with weights. + weight_by (str): Column name of variable with weights. cluster_by (str): Column name of variable to cluster by. ci_method (str): Method of choice for computing confidence intervals. The default is "percentile". @@ -27,8 +27,10 @@ def check_inputs( if not skipdata: if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series): raise TypeError("Data must be a pandas.DataFrame or pandas.Series.") - elif (weights is not None) and (weights not in data.columns.tolist()): - raise ValueError("Input 'weights' must be None or a column name of 'data'.") + elif (weight_by is not None) and (weight_by not in data.columns.tolist()): + raise ValueError( + "Input 'weight_by' must be None or a column name of 'data'." + ) elif (cluster_by is not None) and (cluster_by not in data.columns.tolist()): raise ValueError( "Input 'cluster_by' must be None or a column name of 'data'." diff --git a/src/estimagic/inference/bootstrap_outcomes.py b/src/estimagic/inference/bootstrap_outcomes.py index 08815d72e..a1bb494a4 100644 --- a/src/estimagic/inference/bootstrap_outcomes.py +++ b/src/estimagic/inference/bootstrap_outcomes.py @@ -6,7 +6,7 @@ def get_bootstrap_outcomes( data, outcome, - weights=None, + weight_by=None, cluster_by=None, rng=None, n_draws=1000, @@ -20,7 +20,7 @@ def get_bootstrap_outcomes( data (pandas.DataFrame): original dataset. outcome (callable): function of the dataset calculating statistic of interest. Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.). - weights (str): column name of the variable with weights. + weight_by (str): column name of the variable with weights. cluster_by (str): column name of the variable to cluster by. rng (numpy.random.Generator): A random number generator. n_draws (int): number of bootstrap draws. @@ -36,13 +36,13 @@ def get_bootstrap_outcomes( estimates (list): List of pytrees of estimated bootstrap outcomes. """ - check_inputs(data=data, weights=weights, cluster_by=cluster_by) + check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by) batch_evaluator = process_batch_evaluator(batch_evaluator) indices = get_bootstrap_indices( data=data, rng=rng, - weights=weights, + weight_by=weight_by, cluster_by=cluster_by, n_draws=n_draws, ) diff --git a/src/estimagic/inference/bootstrap_samples.py b/src/estimagic/inference/bootstrap_samples.py index a4561126e..c1fdfa15e 100644 --- a/src/estimagic/inference/bootstrap_samples.py +++ b/src/estimagic/inference/bootstrap_samples.py @@ -5,7 +5,7 @@ def get_bootstrap_indices( data, rng, - weights=None, + weight_by=None, cluster_by=None, n_draws=1000, ): @@ -17,7 +17,7 @@ def get_bootstrap_indices( Args: data (pandas.DataFrame): original dataset. rng (numpy.random.Generator): A random number generator. - weights (str): column name of the variable with weights. + weight_by (str): column name of the variable with weights. cluster_by (str): column name of the variable to cluster by. n_draws (int): number of draws, only relevant if seeds is None. @@ -27,7 +27,7 @@ def get_bootstrap_indices( """ n_obs = len(data) - if weights is None: + if weight_by is None: if cluster_by is None: bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs))) @@ -44,22 +44,27 @@ def get_bootstrap_indices( else: if cluster_by is None: + probs = data[weight_by] / data[weight_by].sum() bootstrap_indices = list( rng.choice( n_obs, size=(n_draws, n_obs), replace=True, - p=data[weights] / data[weights].sum(), + p=probs, ) ) else: - clusters = data.groupby(cluster_by)[weights].sum().reset_index() - + clusters_and_weights = ( + data.groupby(cluster_by)[weight_by].sum().reset_index() + ) + clusters = clusters_and_weights[cluster_by] + weights = clusters_and_weights[weight_by] + probs = weights / weights.sum() drawn_clusters = rng.choice( - clusters[cluster_by], + clusters, size=(n_draws, len(clusters)), replace=True, - p=clusters[weights] / clusters[weights].sum(), + p=probs, ) bootstrap_indices = _convert_cluster_ids_to_indices( @@ -86,7 +91,7 @@ def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters): def get_bootstrap_samples( data, rng, - weights=None, + weight_by=None, cluster_by=None, n_draws=1000, ): @@ -98,7 +103,7 @@ def get_bootstrap_samples( Args: data (pandas.DataFrame): original dataset. rng (numpy.random.Generator): A random number generator. - weights (str): weights for the observations. + weight_by (str): weights for the observations. cluster_by (str): column name of the variable to cluster by. n_draws (int): number of draws, only relevant if seeds is None. @@ -109,7 +114,7 @@ def get_bootstrap_samples( indices = get_bootstrap_indices( data=data, rng=rng, - weights=weights, + weight_by=weight_by, cluster_by=cluster_by, n_draws=n_draws, ) diff --git a/tests/inference/test_bootstrap_ci.py b/tests/inference/test_bootstrap_ci.py index b5379688c..545c9e2fd 100644 --- a/tests/inference/test_bootstrap_ci.py +++ b/tests/inference/test_bootstrap_ci.py @@ -88,12 +88,12 @@ def test_check_inputs_data(): assert str(error.value) == expected_msg -def test_check_inputs_weights(setup): +def test_check_inputs_weight_by(setup): weights = "this is not a column name of df" - expected = "Input 'weights' must be None or a column name of 'data'." + expected = "Input 'weight_by' must be None or a column name of 'data'." with pytest.raises(ValueError) as error: - check_inputs(data=setup["df"], weights=weights) + check_inputs(data=setup["df"], weight_by=weights) assert str(error.value) == expected diff --git a/tests/inference/test_bootstrap_samples.py b/tests/inference/test_bootstrap_samples.py index 54da258bb..69c09221d 100644 --- a/tests/inference/test_bootstrap_samples.py +++ b/tests/inference/test_bootstrap_samples.py @@ -17,7 +17,7 @@ def data(): df = pd.DataFrame() df["id"] = np.arange(900) df["hh"] = [3, 1, 2, 0, 0, 2, 5, 4, 5] * 100 - df["wts"] = np.ones(900) + df["weights"] = np.ones(900) return df @@ -35,14 +35,14 @@ def test_get_bootstrap_indices_radomization_works_with_clustering(data): def test_get_bootstrap_indices_randomization_works_with_weights(data): rng = get_rng(seed=12345) - res = get_bootstrap_indices(data, weights="wts", n_draws=2, rng=rng) + res = get_bootstrap_indices(data, weight_by="weights", n_draws=2, rng=rng) assert set(res[0]) != set(res[1]) def test_get_bootstrap_indices_randomization_works_with_weights_and_clustering(data): rng = get_rng(seed=12345) res = get_bootstrap_indices( - data, weights="wts", cluster_by="hh", n_draws=2, rng=rng + data, weight_by="weights", cluster_by="hh", n_draws=2, rng=rng ) assert set(res[0]) != set(res[1]) From a39bf150d6bee9626d53dede2367b6fc4fd88499 Mon Sep 17 00:00:00 2001 From: alanlujan91 Date: Tue, 21 May 2024 11:53:29 -0400 Subject: [PATCH 6/8] add _get_probs_for_bootstrap_indices --- src/estimagic/inference/bootstrap_samples.py | 70 +++++++++----------- tests/inference/test_bootstrap_samples.py | 17 +++++ 2 files changed, 49 insertions(+), 38 deletions(-) diff --git a/src/estimagic/inference/bootstrap_samples.py b/src/estimagic/inference/bootstrap_samples.py index c1fdfa15e..bb631908a 100644 --- a/src/estimagic/inference/bootstrap_samples.py +++ b/src/estimagic/inference/bootstrap_samples.py @@ -26,52 +26,46 @@ def get_bootstrap_indices( """ n_obs = len(data) + probs = _get_probs_for_bootstrap_indices(data, weight_by, cluster_by) - if weight_by is None: + if cluster_by is None: + bootstrap_indices = list( + rng.choice(n_obs, size=(n_draws, n_obs), replace=True, p=probs) + ) + else: + clusters = data[cluster_by].unique() + drawn_clusters = rng.choice( + clusters, size=(n_draws, len(clusters)), replace=True, p=probs + ) - if cluster_by is None: - bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs))) - else: - clusters = data[cluster_by].unique() - drawn_clusters = rng.choice( - clusters, size=(n_draws, len(clusters)), replace=True - ) + bootstrap_indices = _convert_cluster_ids_to_indices( + data[cluster_by], drawn_clusters + ) + + return bootstrap_indices - bootstrap_indices = _convert_cluster_ids_to_indices( - data[cluster_by], drawn_clusters - ) - else: +def _get_probs_for_bootstrap_indices(data, weight_by, cluster_by): + """Calculate probabilities for drawing bootstrap indices. + Args: + data (pandas.DataFrame): original dataset. + weight_by (str): column name of the variable with weights. + cluster_by (str): column name of the variable to cluster by. + + Returns: + list: numpy array with probabilities. + + """ + if weight_by is None: + probs = None + else: if cluster_by is None: probs = data[weight_by] / data[weight_by].sum() - bootstrap_indices = list( - rng.choice( - n_obs, - size=(n_draws, n_obs), - replace=True, - p=probs, - ) - ) else: - clusters_and_weights = ( - data.groupby(cluster_by)[weight_by].sum().reset_index() - ) - clusters = clusters_and_weights[cluster_by] - weights = clusters_and_weights[weight_by] - probs = weights / weights.sum() - drawn_clusters = rng.choice( - clusters, - size=(n_draws, len(clusters)), - replace=True, - p=probs, - ) - - bootstrap_indices = _convert_cluster_ids_to_indices( - data[cluster_by], drawn_clusters - ) - - return bootstrap_indices + cluster_weights = data.groupby(cluster_by, sort=False)[weight_by].sum() + probs = cluster_weights / cluster_weights.sum() + return probs def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters): diff --git a/tests/inference/test_bootstrap_samples.py b/tests/inference/test_bootstrap_samples.py index 69c09221d..7b9283f6a 100644 --- a/tests/inference/test_bootstrap_samples.py +++ b/tests/inference/test_bootstrap_samples.py @@ -47,6 +47,23 @@ def test_get_bootstrap_indices_randomization_works_with_weights_and_clustering(d assert set(res[0]) != set(res[1]) +def test_get_bootstrap_indices_randomization_works_with_and_without_weights(data): + rng1 = get_rng(seed=12345) + rng2 = get_rng(seed=12345) + res1 = get_bootstrap_indices(data, n_draws=1, rng=rng1) + res2 = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng2) + assert not np.array_equal(res1, res2) + + +def test_get_boostrap_indices_randomization_works_with_extreme_case(data): + rng = get_rng(seed=12345) + weights = np.zeros(900) + weights[0] = 1.0 + data["weights"] = weights + res = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng) + assert len(np.unique(res)) == 1 + + def test_clustering_leaves_households_intact(data): rng = get_rng(seed=12345) indices = get_bootstrap_indices(data, cluster_by="hh", n_draws=1, rng=rng)[0] From 697177940a6ca14e1a51faf99f7105b9485b267e Mon Sep 17 00:00:00 2001 From: alanlujan91 Date: Fri, 11 Oct 2024 14:33:21 -0400 Subject: [PATCH 7/8] requested changes --- src/estimagic/bootstrap_samples.py | 12 +++-- tests/estimagic/test_bootstrap_ci.py | 29 +++++++++--- tests/estimagic/test_bootstrap_samples.py | 57 +++++++++++++++++++++-- 3 files changed, 84 insertions(+), 14 deletions(-) diff --git a/src/estimagic/bootstrap_samples.py b/src/estimagic/bootstrap_samples.py index bb631908a..7b5d66e29 100644 --- a/src/estimagic/bootstrap_samples.py +++ b/src/estimagic/bootstrap_samples.py @@ -26,7 +26,7 @@ def get_bootstrap_indices( """ n_obs = len(data) - probs = _get_probs_for_bootstrap_indices(data, weight_by, cluster_by) + probs = _calculate_bootstrap_indices_weights(data, weight_by, cluster_by) if cluster_by is None: bootstrap_indices = list( @@ -45,8 +45,12 @@ def get_bootstrap_indices( return bootstrap_indices -def _get_probs_for_bootstrap_indices(data, weight_by, cluster_by): - """Calculate probabilities for drawing bootstrap indices. +def _calculate_bootstrap_indices_weights(data, weight_by, cluster_by): + """Calculate weights for drawing bootstrap indices. + + If weights_by is not None and cluster_by is None, the weights are normalized to sum + to one. If weights_by and cluster_by are both not None, the weights are normalized + to sum to one within each cluster. Args: data (pandas.DataFrame): original dataset. @@ -54,7 +58,7 @@ def _get_probs_for_bootstrap_indices(data, weight_by, cluster_by): cluster_by (str): column name of the variable to cluster by. Returns: - list: numpy array with probabilities. + list: None or pd.Series of weights. """ if weight_by is None: diff --git a/tests/estimagic/test_bootstrap_ci.py b/tests/estimagic/test_bootstrap_ci.py index b86df2bf5..2a95733f0 100644 --- a/tests/estimagic/test_bootstrap_ci.py +++ b/tests/estimagic/test_bootstrap_ci.py @@ -3,10 +3,11 @@ import numpy as np import pandas as pd import pytest -from pybaum import tree_just_flatten - from estimagic.bootstrap_ci import calculate_ci, check_inputs +from estimagic.bootstrap_samples import get_bootstrap_indices from optimagic.parameters.tree_registry import get_registry +from optimagic.utilities import get_rng +from pybaum import tree_just_flatten def aaae(obj1, obj2, decimal=6): @@ -89,12 +90,26 @@ def test_check_inputs_data(): def test_check_inputs_weight_by(setup): - weights = "this is not a column name of df" - expected = "Input 'weight_by' must be None or a column name of 'data'." + expected_error_msg = "Input 'weight_by' must be None or a column name of 'data'." + with pytest.raises(ValueError, match=expected_error_msg): + check_inputs(data=setup["df"], weight_by="this is not a column name of df") - with pytest.raises(ValueError) as error: - check_inputs(data=setup["df"], weight_by=weights) - assert str(error.value) == expected + +def test_get_bootstrap_indices_heterogeneous_weights(): + data = pd.DataFrame( + {"id": [0, 1], "w_homogenous": [0.5, 0.5], "w_heterogenous": [0.1, 0.9]} + ) + + res_homogenous = get_bootstrap_indices( + data, weight_by="w_homogenous", n_draws=1_000, rng=get_rng(seed=0) + ) + res_heterogenous = get_bootstrap_indices( + data, weight_by="w_heterogenous", n_draws=1_000, rng=get_rng(seed=0) + ) + + # Given the weights, the first sample mean should be close to 0.5, + # while the second one should be close to 0.9 + assert np.mean(res_homogenous) < 0.75 < np.mean(res_heterogenous) def test_check_inputs_cluster_by(setup): diff --git a/tests/estimagic/test_bootstrap_samples.py b/tests/estimagic/test_bootstrap_samples.py index 4cfcbac06..2b7e9c545 100644 --- a/tests/estimagic/test_bootstrap_samples.py +++ b/tests/estimagic/test_bootstrap_samples.py @@ -1,16 +1,17 @@ import numpy as np import pandas as pd import pytest -from numpy.testing import assert_array_equal as aae -from pandas.testing import assert_frame_equal as afe - from estimagic.bootstrap_samples import ( + _calculate_bootstrap_indices_weights, _convert_cluster_ids_to_indices, _get_bootstrap_samples_from_indices, get_bootstrap_indices, get_bootstrap_samples, ) +from numpy.testing import assert_array_equal as aae from optimagic.utilities import get_rng +from pandas.testing import assert_frame_equal as afe +from pandas.testing import assert_series_equal as ase @pytest.fixture() @@ -95,3 +96,53 @@ def test_get_bootstrap_samples_from_indices(): def test_get_bootstrap_samples_runs(data): rng = get_rng(seed=12345) get_bootstrap_samples(data, n_draws=2, rng=rng) + + +@pytest.fixture +def sample_data(): + return pd.DataFrame({"weight": [1, 2, 3, 4], "cluster": ["A", "A", "B", "B"]}) + + +def test_no_weights_no_clusters(sample_data): + result = _calculate_bootstrap_indices_weights(sample_data, None, None) + assert result is None + + +def test_weights_no_clusters(sample_data): + result = _calculate_bootstrap_indices_weights(sample_data, "weight", None) + expected = pd.Series([0.1, 0.2, 0.3, 0.4], index=sample_data.index, name="weight") + pd.testing.assert_series_equal(result, expected) + + +def test_weights_and_clusters(sample_data): + result = _calculate_bootstrap_indices_weights(sample_data, "weight", "cluster") + expected = pd.Series( + [0.3, 0.7], index=pd.Index(["A", "B"], name="cluster"), name="weight" + ) + ase(result, expected) + + +def test_invalid_weight_column(): + data = pd.DataFrame({"x": [1, 2, 3]}) + with pytest.raises(KeyError): + _calculate_bootstrap_indices_weights(data, "weight", None) + + +def test_invalid_cluster_column(sample_data): + with pytest.raises(KeyError): + _calculate_bootstrap_indices_weights(sample_data, "weight", "invalid_cluster") + + +def test_empty_dataframe(): + empty_df = pd.DataFrame() + result = _calculate_bootstrap_indices_weights(empty_df, None, None) + assert result is None + + +def test_some_zero_weights_with_clusters(): + data = pd.DataFrame({"weight": [0, 1, 0, 2], "cluster": ["A", "A", "B", "B"]}) + result = _calculate_bootstrap_indices_weights(data, "weight", "cluster") + expected = pd.Series( + [1 / 3, 2 / 3], index=pd.Index(["A", "B"], name="cluster"), name="weight" + ) + ase(result, expected) From 1d45f278faf9ba35e96304c05a803ccd53bd59a9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Oct 2024 18:35:36 +0000 Subject: [PATCH 8/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/estimagic/test_bootstrap_ci.py | 3 ++- tests/estimagic/test_bootstrap_samples.py | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/estimagic/test_bootstrap_ci.py b/tests/estimagic/test_bootstrap_ci.py index 2a95733f0..64562438d 100644 --- a/tests/estimagic/test_bootstrap_ci.py +++ b/tests/estimagic/test_bootstrap_ci.py @@ -3,11 +3,12 @@ import numpy as np import pandas as pd import pytest +from pybaum import tree_just_flatten + from estimagic.bootstrap_ci import calculate_ci, check_inputs from estimagic.bootstrap_samples import get_bootstrap_indices from optimagic.parameters.tree_registry import get_registry from optimagic.utilities import get_rng -from pybaum import tree_just_flatten def aaae(obj1, obj2, decimal=6): diff --git a/tests/estimagic/test_bootstrap_samples.py b/tests/estimagic/test_bootstrap_samples.py index 2b7e9c545..4af80d502 100644 --- a/tests/estimagic/test_bootstrap_samples.py +++ b/tests/estimagic/test_bootstrap_samples.py @@ -1,6 +1,10 @@ import numpy as np import pandas as pd import pytest +from numpy.testing import assert_array_equal as aae +from pandas.testing import assert_frame_equal as afe +from pandas.testing import assert_series_equal as ase + from estimagic.bootstrap_samples import ( _calculate_bootstrap_indices_weights, _convert_cluster_ids_to_indices, @@ -8,10 +12,7 @@ get_bootstrap_indices, get_bootstrap_samples, ) -from numpy.testing import assert_array_equal as aae from optimagic.utilities import get_rng -from pandas.testing import assert_frame_equal as afe -from pandas.testing import assert_series_equal as ase @pytest.fixture()