Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bootstrap weights #485

Merged
merged 15 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/estimagic/estimation/msm_weighting.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def get_moments_cov(
moment_kwargs (dict): Additional keyword arguments for calculate_moments.
bootstrap_kwargs (dict): Additional keyword arguments that govern the
bootstrapping. Allowed arguments are "n_draws", "seed", "n_cores",
"batch_evaluator", "cluster_by" and "error_handling". For details see the
bootstrap function.
"batch_evaluator", "weights", "cluster_by" and "error_handling".
For details see the bootstrap function.

Returns:
pandas.DataFrame or numpy.ndarray: The covariance matrix of the moment
Expand All @@ -39,6 +39,7 @@ def get_moments_cov(
"n_draws",
"seed",
"batch_evaluator",
"weights",
alanlujan91 marked this conversation as resolved.
Show resolved Hide resolved
"cluster_by",
"error_handling",
"existing_result",
Expand Down
5 changes: 4 additions & 1 deletion src/estimagic/inference/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def bootstrap(
existing_result=None,
outcome_kwargs=None,
n_draws=1_000,
weights=None,
cluster_by=None,
seed=None,
n_cores=1,
Expand All @@ -41,6 +42,7 @@ def bootstrap(
n_draws (int): Number of bootstrap samples to draw.
If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes
is used.
weights (str): Column name of variable with weights or None.
cluster_by (str): Column name of variable to cluster by or None.
seed (Union[None, int, numpy.random.Generator]): If seed is None or int the
numpy.random.default_rng is used seeded with seed. If seed is already a
Expand All @@ -59,7 +61,7 @@ def bootstrap(

"""
if callable(outcome):
check_inputs(data=data, cluster_by=cluster_by)
check_inputs(data=data, weights=weights, cluster_by=cluster_by)

if outcome_kwargs is not None:
outcome = functools.partial(outcome, **outcome_kwargs)
Expand All @@ -82,6 +84,7 @@ def bootstrap(
new_outcomes = get_bootstrap_outcomes(
data=data,
outcome=outcome,
weights=weights,
cluster_by=cluster_by,
rng=rng,
n_draws=n_draws - n_existing,
Expand Down
10 changes: 9 additions & 1 deletion src/estimagic/inference/bootstrap_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@


def check_inputs(
data=None, cluster_by=None, ci_method="percentile", ci_level=0.95, skipdata=False
data=None,
weights=None,
cluster_by=None,
ci_method="percentile",
ci_level=0.95,
skipdata=False,
):
"""Check validity of inputs.

Args:
data (pd.DataFrame): Dataset.
weights (str): Column name of variable with weights.
cluster_by (str): Column name of variable to cluster by.
ci_method (str): Method of choice for computing confidence intervals.
The default is "percentile".
Expand All @@ -21,6 +27,8 @@ def check_inputs(
if not skipdata:
if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series):
raise TypeError("Data must be a pandas.DataFrame or pandas.Series.")
elif (weights is not None) and (weights not in data.columns.tolist()):
raise ValueError("Input 'weights' must be None or a column name of 'data'.")
elif (cluster_by is not None) and (cluster_by not in data.columns.tolist()):
raise ValueError(
"Input 'cluster_by' must be None or a column name of 'data'."
Expand Down
5 changes: 4 additions & 1 deletion src/estimagic/inference/bootstrap_outcomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
def get_bootstrap_outcomes(
data,
outcome,
weights=None,
cluster_by=None,
rng=None,
n_draws=1000,
Expand All @@ -19,6 +20,7 @@ def get_bootstrap_outcomes(
data (pandas.DataFrame): original dataset.
outcome (callable): function of the dataset calculating statistic of interest.
Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.).
weights (str): column name of the variable with weights.
cluster_by (str): column name of the variable to cluster by.
rng (numpy.random.Generator): A random number generator.
n_draws (int): number of bootstrap draws.
Expand All @@ -34,12 +36,13 @@ def get_bootstrap_outcomes(
estimates (list): List of pytrees of estimated bootstrap outcomes.

"""
check_inputs(data=data, cluster_by=cluster_by)
check_inputs(data=data, weights=weights, cluster_by=cluster_by)
batch_evaluator = process_batch_evaluator(batch_evaluator)

indices = get_bootstrap_indices(
data=data,
rng=rng,
weights=weights,
cluster_by=cluster_by,
n_draws=n_draws,
)
Expand Down
65 changes: 54 additions & 11 deletions src/estimagic/inference/bootstrap_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@
import pandas as pd


def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
def get_bootstrap_indices(
data,
rng,
weights=None,
cluster_by=None,
n_draws=1000,
):
"""Draw positional indices for the construction of bootstrap samples.

Storing the positional indices instead of the full bootstrap samples saves a lot
Expand All @@ -11,6 +17,7 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
Args:
data (pandas.DataFrame): original dataset.
rng (numpy.random.Generator): A random number generator.
weights (str): column name of the variable with weights.
cluster_by (str): column name of the variable to cluster by.
n_draws (int): number of draws, only relevant if seeds is None.

Expand All @@ -19,17 +26,45 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):

"""
n_obs = len(data)
if cluster_by is None:
bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs)))

if weights is None:

if cluster_by is None:
bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs)))
else:
clusters = data[cluster_by].unique()
drawn_clusters = rng.choice(
clusters, size=(n_draws, len(clusters)), replace=True
)

bootstrap_indices = _convert_cluster_ids_to_indices(
data[cluster_by], drawn_clusters
)

else:
clusters = data[cluster_by].unique()
drawn_clusters = rng.choice(
clusters, size=(n_draws, len(clusters)), replace=True
)

bootstrap_indices = _convert_cluster_ids_to_indices(
data[cluster_by], drawn_clusters
)
if cluster_by is None:
bootstrap_indices = list(
rng.choice(
n_obs,
size=(n_draws, n_obs),
replace=True,
p=data[weights] / data[weights].sum(),
)
)
timmens marked this conversation as resolved.
Show resolved Hide resolved
else:
clusters = data.groupby(cluster_by)[weights].sum().reset_index()

drawn_clusters = rng.choice(
clusters[cluster_by],
size=(n_draws, len(clusters)),
replace=True,
p=clusters[weights] / clusters[weights].sum(),
)

bootstrap_indices = _convert_cluster_ids_to_indices(
data[cluster_by], drawn_clusters
)
timmens marked this conversation as resolved.
Show resolved Hide resolved

return bootstrap_indices

Expand All @@ -48,7 +83,13 @@ def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
return bootstrap_indices


def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
def get_bootstrap_samples(
data,
rng,
weights=None,
cluster_by=None,
n_draws=1000,
):
"""Draw bootstrap samples.

If you have memory issues you should use get_bootstrap_indices instead and construct
Expand All @@ -57,6 +98,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
Args:
data (pandas.DataFrame): original dataset.
rng (numpy.random.Generator): A random number generator.
weights (str): weights for the observations.
cluster_by (str): column name of the variable to cluster by.
n_draws (int): number of draws, only relevant if seeds is None.

Expand All @@ -67,6 +109,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
indices = get_bootstrap_indices(
data=data,
rng=rng,
weights=weights,
cluster_by=cluster_by,
n_draws=n_draws,
)
Expand Down
9 changes: 9 additions & 0 deletions tests/inference/test_bootstrap_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,15 @@ def test_check_inputs_data():
assert str(error.value) == expected_msg


def test_check_inputs_weights(setup):
weights = "this is not a column name of df"
expected = "Input 'weights' must be None or a column name of 'data'."

with pytest.raises(ValueError) as error:
check_inputs(data=setup["df"], weights=weights)
assert str(error.value) == expected


def test_check_inputs_cluster_by(setup):
cluster_by = "this is not a column name of df"
expected_msg = "Input 'cluster_by' must be None or a column name of 'data'."
Expand Down
15 changes: 15 additions & 0 deletions tests/inference/test_bootstrap_samples.py
timmens marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def data():
df = pd.DataFrame()
df["id"] = np.arange(900)
df["hh"] = [3, 1, 2, 0, 0, 2, 5, 4, 5] * 100
df["wts"] = np.ones(900)
return df


Expand All @@ -32,6 +33,20 @@ def test_get_bootstrap_indices_radomization_works_with_clustering(data):
assert set(res[0]) != set(res[1])


def test_get_bootstrap_indices_randomization_works_with_weights(data):
rng = get_rng(seed=12345)
res = get_bootstrap_indices(data, weights="wts", n_draws=2, rng=rng)
assert set(res[0]) != set(res[1])


def test_get_bootstrap_indices_randomization_works_with_weights_and_clustering(data):
rng = get_rng(seed=12345)
res = get_bootstrap_indices(
data, weights="wts", cluster_by="hh", n_draws=2, rng=rng
)
assert set(res[0]) != set(res[1])


def test_clustering_leaves_households_intact(data):
rng = get_rng(seed=12345)
indices = get_bootstrap_indices(data, cluster_by="hh", n_draws=1, rng=rng)[0]
Expand Down
Loading