optimagic-dev · timmens · Nov 20, 2024 · Mar 4, 2024 · Mar 4, 2024 · Mar 4, 2024
diff --git a/src/estimagic/estimation/msm_weighting.py b/src/estimagic/estimation/msm_weighting.py
@@ -24,8 +24,8 @@ def get_moments_cov(
         moment_kwargs (dict): Additional keyword arguments for calculate_moments.
         bootstrap_kwargs (dict): Additional keyword arguments that govern the
             bootstrapping. Allowed arguments are "n_draws", "seed", "n_cores",
-            "batch_evaluator", "cluster_by" and "error_handling". For details see the
-            bootstrap function.
+            "batch_evaluator", "weights", "cluster_by" and "error_handling".
+            For details see the bootstrap function.
 
     Returns:
         pandas.DataFrame or numpy.ndarray: The covariance matrix of the moment
@@ -39,6 +39,7 @@ def get_moments_cov(
         "n_draws",
         "seed",
         "batch_evaluator",
+        "weights",
         "cluster_by",
         "error_handling",
         "existing_result",

diff --git a/src/estimagic/inference/bootstrap.py b/src/estimagic/inference/bootstrap.py
@@ -24,6 +24,7 @@ def bootstrap(
     existing_result=None,
     outcome_kwargs=None,
     n_draws=1_000,
+    weights=None,
     cluster_by=None,
     seed=None,
     n_cores=1,
@@ -41,6 +42,7 @@ def bootstrap(
         n_draws (int): Number of bootstrap samples to draw.
             If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes
             is used.
+        weights (str): Column name of variable with weights or None.
         cluster_by (str): Column name of variable to cluster by or None.
         seed (Union[None, int, numpy.random.Generator]): If seed is None or int the
             numpy.random.default_rng is used seeded with seed. If seed is already a
@@ -59,7 +61,7 @@ def bootstrap(
 
     """
     if callable(outcome):
-        check_inputs(data=data, cluster_by=cluster_by)
+        check_inputs(data=data, weights=weights, cluster_by=cluster_by)
 
         if outcome_kwargs is not None:
             outcome = functools.partial(outcome, **outcome_kwargs)
@@ -82,6 +84,7 @@ def bootstrap(
         new_outcomes = get_bootstrap_outcomes(
             data=data,
             outcome=outcome,
+            weights=weights,
             cluster_by=cluster_by,
             rng=rng,
             n_draws=n_draws - n_existing,

diff --git a/src/estimagic/inference/bootstrap_helpers.py b/src/estimagic/inference/bootstrap_helpers.py
@@ -2,12 +2,18 @@
 
 
 def check_inputs(
-    data=None, cluster_by=None, ci_method="percentile", ci_level=0.95, skipdata=False
+    data=None,
+    weights=None,
+    cluster_by=None,
+    ci_method="percentile",
+    ci_level=0.95,
+    skipdata=False,
 ):
     """Check validity of inputs.
 
     Args:
         data (pd.DataFrame): Dataset.
+        weights (str): Column name of variable with weights.
         cluster_by (str): Column name of variable to cluster by.
         ci_method (str): Method of choice for computing confidence intervals.
             The default is "percentile".
@@ -21,6 +27,8 @@ def check_inputs(
     if not skipdata:
         if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series):
             raise TypeError("Data must be a pandas.DataFrame or pandas.Series.")
+        elif (weights is not None) and (weights not in data.columns.tolist()):
+            raise ValueError("Input 'weights' must be None or a column name of 'data'.")
         elif (cluster_by is not None) and (cluster_by not in data.columns.tolist()):
             raise ValueError(
                 "Input 'cluster_by' must be None or a column name of 'data'."

diff --git a/src/estimagic/inference/bootstrap_outcomes.py b/src/estimagic/inference/bootstrap_outcomes.py
@@ -6,6 +6,7 @@
 def get_bootstrap_outcomes(
     data,
     outcome,
+    weights=None,
     cluster_by=None,
     rng=None,
     n_draws=1000,
@@ -19,6 +20,7 @@ def get_bootstrap_outcomes(
         data (pandas.DataFrame): original dataset.
         outcome (callable): function of the dataset calculating statistic of interest.
             Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.).
+        weights (str): column name of the variable with weights.
         cluster_by (str): column name of the variable to cluster by.
         rng (numpy.random.Generator): A random number generator.
         n_draws (int): number of bootstrap draws.
@@ -34,12 +36,13 @@ def get_bootstrap_outcomes(
         estimates (list):  List of pytrees of estimated bootstrap outcomes.
 
     """
-    check_inputs(data=data, cluster_by=cluster_by)
+    check_inputs(data=data, weights=weights, cluster_by=cluster_by)
     batch_evaluator = process_batch_evaluator(batch_evaluator)
 
     indices = get_bootstrap_indices(
         data=data,
         rng=rng,
+        weights=weights,
         cluster_by=cluster_by,
         n_draws=n_draws,
     )

diff --git a/src/estimagic/inference/bootstrap_samples.py b/src/estimagic/inference/bootstrap_samples.py
@@ -2,7 +2,13 @@
 import pandas as pd
 
 
-def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
+def get_bootstrap_indices(
+    data,
+    rng,
+    weights=None,
+    cluster_by=None,
+    n_draws=1000,
+):
     """Draw positional indices for the construction of bootstrap samples.
 
     Storing the positional indices instead of the full bootstrap samples saves a lot
@@ -11,6 +17,7 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
     Args:
         data (pandas.DataFrame): original dataset.
         rng (numpy.random.Generator): A random number generator.
+        weights (str): column name of the variable with weights.
         cluster_by (str): column name of the variable to cluster by.
         n_draws (int): number of draws, only relevant if seeds is None.
 
@@ -19,17 +26,45 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
 
     """
     n_obs = len(data)
-    if cluster_by is None:
-        bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs)))
+
+    if weights is None:
+
+        if cluster_by is None:
+            bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs)))
+        else:
+            clusters = data[cluster_by].unique()
+            drawn_clusters = rng.choice(
+                clusters, size=(n_draws, len(clusters)), replace=True
+            )
+
+            bootstrap_indices = _convert_cluster_ids_to_indices(
+                data[cluster_by], drawn_clusters
+            )
+
     else:
-        clusters = data[cluster_by].unique()
-        drawn_clusters = rng.choice(
-            clusters, size=(n_draws, len(clusters)), replace=True
-        )
 
-        bootstrap_indices = _convert_cluster_ids_to_indices(
-            data[cluster_by], drawn_clusters
-        )
+        if cluster_by is None:
+            bootstrap_indices = list(
+                rng.choice(
+                    n_obs,
+                    size=(n_draws, n_obs),
+                    replace=True,
+                    p=data[weights] / data[weights].sum(),
+                )
+            )
+        else:
+            clusters = data.groupby(cluster_by)[weights].sum().reset_index()
+
+            drawn_clusters = rng.choice(
+                clusters[cluster_by],
+                size=(n_draws, len(clusters)),
+                replace=True,
+                p=clusters[weights] / clusters[weights].sum(),
+            )
+
+            bootstrap_indices = _convert_cluster_ids_to_indices(
+                data[cluster_by], drawn_clusters
+            )
 
     return bootstrap_indices
 
@@ -48,7 +83,13 @@ def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
     return bootstrap_indices
 
 
-def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
+def get_bootstrap_samples(
+    data,
+    rng,
+    weights=None,
+    cluster_by=None,
+    n_draws=1000,
+):
     """Draw bootstrap samples.
 
     If you have memory issues you should use get_bootstrap_indices instead and construct
@@ -57,6 +98,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
     Args:
         data (pandas.DataFrame): original dataset.
         rng (numpy.random.Generator): A random number generator.
+        weights (str): column name of the variable with weights.
         cluster_by (str): column name of the variable to cluster by.
         n_draws (int): number of draws, only relevant if seeds is None.
 
@@ -67,6 +109,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
     indices = get_bootstrap_indices(
         data=data,
         rng=rng,
+        weights=weights,
         cluster_by=cluster_by,
         n_draws=n_draws,
     )

diff --git a/tests/inference/test_bootstrap_ci.py b/tests/inference/test_bootstrap_ci.py
@@ -88,6 +88,15 @@ def test_check_inputs_data():
     assert str(error.value) == expected_msg
 
 
+def test_check_inputs_weights(setup):
+    weights = "this is not a column name of df"
+    expected = "Input 'weights' must be None or a column name of 'data'."
+
+    with pytest.raises(ValueError) as error:
+        check_inputs(data=setup["df"], weights=weights)
+    assert str(error.value) == expected
+
+
 def test_check_inputs_cluster_by(setup):
     cluster_by = "this is not a column name of df"
     expected_msg = "Input 'cluster_by' must be None or a column name of 'data'."

diff --git a/tests/inference/test_bootstrap_samples.py b/tests/inference/test_bootstrap_samples.py
@@ -17,6 +17,7 @@ def data():
     df = pd.DataFrame()
     df["id"] = np.arange(900)
     df["hh"] = [3, 1, 2, 0, 0, 2, 5, 4, 5] * 100
+    df["wts"] = np.ones(900)
     return df
 
 
@@ -32,6 +33,20 @@ def test_get_bootstrap_indices_radomization_works_with_clustering(data):
     assert set(res[0]) != set(res[1])
 
 
+def test_get_bootstrap_indices_randomization_works_with_weights(data):
+    rng = get_rng(seed=12345)
+    res = get_bootstrap_indices(data, weights="wts", n_draws=2, rng=rng)
+    assert set(res[0]) != set(res[1])
+
+
+def test_get_bootstrap_indices_randomization_works_with_weights_and_clustering(data):
+    rng = get_rng(seed=12345)
+    res = get_bootstrap_indices(
+        data, weights="wts", cluster_by="hh", n_draws=2, rng=rng
+    )
+    assert set(res[0]) != set(res[1])
+
+
 def test_clustering_leaves_households_intact(data):
     rng = get_rng(seed=12345)
     indices = get_bootstrap_indices(data, cluster_by="hh", n_draws=1, rng=rng)[0]