update all functions to only take SE, change tests and documentation …

…accordingly
ihmeuw-msca · Aug 5, 2024 · c380e63 · c380e63
1 parent 46c65b8
commit c380e63
Show file tree

Hide file tree

Showing 5 changed files with 176 additions and 102 deletions.
diff --git a/docs/user_guide/percentage_change.rst → docs/user_guide/bivariate.rst b/docs/user_guide/percentage_change.rst → docs/user_guide/bivariate.rst
@@ -28,7 +28,18 @@ in various state counties. The data may look something like the following,
 
 and our goal is to find the percentage change in the prevalence of cancer with its appropriate SE.
 
-The first step is to import the required function from the distrx package.
+Since we have counts and distrx expects mean/standard error (SE), we must first convert the data
+appropriately. Counts data is common at IHME, so a function is provided to return sample mean and
+SE given incidence count and sample size. We can import it and save the necessary variables like so.
+
+.. code-block:: python
+
+    from distrx import process_counts
+    mu_x, sigma_x = process_counts(cases_1, sample_1)
+    mu_y, sigma_y = process_counts(cases_2, sample_2)
+
+
+Then, we can import the required function from the distrx package.
 
 .. code-block:: python
 
@@ -39,13 +50,12 @@ transform you would like to apply to your data. In this case, it is the followin
 
 .. code-block:: python
 
-    mu_tx, sigma_tx = transform_bivariate(c_x=df["cases_1"],
-                                          n_x=df["sample_1"],
-                                          c_y=df["cases_2"],
-                                          n_y=df["sample_2"],
+    mu_tx, sigma_tx = transform_bivariate(mu_x=mu_x,
+                                          sigma_x=sigma_x,
+                                          mu_y=mu_y,
+                                          sigma_y=sigma_y,
                                           transform="percentage_change")
 
 ``mu_tx`` and ``sigma_tx`` are simply the percentage change for each county and their corresponding
-standard errors, respectively. ``sigma_tx`` has already been scaled the appropriate sample size so
-we **should not** scale it additionally with some function of othe sample size to obtain a
-confidence interval.
+standard errors, respectively. If a CI for the mean is desired, simply use
+``mu_tx +/- Q * sigma_tx``.
diff --git a/docs/user_guide/simple_transformations.rst → docs/user_guide/univariate.rst b/docs/user_guide/simple_transformations.rst → docs/user_guide/univariate.rst
@@ -16,11 +16,11 @@ order Taylor expansion of the transformation function.
 Example: Log Transform
 ----------------------
 
-Suppose that we have some means and standard errors (SEs) of systolic blood pressure (SBP) from
+Suppose that we have some means and standard deviations (SDs) of systolic blood pressure (SBP) from
 several different samples. The data may look something like the following,
 
 .. csv-table::
-   :header: mean, se, n
+   :header: mean, SD, n
    :widths: 10, 10, 10
    :align: center
 
@@ -30,9 +30,18 @@ several different samples. The data may look something like the following,
    124, 15, 226
    134, 7, 509
 
-and our goal is to obtain the appropriate SEs for the data after applying the log transform.
+and our goal is to obtain the appropriate standard errors (SEs) for the mean after applying the log
+transform.
 
-The first step is to import the required function from the distrx package.
+Since we are interested in the transformed SEs and *not* the transformed SDs, we must provide the
+SEs to distrx. **If you already have SEs and are performing the same task, you should skip this
+step!**
+
+.. code-block:: python
+
+    df["SE"] = df["SD"] / df["n"]
+
+Now, import the appropriate function from distrx.
 
 .. code-block:: python
 
@@ -44,10 +53,9 @@ transform you would like to apply to your data. In this case, it is the followin
 .. code-block:: python
 
     mu_tx, sigma_tx = transform_univariate(mu=df["means"],
-                                           sigma=df["se"],
-                                           n=df["n"],
+                                           sigma=df["SE"],
                                            transform="log")
 
 ``mu_tx`` and ``sigma_tx`` are simply the means with the transformation function applied and their
-corresponding standard errors, respectively. ``sigma_tx`` has already been scaled by :math:`\sqrt{n}`
-so the we **should not** scale it by square root of the sample size to obtain a confidence interval.
+appropriately transformed standard errors, respectively. If a CI for the mean is desired, simply
+use ``mu_tx +/- Q * sigma_tx``.
diff --git a/simulations.ipynb b/simulations.ipynb
diff --git a/src/distrx/transforms.py b/src/distrx/transforms.py
@@ -65,57 +65,37 @@ def __init__(self, operation: str) -> None:
 
     def __call__(
         self,
-        c_x: npt.ArrayLike,
-        n_x: npt.ArrayLike,
-        c_y: npt.ArrayLike,
-        n_y: npt.ArrayLike,
-        counts: bool = True,
+        mu_x: npt.ArrayLike,
+        sigma_x: npt.ArrayLike,
+        mu_y: npt.ArrayLike,
+        sigma_y: npt.ArrayLike,
     ) -> Tuple[np.ndarray, np.ndarray]:
-        if counts:
-            mu_x, sigma2_x, mu_y, sigma2_y = self.process_counts(
-                c_x, n_x, c_y, n_y
-            )
-        else:
-            raise NotImplementedError
-
         match self.operation:
             case "sum":
-                return self.sum(mu_x, sigma2_x, n_x, mu_y, sigma2_y, n_y)
-                # return self.percentage_change_trans(c_x, n_x, c_y, n_y)
+                return self.sum(mu_x, sigma_x, mu_y, sigma_y)
             case "difference":
-                return self.diff(mu_x, sigma2_x, n_x, mu_y, sigma2_y, n_y)
+                return self.diff(mu_x, sigma_x, mu_y, sigma_y)
             case "product":
-                return self.prod(mu_x, sigma2_x, n_x, mu_y, sigma2_y, n_y)
+                return self.prod(mu_x, sigma_x, mu_y, sigma_y)
             case "quotient":
-                return self.quotient(mu_x, sigma2_x, n_x, mu_y, sigma2_y, n_y)
+                return self.quotient(mu_x, sigma_x, mu_y, sigma_y)
             case _:
                 raise ValueError(f"Invalid transform '{self.transform}'.")
 
-    def process_counts(self, c_x, n_x, c_y, n_y):
-        mu_x = c_x / n_x
-        mu_y = c_y / n_y
-        sigma2_x = (c_x * (1 - mu_x) ** 2 + (n_x - c_x) * mu_x**2) / (n_x - 1)
-        sigma2_y = (c_y * (1 - mu_y) ** 2 + (n_y - c_y) * mu_y**2) / (n_y - 1)
-
-        return mu_x, sigma2_x, mu_y, sigma2_y
-
-    def sum(self, mu_x, sigma2_x, n_x, mu_y, sigma2_y, n_y):
-        sigma2_tx = sigma2_x / n_x + sigma2_y / n_y
+    def sum(self, mu_x, sigma_x, mu_y, sigma_y):
+        sigma2_tx = sigma_x**2 + sigma_y**2
         return mu_x + mu_y, np.sqrt(sigma2_tx)
 
-    def diff(self, mu_x, sigma2_x, n_x, mu_y, sigma2_y, n_y):
-        sigma2_tx = sigma2_x / n_x + sigma2_y / n_y
+    def diff(self, mu_x, sigma_x, mu_y, sigma_y):
+        sigma2_tx = sigma_x**2 + sigma_y**2
         return mu_x - mu_y, np.sqrt(sigma2_tx)
 
-    def prod(self, mu_x, sigma2_x, n_x, mu_y, sigma2_y, n_y):
-        sigma2_tx = mu_y**2 * sigma2_x / n_x + mu_x**2 * sigma2_y / n_y
+    def prod(self, mu_x, sigma_x, mu_y, sigma_y):
+        sigma2_tx = mu_y**2 * sigma_x**2 + mu_x**2 * sigma_y**2
         return mu_x * mu_y, np.sqrt(sigma2_tx)
 
-    def quotient(self, mu_x, sigma2_x, n_x, mu_y, sigma2_y, n_y):
-        sigma2_tx = (sigma2_y / (n_y * mu_x**2)) + (
-            mu_y**2 * sigma2_x / (n_x * mu_x**4)
-        )
-
+    def quotient(self, mu_x, sigma_x, mu_y, sigma_y):
+        sigma2_tx = (sigma_y**2 / mu_x**2) + (mu_y**2 * sigma_x**2 / mu_x**4)
         return mu_y / mu_x, np.sqrt(sigma2_tx)
 
 
@@ -152,7 +132,7 @@ def transform_univariate(
 
     """
 
-    mu, sigma = np.array(mu), np.array(sigma)
+    mu, sigma = np.atleast_1d(np.array(mu)), np.atleast_1d(np.array(sigma))
     _check_input(mu, sigma)
     match method:
         case "delta":
@@ -163,10 +143,10 @@ def transform_univariate(
 
 
 def transform_bivariate(
-    c_x: npt.ArrayLike,
-    n_x: npt.ArrayLike,
-    c_y: npt.ArrayLike,
-    n_y: npt.ArrayLike,
+    mu_x: npt.ArrayLike,
+    sigma_x: npt.ArrayLike,
+    mu_y: npt.ArrayLike,
+    sigma_y: npt.ArrayLike,
     transform: str,
     method: str = "delta",
 ) -> Tuple[np.ndarray, np.ndarray]:
@@ -203,29 +183,37 @@ def transform_bivariate(
         _description_
     """
 
-    c_x, n_x, c_y, n_y = (
-        np.array(c_x),
-        np.array(n_x),
-        np.array(c_y),
-        np.array(n_y),
+    mu_x, sigma_x, mu_y, sigma_y = (
+        np.array(mu_x),
+        np.array(sigma_x),
+        np.array(mu_y),
+        np.array(sigma_y),
     )
     match method:
         case "delta":
             match transform:
                 case "percentage_change":
                     pest, sigma = FirstOrderBivariate("quotient")(
-                        c_x, n_x, c_y, n_y
+                        mu_x, sigma_x, mu_y, sigma_y
                     )
                     return pest - 1, sigma
                 case "ratio":
                     pest, sigma = FirstOrderBivariate("quotient")(
-                        c_x, n_x, c_y, n_y
+                        mu_x, sigma_x, mu_y, sigma_y
                     )
                     return pest, sigma
         case _:
             raise ValueError(f"Invalid method '{method}'.")
 
 
+### HELPER FUNCTIONS
+def process_counts(count, n):
+    mu = count / n
+    sigma2 = (count * (1 - mu) ** 2 + (n - count) * mu**2) / (n - 1)
+
+    return mu, np.sqrt(sigma2 / n)
+
+
 def _check_input(mu: npt.ArrayLike, sigma: npt.ArrayLike) -> None:
     """Run checks on input data.
 

diff --git a/tests/test_transforms.py b/tests/test_transforms.py
@@ -4,7 +4,11 @@
 import pandas as pd
 import pytest
 
-from distrx.transforms import transform_bivariate, transform_univariate
+from distrx.transforms import (
+    process_counts,
+    transform_bivariate,
+    transform_univariate,
+)
 
 UNIVARIATE_TRANSFORM_DICT = {
     "log": [np.log, lambda x: 1.0 / x],
@@ -90,23 +94,30 @@ def test_delta_result(transform):
     assert np.allclose(sigma_trans, sigma_ref)
 
 
-# TODO: DEPRECATE
-# def test_percentage_change():
-#     x = np.random.normal(1, 0.1, 1000)
-#     y = np.random.normal(1.1, 0.1, 1000)
-#     z = np.random.normal(1, 0.1, 1001)
-#     p, sigma = transform_percentage_change_experiment(x, y)
-#     assert 0 < p and p < 1
-#     assert 0 < sigma and sigma < 1
-#     with pytest.raises(ValueError):
-#         transform_percentage_change_experiment(x, z)
+# TODO: MAYBE DO NOT DEPRECATE
+def test_percentage_change():
+    x = np.random.normal(1, 0.1, 1000)
+    y = np.random.normal(1.1, 0.1, 1000)
+    # z = np.random.normal(1, 0.1, 1001)
+    p, sigma = transform_bivariate(
+        np.mean(x),
+        np.std(x) / np.sqrt(len(x)),
+        np.mean(y),
+        np.std(y) / np.sqrt(len(y)),
+        "percentage_change",
+    )
+    assert -1 < p and p < np.inf
+    assert 0 < sigma and sigma < 1
 
 
 def test_percentage_change_counts():
     x = np.random.choice([0, 1], size=1000, p=[0.1, 0.9])
     y = np.random.choice([0, 1], size=1100, p=[0.2, 0.8])
+
+    mu_x, sigma_x = process_counts((x == 1).sum(), len(x))
+    mu_y, sigma_y = process_counts((y == 1).sum(), len(y))
     mu, sigma = transform_bivariate(
-        (x == 1).sum(), len(x), (y == 1).sum(), len(y), "percentage_change"
+        mu_x, sigma_x, mu_y, sigma_y, "percentage_change"
     )
     assert -1 <= mu and mu < np.inf
     assert 0 < sigma and sigma < 1