From 4350f95cd008daed53ba9c01560c67bf83c37158 Mon Sep 17 00:00:00 2001 From: Ian Czekala Date: Fri, 29 Dec 2023 13:00:32 +0000 Subject: [PATCH] consolidated loss function math into API. --- docs/changelog.md | 1 + docs/conf.py | 11 +- docs/index.md | 2 +- docs/rml_intro.md | 279 ++++----------------------------------------- pyproject.toml | 2 +- src/mpol/losses.py | 50 ++------ 6 files changed, 43 insertions(+), 302 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index b722c634..14ac96d7 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -3,6 +3,7 @@ # Changelog ## v0.3.0 +- Edited documentation to be more concise. - Added the {meth}`mpol.losses.neg_log_likelihood_avg` method to be used in point-estimate or optimization situations where data amplitudes or weights may be adjusted as part of the optimization (such as via self-calibration). - Renamed `mpol.losses.nll` -> {meth}`mpol.losses.r_chi_squared` and `mpol.losses.nll_gridded` -> {meth}`mpol.losses.r_chi_squared_gridded` because that is what those routines were previously calculating (see the {ref}`api-reference-label` for more details). ([#237](https://github.com/MPoL-dev/MPoL/issues/237)). Tutorials have also been updated to reflect the change. - Fixed implementation and docstring of {meth}`mpol.losses.log_likelihood` ([#237](https://github.com/MPoL-dev/MPoL/issues/237)). diff --git a/docs/conf.py b/docs/conf.py index 538deb47..b9d2f9d7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -45,7 +45,10 @@ autodoc_mock_imports = ["torch", "torchvision"] autodoc_member_order = "bysource" -autodoc_default_options = {"members": None} +# https://github.com/sphinx-doc/sphinx/issues/9709 +# bug that if we set this here, we can't list individual members in the +# actual API doc +# autodoc_default_options = {"members": None} # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -94,5 +97,9 @@ nb_execution_raise_on_error = True # .ipynb are produced using Makefile on own terms, # # both .md and executed .ipynb are kept in git repo -nb_execution_excludepatterns = ["large-tutorials/*.md", "large-tutorials/*.ipynb", "**.ipynb_checkpoints"] +nb_execution_excludepatterns = [ + "large-tutorials/*.md", + "large-tutorials/*.ipynb", + "**.ipynb_checkpoints", +] myst_heading_anchors = 3 diff --git a/docs/index.md b/docs/index.md index 5cff3c55..5ff20c4e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -20,6 +20,7 @@ If you'd like to help build the MPoL package, please check out the {ref}`develop :maxdepth: 2 rml_intro.md +introduction.md installation.md ci-tutorials/PyTorch ci-tutorials/gridder @@ -38,7 +39,6 @@ large-tutorials/pyro :caption: API :maxdepth: 2 -api/index api/coordinates api/datasets api/fourier diff --git a/docs/rml_intro.md b/docs/rml_intro.md index 633223ed..2871e0d7 100644 --- a/docs/rml_intro.md +++ b/docs/rml_intro.md @@ -4,266 +4,7 @@ This document is an attempt to provide a whirlwind introduction to what Regularized Maximum Likelihood (RML) imaging is, and why you might want to use this MPoL package to perform it with your interferometric dataset. Of course, the field is rich, varied, and this short introduction couldn't possibly do justice to cover the topic in depth. We recommend that you check out many of the links and suggestions in this document for further reading and understanding. -## Introduction to Likelihood functions -```{margin} Intro reading -If you are new to Bayesian inference or its notation, we recommend reading [Eadie et al. 
2023](https://ui.adsabs.harvard.edu/abs/2023arXiv230204703E/abstract) and [Hogg 2012](https://ui.adsabs.harvard.edu/abs/2012arXiv1205.4446H/abstract). -``` - -Typically, when astronomers fit a model to some dataset, such as a line $y = m x + b$ to a collection of $\boldsymbol{X} = \{x_1, x_2, \ldots\, x_N\}$ and $\boldsymbol{Y} = \{y_1, y_2, \ldots\, y_N\}$ points, we require a likelihood function. Simply put, the likelihood function specifies the probability of the data, given a model, and encapsulates our assumptions about the data and noise generating processes. - -For most real-world datasets, we don't measure the "true" $y$ value of the line (i.e., $mx + b$), but rather make a measurement which has been partially corrupted by some "noise." In that case, we say that each $y_i$ data point is actually generated by - -$$ -y_i = m x_i + b + \epsilon -$$ - -where $\epsilon$ is a noise realization from a standard [normal distribution](https://en.wikipedia.org/wiki/Normal_distribution) with standard deviation $\sigma$, i.e., - -$$ -\epsilon \sim \mathcal{N}(0, \sigma). -$$ - -This information about the data and noise generating process means that we can write down a likelihood function to calculate the probability of the data, given a set of model parameters. The likelihood function is $p(\boldsymbol{Y} |\,\boldsymbol{\theta})$. Sometimes it is written as $\mathcal{L}(\boldsymbol{\theta}; \boldsymbol{Y})$ to emphasize that it is in fact a *function* (note, not probability) of the model parameters, and that it takes the data as an argument. Frequently, when employed in computation, we'll use the logarithm of the likelihood function, or "log-likelihood," $\ln \mathcal{L}$ to avoid numerical under/overflow issues. Let's call $\boldsymbol{\theta} = \{m, b\}$ and $M(x_i |\, \boldsymbol{\theta}) = m x_i + b$ to emphasize that we can use the model $M$ to calculate a response at input $x_i$ conditional on model parameters $\boldsymbol{\theta}$. The likelihood function for this line problem is - -$$ -\mathcal{L}(\boldsymbol{\theta}; \boldsymbol{Y}) = \prod_i^N \frac{1}{\sqrt{2 \pi} \sigma} \exp \left [ - \frac{(y_i - M(x_i |\,\boldsymbol{\theta}))^2}{2 \sigma^2}\right ] -$$ - -The logarithm of the likelihood function is - -$$ -\ln \mathcal{L}(\boldsymbol{\theta}; \boldsymbol{Y}) = -N \ln(\sqrt{2 \pi} \sigma) - \frac{1}{2} \sum_i^N \frac{(y_i - M(x_i |\,\boldsymbol{\theta}))^2}{\sigma^2} -$$ - -You may recognize the right hand term looks similar to the $\chi^2$ metric, - -$$ -\chi^2(\boldsymbol{\theta}; \boldsymbol{Y}) = \sum_i^N \frac{(y_i - M(x_i |\,\boldsymbol{\theta}))^2}{\sigma^2} -$$ - -Assuming that the uncertainty ($\sigma$) on each data point is known (and remains constant), the first term in the log likelihood expression remains constant, and we have - -$$ -\ln \mathcal{L}(\boldsymbol{\theta}; \boldsymbol{Y}) = - \frac{1}{2} \chi^2 (\boldsymbol{\theta}; \boldsymbol{Y}) + C -$$ - -where $C$ is a constant with respect to the model parameters. It is common to use shorthand to say that "the likelihood function is $\chi^2$" to indicate situations where the data uncertainties are Gaussian. Very often, we (or others) are interested in the parameter values $\boldsymbol{\theta}_\mathrm{MLE}$ which maximize the likelihood function. Unsurprisingly, these parameters are called the *maximum likelihood estimate* (or MLE), and usually they represent something like a "best-fit" model. [^mle-solution] - -When it comes time to do parameter inference, however, it's important to keep in mind - -1. 
the simplifying assumptions we made about the noise uncertainties being constant with respect to the model parameters. If we were to "fit for the noise" in a hierarchical model, for example, we would need to use the full form of the log-likelihood function, including the $-N \ln \left (\sqrt{2 \pi} \sigma \right)$ term. -2. that in order to maximize the likelihood function we want to *minimize* the $\chi^2$ function. -3. that constants of proportionality (e.g., the $1/2$ in front of the $\chi^2$) can matter when combining likelihood functions with prior distributions for Bayesian parameter inference. We'll have more to say on this in a second when we talk about regularizers and their strengths. - -To be specific, $\chi^2$ is not the end of the story when we'd like to perform Bayesian parameter inference. To do so, we need the posterior probability distribution of the model parameters given the dataset, $p(\boldsymbol{\theta}|\,\boldsymbol{Y})$. We can calculate this quantity using Bayes rule - -$$ -p(\boldsymbol{\theta}|\,\boldsymbol{Y}) = \frac{p(\boldsymbol{Y}|\,\boldsymbol{\theta})\, p(\boldsymbol{\theta})}{p(\boldsymbol{Y})} -$$ - -The denominator is a constant so long as the model specification remains the same, leaving - -$$ -p(\boldsymbol{\theta}|\,\boldsymbol{Y}) \propto p(\boldsymbol{Y}|\,\boldsymbol{\theta})\, p(\boldsymbol{\theta}). -$$ - -So we need a prior probability distribution $p(\boldsymbol{\theta})$ in addition to the likelihood function to calculate the posterior probability distribution of the model parameters. Analogous to the maximum likelihood estimate, there is also the *maximum a posteriori* estimate (or MAP), which includes the effect of the prior probability distribution. - -:::{seealso} -Useful resources on Bayesian inference include - -- [Data Analysis: A Bayesian Tutorial](https://www.amazon.com/Data-Analysis-Bayesian-Devinderjit-Sivia/dp/0198568320) by Sivia and Skilling -- [Data analysis recipes: Fitting a model to data](https://ui.adsabs.harvard.edu/abs/2010arXiv1008.4686H/abstract) by Hogg, Bovy, and Lang -- [Data analysis recipes: Probability calculus for inference](https://ui.adsabs.harvard.edu/abs/2012arXiv1205.4446H/abstract) by Hogg. Both this and the previous Hogg et al. document contain useful descriptions of what a forward or "generative" model is. -::: - -## Data in the Fourier domain - -MPoL is a package to make images from interferometric data. Currently, we are most focused on modeling datasets from radio interferometers like the [Atacama Large Millimeter Array](https://almascience.nrao.edu/) (ALMA), so the following introduction will have a radio astronomy flavor to it. But the concept of forward modeling interferometric data is quite general, and with a few additions the MPoL package could be applied to imaging problems involving Fourier data from optical and infrared telescopes (if this describes your dataset, please get in touch). - -As astronomers, we are most often interested in characterizing what an astrophysical source looks like. In other words, how its surface brightness $I$ changes as a function of sky position. However, intereferometers acquire samples of data in the Fourier domain, also called the visibility domain. The visibility domain is the Fourier transform of the image sky brightness - -$$ -{\cal V}(u,v) = \iint I(l,m) \exp \left \{- 2 \pi i (ul + vm) \right \} \, \mathrm{d}l\,\mathrm{d}m, -$$ - -where $l$ and $m$ are direction cosines (roughly equivalent to R.A. 
and Dec) which parameterize the surface brightness distribution of the image $I(l,m)$, and $u$ and $v$ are spatial frequencies which parameterize the visibility function $\cal{V}(u,v)$. For more information on the meaning of these units, see {ref}`units-conventions-label`. - -The visibility function is complex-valued, and each measurement of it (denoted by $V_i$) is made in the presence of noise - -$$ -V_i = \mathcal{V}(u_i, v_i) + \epsilon. -$$ - -Here $\epsilon$ represents a noise realization from a [complex normal](https://en.wikipedia.org/wiki/Complex_normal_distribution) (Gaussian) distribution. Thankfully, most interferometric datasets do not exhibit significant covariance between the real and imaginary noise components, so we could equivalently say that the real and imaginary components of the noise are separately generated by draws from normal distributions characterized by standard deviation $\sigma$ - -$$ -\epsilon_\Re \sim \mathcal{N}(0, \sigma) \\ -\epsilon_\Im \sim \mathcal{N}(0, \sigma) -$$ - -and - -$$ -\epsilon = \epsilon_\Re + i \epsilon_\Im -$$ - -Radio interferometers will commonly represent the uncertainty on each visibility measurement by a "weight" $w_i$, where - -$$ -w_i = \frac{1}{\sigma_i^2} -$$ - -A full interferometric dataset is a collection of visibility measurements, which we represent by - -$$ -\boldsymbol{V} = \{V_1, V_2, \ldots \}_{i=1}^N -$$ - -For reference, a typical ALMA dataset might contain a half-million individual visibility samples, acquired over a range of spatial frequencies. - -:::{seealso} -A full introduction to Fourier transforms, radio astronomy, and interferometry is beyond the scope of this introduction. However, here are some additional resources that we recommend checking out. - -- [Essential radio astronomy](https://www.cv.nrao.edu/~sransom/web/xxx.html) textbook by James Condon and Scott Ransom, and in particular, Chapter 3.7 on Radio Interferometry. -- NRAO's [17th Synthesis Imaging Workshop](http://www.cvent.com/events/virtual-17th-synthesis-imaging-workshop/agenda-0d59eb6cd1474978bce811194b2ff961.aspx) recorded lectures and slides available -- [Interferometry and Synthesis in Radio Astronomy](https://ui.adsabs.harvard.edu/abs/2017isra.book.....T/abstract) by Thompson, Moran, and Swenson. An excellent and comprehensive reference on all things interferometry. -- NJIT's online course materials for [Radio Astronomy](https://web.njit.edu/~gary/728/) -::: - -## Likelihood functions for Fourier data - -Now that we've introduced likelihood functions in general and the specifics of Fourier data, let's talk about likelihood functions for inference with Fourier data. As before, our statement about the data generating process - -$$ -V_i = \mathcal{V}(u_i, v_i) + \epsilon -$$ - -leads us to the formulation of the likelihood function. - -First, let's assume we have some model that we'd like to fit to our dataset. To be a forward model, it should be able to predict the value of the visibility function for any spatial frequency, i.e., we need to be able to calculate $\mathcal{V}(u, v) = M_\mathcal{V}(u, v |, \boldsymbol{\theta})$. - -It's difficult to reason about all but the simplest models directly in the Fourier plane, so usually models are constructed in the image plane $M_I(l,m |,\boldsymbol{\theta})$ and then Fourier transformed (either analytically, or via the FFT) to construct visibility models $M_\mathcal{V}(u, v |, \boldsymbol{\theta}) \leftrightharpoons M_I(l,m |,\boldsymbol{\theta})$. 
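To make the forward-modeling step concrete, here is a minimal numpy sketch (illustrative only, not MPoL code) of evaluating a toy image-plane model on an $N \times N$ grid, FFTing it, and reading off model visibilities at a few $(u,v)$ points by nearest-neighbor lookup. The Gaussian model, cell size, and sampling scheme are placeholder assumptions for the example.

```python
import numpy as np

# Toy image-plane model M_I(l, m | theta): an elliptical Gaussian on an N x N
# grid of direction cosines. All numbers here are placeholder assumptions.
N = 256
cell = 0.05 * np.pi / (180 * 3600)  # 0.05 arcsec cell size, in radians
l_axis = (np.arange(N) - N // 2) * cell
ll, mm = np.meshgrid(l_axis, l_axis, indexing="xy")

def image_model(theta):
    amp, sigma_l, sigma_m = theta
    return amp * np.exp(-0.5 * ((ll / sigma_l) ** 2 + (mm / sigma_m) ** 2))

# FFT the image to the visibility plane, M_V(u, v | theta) <=> M_I(l, m | theta).
# The cell**2 factor approximates the continuous Fourier integral.
image = image_model((1.0, 20 * cell, 10 * cell))
vis_grid = np.fft.fftshift(np.fft.fft2(np.fft.ifftshift(image))) * cell**2
uv_axis = np.fft.fftshift(np.fft.fftfreq(N, d=cell))  # spatial frequencies [lambda]

# Predict model visibilities at the (u, v) locations of the data by taking the
# nearest gridded cell (real codes interpolate or use a non-uniform FFT).
u_data = np.array([4.0e4, -2.5e4, 1.0e4])  # assumed baselines, in units of lambda
v_data = np.array([1.5e4, 3.0e4, -4.5e4])
iu = np.abs(uv_axis[None, :] - u_data[:, None]).argmin(axis=1)
iv = np.abs(uv_axis[None, :] - v_data[:, None]).argmin(axis=1)
model_vis = vis_grid[iv, iu]  # complex-valued M_V(u_i, v_i | theta)
```

The details (units, sign conventions, gridding) are handled for you by MPoL's Fourier and gridding modules; the point is simply that a model image plus a Fourier transform yields model visibilities that can be compared directly to the data.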
For example, these models could be channel maps of carbon monoxide emission from a rotating protoplanetary disk (as in [Czekala et al. 2015](https://ui.adsabs.harvard.edu/abs/2015ApJ...806..154C/abstract), where $\boldsymbol{\theta}$ contains parameters setting the structure of the disk), or rings of continuum emission from a protoplanetary disk (as in [Guzmán et al. 2018](https://ui.adsabs.harvard.edu/abs/2018ApJ...869L..48G/abstract), where $\boldsymbol{\theta}$ contains parameters setting the sizes and locations of the rings). - -Following the discussion about how the complex noise realization $\epsilon$ is generated, this leads to a log likelihood function - -$$ -\ln \mathcal{L}(\boldsymbol{\theta}; \boldsymbol{V}) = - \frac{1}{2} \chi^2(\boldsymbol{\theta}; \boldsymbol{V}) + C -$$ - -Because the data and model are complex-valued, $\chi^2$ is evaluated as - -$$ -\chi^2(\boldsymbol{\theta}; \boldsymbol{V}) = \sum_i^N \frac{|V_i - M_\mathcal{V}(u_i, v_i |\,\boldsymbol{\theta})|^2}{\sigma_i^2} -$$ - -where $| |$ denotes the modulus squared. Equivalently, the calculation can be broken up into sums over the real ($\Re$) and imaginary ($\Im$) components of the visibility data and model - -$$ -\chi^2(\boldsymbol{\theta}; \boldsymbol{V}) = \sum_i^N \frac{(V_{\Re,i} - M_\mathcal{V,\Re}(u_i, v_i |\,\boldsymbol{\theta}))^2}{\sigma_i^2} + \sum_i^N \frac{(V_{\Im,i} - M_\mathcal{V,\Im}(u_i, v_i |\,\boldsymbol{\theta}))^2}{\sigma_i^2} -$$ - -Now with the likelihood function specified, we can add prior probability distributions $p(\boldsymbol{\theta})$, and calculate and explore the posterior probability distribution of the model parameters using algorithms like Markov Chain Monte Carlo. In this type of Bayesian inference, we're usually using forward models constructed with a small to medium number of parameters (e.g., 10 - 30), like in the protoplanetary disk examples of [Czekala et al. 2015](https://ui.adsabs.harvard.edu/abs/2015ApJ...806..154C/abstract) or [Guzmán et al. 2018](https://ui.adsabs.harvard.edu/abs/2018ApJ...869L..48G/abstract). - -:::{note} -Even though we would say that "traditional" Bayesian parameter inference is not the main focus of RML algorithms, it is entirely [possible with the MPoL package](https://github.com/MPoL-dev/MPoL/issues/33). In fact, the gradient-based nature of the MPoL package (discussed in a moment) can make posterior exploration very fast using efficient Hamiltonian Monte Carlo samplers. -::: - -:::{note} -The $\chi^2$ likelihood function as formulated above is appropriate for visibilities with minimal spectral covariance. When modeling spectral line datasets, in particular those that have not been channel-averaged and retain the spectral response function from their Hann windowing, this covariance must be taken into account in the likelihood function. More information on how to derive these covariance matrices is provided in the appendices of [Loomis et al. 2018](https://ui.adsabs.harvard.edu/abs/2018AJ....155..182L/abstract) and will be detailed in forthcoming tutorials. -::: - -## RML images as non-parametric models - -Now that we've introduced what it means to forward-model a dataset and how to calculate a likelihood function, let's talk about non-parametric models. - -Say that our $\boldsymbol{X} = \{x_1, x_2, \ldots\, x_N\}$ and $\boldsymbol{Y} = \{y_1, y_2, \ldots\, y_N\}$ dataset looked a bit more structured than a simple $y = mx + b$ relationship. 
We could expand the model by adding more parameters, for example, by adding quadratic and cubic terms, e.g., $y = a_0 + a_1 x + a_2 x^2 + a_3 x^3$. This would be a reasonable approach, especially if the parameters $a_2$, $a_3$, etc... had physical meaning. But if all that we're interested in is modeling the relationship between $y = f(x)$ in order to make predictions, we could just as easily use a [non-parametric model](https://www.section.io/engineering-education/parametric-vs-nonparametric/), like a [spline]() or a [Gaussian process](https://distill.pub/2019/visual-exploration-gaussian-processes/). - -With RML imaging, we're trying to come up with a model that will fit the dataset. But rather than using a parametric model like a protoplanetary disk structure model or a series of Gaussian rings, we're using a non-parametric model of *the image itself*. This could be as simple as parameterizing the image using the intensity values of the pixels themselves, i.e., - -$$ -\boldsymbol{\theta} = \{I_1, I_2, \ldots, I_{N^2} \} -$$ - -assuming we have an $N \times N$ image. - -:::{note} -RML imaging is different from CLEAN imaging, which operates as a deconvolution procedure in the image plane. At least at sub-mm and radio wavelengths, CLEAN is by far the dominant algorithm used to synthesize images from interferometric data. Therefore, if you're interested in RML imaging, it's worth first understanding the basics of the CLEAN algorithm. - -Here are some useful resources on the CLEAN algorithm. - -- [Interferometry and Synthesis in Radio Astronomy](https://ui.adsabs.harvard.edu/abs/2017isra.book.....T/abstract) Chapter 11.1 -- [CASA documentation on tclean](https://casa.nrao.edu/casadocs-devel/stable/imaging/synthesis-imaging) -- David Wilner's lecture on [Imaging and Deconvolution in Radio Astronomy](https://www.youtube.com/watch?v=mRUZ9eckHZg) -- For a discussion on using both CLEAN and RML techniques to robustly interpret kinematic data of protoplanetary disks, see Section 3 of [Visualizing the Kinematics of Planet Formation](https://ui.adsabs.harvard.edu/abs/2020arXiv200904345D/abstract) by The Disk Dynamics Collaboration -::: - -A flexible image model for RML imaging is mostly analogous to using a spline or Gaussian process to fit a series of $\boldsymbol{X} = \{x_1, x_2, \ldots\, x_N\}$ and $\boldsymbol{Y} = \{y_1, y_2, \ldots\, y_N\}$ points---the model will nearly always have enough flexibility to capture the structure that exists in the dataset. The most straightforward formulation of a non-parametric image model is the pixel basis set, but we could also use more sophisticated basis sets like a set of wavelet coefficients, or even more exotic basis sets constructed from trained neural networks. These may have some serious advantages when it comes to the "regularizing" part of "regularized maximum likelihood" imaging. But first, let's talk about the "maximum likelihood" part. - -Given some image parameterization (e.g., a pixel basis set of $N \times N$ pixels, with each pixel `cell_size` in width), we would like to find the maximum likelihood image $\boldsymbol{\theta}_\mathrm{MLE}$. Fortunately, because the Fourier transform is a linear operation, we can analytically calculate the maximum solution (the same way we might find the best-fit slope and intercept for the line example). This maximum likelihood solution is called (in the radio astronomy world) the dirty image, and its associated point spread function is called the dirty beam. 
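As a schematic illustration of this point (plain numpy, not how MPoL or CASA actually grid and weight real data): place the measured visibilities onto a Fourier grid, leave every unsampled cell at zero, and inverse-FFT. The result is the dirty image, and the transform of the sampling pattern alone is the dirty beam.

```python
import numpy as np

# Schematic only: an N x N grid of visibility cells, with ~5% of cells "sampled"
# at random and filled with toy complex data. Real data would also include the
# Hermitian-conjugate visibilities, which make the image strictly real; here we
# simply take the real part of this toy transform.
N = 128
rng = np.random.default_rng(42)
sampled = rng.random((N, N)) < 0.05
vis_grid = np.zeros((N, N), dtype=complex)  # unsampled cells stay at zero
vis_grid[sampled] = rng.normal(size=sampled.sum()) + 1j * rng.normal(size=sampled.sum())

# Because the (inverse) Fourier transform is linear, the zero-filled grid maps
# directly to the dirty image; the sampling pattern alone maps to the dirty beam.
dirty_image = np.fft.fftshift(np.fft.ifft2(np.fft.ifftshift(vis_grid))).real
dirty_beam = np.fft.fftshift(np.fft.ifft2(np.fft.ifftshift(sampled.astype(float)))).real
```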
- -In the construction of the dirty image, all unsampled spatial frequencies are set to zero power. This means that the dirty image will only contain spatial frequencies about which we have at least some data. This assumption, however, rarely translates into good image fidelity, especially if there are many unsampled spatial frequencies which carry significant power. It's also important to recognize that dirty image is only *one* out of a set of *many* images that could maximize the likelihood function. From the perspective of the likelihood calculation, we could modify the unsampled spatial frequencies of the dirty image to whatever power we might like, and, because they are *unsampled*, the value of the likelihood calculation won't change, i.e., it will still remain maximal. - -When synthesis imaging is described as an "ill-posed inverse problem," this is what is meant. There is a (potentially infinite) range of images that could *exactly* fit the dataset, and without additional information we have no way of discriminating which is best. As you might suspect, this is now where the "regularization" part of "regularized maximum likelihood" imaging comes in. - -There are a number of different ways to talk about regularization. If one wants to be Bayesian about it, one would talk about specifying *priors*, i.e., we introduce terms like $p(\boldsymbol{\theta})$ such that we might calculate the maximum a posteriori (MAP) image $\boldsymbol{\theta}_\mathrm{MAP}$ using the posterior probability distribution - -$$ -p(\boldsymbol{\theta} |\, \boldsymbol{V}) \propto \mathcal{L}(\boldsymbol{\theta}; \boldsymbol{V}) \, p(\boldsymbol{\theta}). -$$ - -For computational reasons related to numerical over/underflow, we would most likely use the logarithm of the posterior probability distribution - -$$ -\ln p(\boldsymbol{\theta} |\, \boldsymbol{V}) \propto \ln \mathcal{L}( \boldsymbol{\theta}; \boldsymbol{V}) + \ln p(\boldsymbol{\theta}). -$$ - -One could accomplish the same goal without necessarily invoking the Bayesian language by simply talking about which parameters $\boldsymbol{\theta}$ optimize some objective function. - -We'll adopt the perspective that we have some objective "cost" function that we'd like to *minimize* to obtain the optimal parameters $\hat{\boldsymbol{\theta}}$. The machine learning community calls this a "loss" function $L(\boldsymbol{\theta})$, and so we'll borrow that terminology here. For an unregularized fit, an acceptable loss function is just the negative log likelihood ("nll") term, - -$$ -L(\boldsymbol{\theta}) = L_\mathrm{nll}(\boldsymbol{\theta}) = - \ln \mathcal{L}(\boldsymbol{\theta}; \boldsymbol{V}) = \frac{1}{2} \chi^2(\boldsymbol{\theta}; \boldsymbol{V}) -$$ - -If we're only interested in $\hat{\boldsymbol{\theta}}$, it doesn't matter whether we include the $1/2$ prefactor in front of $\chi^2$, the loss function will still have the same optimum. However, when it comes time to add additional terms to the loss function, these prefactors matter in controlling the relative strength of each term. - -When phrased in the terminology of function optimization, additional terms can be described as regularization penalties. To be specific, let's add a term that regularizes the sparsity of an image. - -$$ -L_\mathrm{sparsity}(\boldsymbol{\theta}) = \sum_i |I_i| -$$ - -This prior is described in more detail in the {func}`API documentation `. In short, the L1 norm promotes sparse solutions (solutions where many pixel values are zero). 
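In code this regularizer is just an L1 norm over the pixel values. Here is a minimal PyTorch sketch (the image tensor and the value of $\lambda_\mathrm{sparsity}$ are placeholders; the API documentation referenced above describes MPoL's own implementation):

```python
import torch

# Placeholder image cube: the pixel intensities are the model parameters theta.
img = torch.rand(1, 256, 256, requires_grad=True)

def sparsity_loss(image: torch.Tensor) -> torch.Tensor:
    """L_sparsity(theta) = sum_i |I_i|, the L1 norm of the pixel values."""
    return torch.sum(torch.abs(image))

lam_sparsity = 1e-4                       # regularization strength; problem-dependent
loss = lam_sparsity * sparsity_loss(img)  # one term of the total loss L(theta)
loss.backward()                           # autograd supplies gradients for the optimizer
```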
The combination of these two terms leads to a new loss function - -$$ -L(\boldsymbol{\theta}) = L_\mathrm{nll}(\boldsymbol{\theta}) + \lambda_\mathrm{sparsity} L_\mathrm{sparsity}(\boldsymbol{\theta}) -$$ - -Where we control the relative "strength" of the regularization via the scalar prefactor $\lambda_\mathrm{sparsity}$. If $\lambda_\mathrm{sparsity} = 0$, no sparsity regularization is applied. Non-zero values of $\lambda_\mathrm{sparsity}$ will add in regularization that penalizes non-sparse $\boldsymbol{\theta}$ values. How strong this penalization is depends on the strength relative to the other terms in the loss calculation. [^relative-strength] - -We can equivalently specify this using Bayesian terminology, such that - -$$ -p(\boldsymbol{\theta} |\,\boldsymbol{V}) = \mathcal{L}(\boldsymbol{\theta}; \boldsymbol{V}) \, p(\boldsymbol{\theta}) -$$ - -where - -$$ -p(\boldsymbol{\theta}) = C \exp \left (-\lambda_\mathrm{sparsity} \sum_i | I_i| \right) -$$ - -and $C$ is a normalization factor. When working with the logarithm of the posterior, this constant term is irrelevant. - -:::{seealso} -That's RML imaging in a nutshell, but we've barely scratched the surface. We highly recommend checking out the following excellent resources. - -- The fourth paper in the 2019 [Event Horizon Telescope Collaboration series](https://ui.adsabs.harvard.edu/abs/2019ApJ...875L...4E/abstract) describing the imaging principles -- [Maximum entropy image restoration in astronomy](https://ui.adsabs.harvard.edu/abs/1986ARA%26A..24..127N/abstract) AR&A by Narayan and Nityananda 1986 -- [Multi-GPU maximum entropy image synthesis for radio astronomy](https://ui.adsabs.harvard.edu/abs/2018A%26C....22...16C/abstract) by Cárcamo et al. 2018 -::: ## The MPoL package for Regularized Maximum Likelihood imaging @@ -300,6 +41,26 @@ There are a few things about MPoL that we believe make it an appealing platform To get started with MPoL, we recommend [installing the package](installation.md) and reading through the tutorial series. If you have any questions about the package, we invite you to join us on our [Github discussions page](https://github.com/MPoL-dev/MPoL/discussions). +:::{seealso} +That's RML imaging in a nutshell, but we've barely scratched the surface. We highly recommend checking out the following excellent resources. + +- The fourth paper in the 2019 [Event Horizon Telescope Collaboration series](https://ui.adsabs.harvard.edu/abs/2019ApJ...875L...4E/abstract) describing the imaging principles +- [Maximum entropy image restoration in astronomy](https://ui.adsabs.harvard.edu/abs/1986ARA%26A..24..127N/abstract) AR&A by Narayan and Nityananda 1986 +- [Multi-GPU maximum entropy image synthesis for radio astronomy](https://ui.adsabs.harvard.edu/abs/2018A%26C....22...16C/abstract) by Cárcamo et al. 2018 +::: + + + +```{admonition} CLEAN +RML imaging is different from CLEAN imaging, which operates as a deconvolution procedure in the image plane. CLEAN is by far the dominant algorithm used to synthesize images from interferometric data at sub-mm and radio wavelengths. 
+ +- [Interferometry and Synthesis in Radio Astronomy](https://ui.adsabs.harvard.edu/abs/2017isra.book.....T/abstract) Chapter 11.1 +- [CASA documentation on tclean](https://casa.nrao.edu/casadocs-devel/stable/imaging/synthesis-imaging) +- David Wilner's lecture on [Imaging and Deconvolution in Radio Astronomy](https://www.youtube.com/watch?v=mRUZ9eckHZg) +- For a discussion on using both CLEAN and RML techniques to robustly interpret kinematic data of protoplanetary disks, see Section 3 of [Visualizing the Kinematics of Planet Formation](https://ui.adsabs.harvard.edu/abs/2020arXiv200904345D/abstract) by The Disk Dynamics Collaboration +``` + + ```{rubric} Footnotes ``` diff --git a/pyproject.toml b/pyproject.toml index c9ba4508..0fcb9f3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ dev = [ "tensorboard", "mypy", "frank>=1.2.1", - "sphinx>=5.3.0", + "sphinx>=7.2.0", "sphinx-autodoc2", "jupytext", "ipython!=8.7.0", # broken version for syntax higlight https://github.com/spatialaudio/nbsphinx/issues/687 diff --git a/src/mpol/losses.py b/src/mpol/losses.py index c27bdef5..5ceb7257 100644 --- a/src/mpol/losses.py +++ b/src/mpol/losses.py @@ -1,30 +1,3 @@ -r""" -Many loss function definitions follow those in Appendix A of -`EHT-IV 2019 `_, -including the regularization strength, which aspires to be similar across all terms, -providing at least a starting point for tuning multiple loss functions. - - -We expect you might use the loss functions in the following scenarios: - -* optimizing a model with fixed data amplitudes and weights: - this is the most common - scenario for RML imaging. Use :meth:`~mpol.losses.r_chi_squared` for loose - visibilities and :meth:`~mpol.losses.r_chi_squared_gridded` for gridded quantities - (e.g., :class:`~mpol.datasets.GriddedDataset`). -* optimizing a model with potential adjustments to data amplitudes and weights: - you will need to do this in a self-calibration workflow. Use - :meth:`~mpol.losses.neg_log_likelihood_avg` for loose visibilities. The nature of - the adjustments makes gridding inefficient, so there is no gridded version of this - loss. -* doing Bayesian inference: - if you are calculating a log likelihood to use with MCMC - sampling, for example, use :meth:`~mpol.losses.log_likelihood` or - :meth:`~mpol.losses.log_likelihood_gridded`. (Pyro defines its own likelihood - function through the probabilistic programming language, so this is not needed in - that case). -""" - import numpy as np import torch @@ -73,22 +46,21 @@ def r_chi_squared( model_vis: torch.Tensor, data_vis: torch.Tensor, weight: torch.Tensor ) -> torch.Tensor: r""" - Calculate the reduced :math:`\chi^2_\mathrm{r}` between the complex data + Calculate the reduced :math:`\chi^2_\mathrm{R}` between the complex data :math:`\boldsymbol{V}` and model :math:`M` visibilities using .. math:: - \chi^2_\mathrm{r} = \frac{1}{2 N} \chi^2(\boldsymbol{\theta};\,\boldsymbol{V}) + \chi^2_\mathrm{R} = \frac{1}{2 N} \chi^2(\boldsymbol{\theta};\,\boldsymbol{V}) - where :math:`\chi^2` is evaluated using :func:`mpol.losses.chi_squared`. + where :math:`\chi^2` is evaluated using private function :func:`mpol.losses._chi_squared`. Data and model visibilities may be any shape as long as all tensors (including weight) have the same shape. Following `EHT-IV 2019 `_, we apply a prefactor :math:`1/(2 N)`, where :math:`N` is the number of visibilities. The factor of 2 comes in because we must count real and imaginaries in the - :math:`\chi^2` sum. 
This means that this normalized negative log likelihood loss - function will have a minimum value of - :math:`\chi^2_\mathrm{r}(\hat{\boldsymbol{\theta}};\,\boldsymbol{V}) + :math:`\chi^2` sum. This loss function will have a minimum value of + :math:`\chi^2_\mathrm{R}(\hat{\boldsymbol{\theta}};\,\boldsymbol{V}) \approx 1` for a well-fit model (regardless of the number of data points), making it easier to set the prefactor strengths of other regularizers *relative* to this value. @@ -97,7 +69,7 @@ def r_chi_squared( situation `and` where you are not adjusting the weight or the amplitudes of the data values. If it is used in any situation where uncertainties on parameter values are determined (such as Markov Chain Monte Carlo), it will return the wrong answer. - This is because the relative scaling of :math:`\chi^2_\mathrm{r}` with respect to + This is because the relative scaling of :math:`\chi^2_\mathrm{R}` with respect to parameter value is incorrect. For those applications, you should use :meth:`mpol.losses.log_likelihood`. @@ -113,7 +85,7 @@ def r_chi_squared( Returns ------- :class:`torch.Tensor` of :class:`torch.double` - the :math:`\chi^2_\mathrm{r}`, summed over all dimensions of input array. + the :math:`\chi^2_\mathrm{R}`, summed over all dimensions of input array. """ # If model and data are multidimensional, then flatten them to get full N @@ -127,7 +99,7 @@ def r_chi_squared_gridded( ) -> torch.Tensor: r""" - Calculate the reduced :math:`\chi^2_\mathrm{r}` between the complex data + Calculate the reduced :math:`\chi^2_\mathrm{R}` between the complex data :math:`\boldsymbol{V}` and model :math:`M` visibilities using gridded quantities. Function will return the same value regardless of whether Hermitian pairs are included. @@ -145,7 +117,7 @@ def r_chi_squared_gridded( Returns ------- :class:`torch.Tensor` of :class:`torch.double` - the :math:`\chi^2_\mathrm{r}` value summed over all input dimensions + the :math:`\chi^2_\mathrm{R}` value summed over all input dimensions """ model_vis = griddedDataset(modelVisibilityCube) @@ -168,7 +140,7 @@ def log_likelihood( \frac{1}{2} \chi^2(\boldsymbol{\theta};\,\boldsymbol{V}) where :math:`N` is the number of complex visibilities and :math:`\chi^2` is - evaluated using :func:`mpol.losses.chi_squared`. Note that this expression has + evaluated internally using :func:`mpol.losses._chi_squared`. Note that this expression has factors of 2 in different places compared to the multivariate Normal you might be used to seeing because the visibilities are complex-valued. We could alternatively write @@ -258,7 +230,7 @@ def neg_log_likelihood_avg( .. math:: - - \frac{1}{2 N} \ln \mathcal{L}(\boldsymbol{\theta};\,\boldsymbol{V}) + L = - \frac{1}{2 N} \ln \mathcal{L}(\boldsymbol{\theta};\,\boldsymbol{V}) where :math:`N` is the number of complex visibilities. This loss function is most useful where you are in an optimization or point estimate
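For reference, a minimal usage sketch of the renamed loss function documented above (toy tensors only; this assumes the patched `mpol` package is installed, and per the docstring any matching tensor shapes are accepted):

```python
import torch
from mpol import losses

# Toy complex visibilities with known noise sigma and weights w = 1 / sigma**2.
N = 1000
sigma = 0.1
model_vis = (torch.randn(N) + 1j * torch.randn(N)).to(torch.complex128)
noise = sigma * (torch.randn(N) + 1j * torch.randn(N)).to(torch.complex128)
data_vis = model_vis + noise
weight = torch.full((N,), 1.0 / sigma**2, dtype=torch.float64)

# Renamed from mpol.losses.nll in v0.3.0; expected to be ~1 for a well-fit model.
loss = losses.r_chi_squared(model_vis, data_vis, weight)
```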