Merge pull request #31 from aimalz/issue/23/histogram

Issue/23/histogram
aimalz · Dec 7, 2016 · 93d4249 · 93d4249
2 parents 0da3d83 + 4d30fc2
commit 93d4249
Show file tree

Hide file tree

Showing 2 changed files with 179 additions and 61 deletions.
diff --git a/docs/notebooks/demo.ipynb b/docs/notebooks/demo.ipynb
@@ -6,7 +6,7 @@
    "source": [
     "# `qp` Demo\n",
     "\n",
-    "In this notebook we use the `qp` module to approximate some standard 1-D PDFs using sets of quantiles, and assess the accuracy of the quantile parametrization(TM).\n",
+    "In this notebook we use the `qp` module to approximate some standard 1-D PDFs using sets of quantiles, and assess the accuracy of the quantile parametrization(TM) in comparison to other popular parametrizations.\n",
     "\n",
     "### Requirements\n",
     "\n",
@@ -79,7 +79,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now, let's compute a set of evenly spaced quantiles. These will be carried by the `PDF` object as `p.quantiles`.  We also demonstrate the initialization of a PDF object with quantiles and no truth function."
+    "Now, let's compute a set of evenly spaced quantiles. These will be carried by the `PDF` object as `p.quantiles`.  We also demonstrate the initialization of a `PDF` object with quantiles and no truth function."
    ]
   },
   {
@@ -98,7 +98,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Let's test interpolation at a single point within the quantile range."
+    "Let's also compute a histogram representation that will be carried by the `PDF` object as `p.histogram`.  We can similary initialize a `PDF` object with a histogram and no truth function."
    ]
   },
   {
@@ -109,7 +109,27 @@
    },
    "outputs": [],
    "source": [
-    "P.approximate(0.314)"
+    "T = qp.PDF(truth=dist)\n",
+    "histogram = T.histogramize(nbins=10, binrange=[-2., 2.])\n",
+    "R = qp.PDF(histogram=histogram)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's test interpolation at a single point for the quantile parametrization."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "print(P.approximate(0.314, using='quantiles'))"
    ]
   },
   {
@@ -128,7 +148,7 @@
    "outputs": [],
    "source": [
     "grid = np.linspace(-3., 3., 100)\n",
-    "P.approximate(grid)"
+    "P.approximate(grid, using='quantiles')"
    ]
   },
   {
@@ -147,7 +167,8 @@
    "outputs": [],
    "source": [
     "bounds = (-3.0, 3.0)\n",
-    "P.plot(limits=bounds, points=grid)"
+    "P.plot(limits=bounds, points=grid)\n",
+    "T.plot(limits=bounds, points=grid)"
    ]
   },
   {
@@ -163,16 +184,21 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "scrolled": false
    },
    "outputs": [],
    "source": [
-    "D1 = qp.utils.calculate_kl_divergence(P, Q, limits=(-1.,1.))\n",
-    "D0 = qp.utils.calculate_kl_divergence(P, Q, limits=(min(quantiles), max(quantiles)))\n",
-    "D2 = qp.utils.calculate_kl_divergence(P, Q, limits=(-2.,2.))\n",
-    "D3 = qp.utils.calculate_kl_divergence(P, Q, limits=(-3.,3.))\n",
+    "qD1 = qp.utils.calculate_kl_divergence(P, Q, limits=(-1.,1.))\n",
+    "qD2 = qp.utils.calculate_kl_divergence(P, Q, limits=(-2.,2.))\n",
+    "qD3 = qp.utils.calculate_kl_divergence(P, Q, limits=(-3.,3.))\n",
     "\n",
-    "print(D1, D0, D2, D3)"
+    "hD1 = qp.utils.calculate_kl_divergence(T, R, limits=(-1.,1.))\n",
+    "hD2 = qp.utils.calculate_kl_divergence(T, R, limits=(-2.,2.))\n",
+    "hD3 = qp.utils.calculate_kl_divergence(T, R, limits=(-3.,3.))\n",
+    "\n",
+    "print(qD1, qD2, qD3)\n",
+    "print(hD1, hD2, hD3)"
    ]
   },
   {
@@ -190,12 +216,16 @@
    },
    "outputs": [],
    "source": [
-    "RMS1 = qp.utils.calculate_rms(P, Q, limits=(-1.,1.))\n",
-    "RMS0 = qp.utils.calculate_rms(P, Q, limits=(min(quantiles), max(quantiles)))\n",
-    "RMS2 = qp.utils.calculate_rms(P, Q, limits=(-2.,2.))\n",
-    "RMS3 = qp.utils.calculate_rms(P, Q, limits=(-3.,3.))\n",
+    "qRMS1 = qp.utils.calculate_rms(P, Q, limits=(-1.,1.))\n",
+    "qRMS2 = qp.utils.calculate_rms(P, Q, limits=(-2.,2.))\n",
+    "qRMS3 = qp.utils.calculate_rms(P, Q, limits=(-3.,3.))\n",
+    "\n",
+    "hRMS1 = qp.utils.calculate_rms(T, R, limits=(-1.,1.))\n",
+    "hRMS2 = qp.utils.calculate_rms(T, R, limits=(-2.,2.))\n",
+    "hRMS3 = qp.utils.calculate_rms(T, R, limits=(-3.,3.))\n",
     "\n",
-    "print(RMS1, RMS0, RMS2, RMS3)"
+    "print(qRMS1, qRMS2, qRMS3)\n",
+    "print(hRMS1, hRMS2, hRMS3)"
    ]
   },
   {

diff --git a/qp/pdf.py b/qp/pdf.py
@@ -4,25 +4,41 @@
 
 import qp
 
-
+<<<<<<< HEAD
 class PDF(object):
-    """
-    A PDF object, with some representation of a distribution.
-
-    Parameters
-    ----------
-    truth: scipy.stats.rv_continuous object, optional
-        continuous, parametric form of the PDF
-    quantiles: ndarray, optional
-        array of quantile values separated by
-    vb: boolean
-        report on progress to stdout?
-    """
-    def __init__(self, truth=None, quantiles=None, vb=True):
+
+    def __init__(self, truth=None, quantiles=None, histogram=None,
+                 vb=True):
+        """
+        An object representing a probability density function in
+        various ways.
+
+        Parameters
+        ----------
+        truth: scipy.stats.rv_continuous object, optional
+            Continuous, parametric form of the PDF
+        quantiles: ndarray, optional
+            Array of quantile values separated by 100./(len(quantiles)+1) percentiles
+        histogram: tuple of ndarrays, optional
+            Pair of arrays of lengths (nbins+1, nbins) containing endpoints of bins and values in bins
+        vb: boolean
+            report on progress to stdout?
+        """
         self.truth = truth
         self.quantiles = quantiles
-        # Should make this a proper exception rather than just printing an advisory notice
-        if vb and self.truth is None and self.quantiles is None:
+        self.histogram = histogram
+        self.initialized = None
+
+        if self.truth is not None:
+            self.initialized = 'truth'
+        elif self.quantiles is not None:
+            self.initialized = 'quantiles'
+        elif self.histogram is not None:
+            self.initialized = 'histogram'
+        self.last = self.initialized
+
+        if vb and self.truth is None and self.quantiles is None
+              and self.histogram is None:
             print 'Warning: initializing a PDF object without inputs'
         self.difs = None
         self.mids = None
@@ -32,7 +48,8 @@ def __init__(self, truth=None, quantiles=None, vb=True):
 
     def evaluate(self, loc, vb=True):
         """
-        Evaluates the PDF (either the true version, or an approximation of it) at given location(s).
+        Evaluates the PDF (either the true version, or the most recent
+        approximation of it) at the given location(s).
 
         Parameters
         ----------
@@ -53,11 +70,9 @@ def evaluate(self, loc, vb=True):
         if self.truth is not None:
             if vb: print('Evaluating the true distribution.')
             val = self.truth.pdf(loc)
-        elif self.quantiles is not None:
-            if vb: print('Evaluating an interpolation of the quantile distribution.')
-            val = self.approximate(loc)[1]
         else:
-            if vb: print('No representation provided for evaluation.')
+            if vb: print('Evaluating an interpolation of the '+self.last+' parametrization.')
+            val = self.approximate(loc, using=self.last)[1]
 
         return(val)
 
@@ -83,13 +98,13 @@ def integrate(self, limits):
 
     def quantize(self, percent=1., number=None, vb=True):
         """
-        Computes an array of evenly-spaced quantiles.
+        Computes an array of evenly-spaced quantiles from the truth.
 
         Parameters
         ----------
         percent: float
             the separation of the requested quantiles, in percent
-        num_points: int
+        number: int
             the number of quantiles to compute.
         vb: boolean
             report on progress to stdout?
@@ -123,16 +138,70 @@ def quantize(self, percent=1., number=None, vb=True):
             self.quantiles = self.truth.ppf(points)
         else:
             print('New quantiles can only be computed from a truth distribution in this version.')
+            return
 
         if vb: print("Result: ", self.quantiles)
+        self.last = 'quantiles'
         return self.quantiles
 
-    def interpolate(self, vb=True):
+    def histogramize(self, binends=None, nbins=10, binrange=[0., 1.], vb=True):
         """
-        Constructs an `interpolator` function based on the quantiles.
+        Computes the histogram values from the truth.
 
         Parameters
         ----------
+        binends: ndarray, float, optional
+            Array of N+1 endpoints of N bins
+        nbins: int, optional
+            Number of bins if no binends provided
+        range: tuple, float, optional
+            Pair of values of endpoints of total bin range
+        vb: boolean
+            Report on progress to stdout?
+
+        Returns
+        -------
+        self.histogram: tuple of ndarrays of floats
+            Pair of arrays of lengths (nbins+1, nbins) containing endpoints of bins and values in bins
+
+        Comments
+        --------
+        A histogram representation of a PDF is a popular approximate way to store it. This method computes some histogram bin heights from a truth distribution (other representations forthcoming)
+        and stores them in the `self.histogram` attribute.
+
+        Uses the `.cdf` method of the `rvs_continuous` distribution
+        object stored in `self.truth`. This calculates the CDF.
+        See `the Scipy docs <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rv_continuous.cdf.html#scipy.stats.rv_continuous.cdf>`_ for details.
+
+        """
+        if binends is None:
+            step = float(binrange[1]-binrange[0])/nbins
+            binends = np.arange(binrange[0], binrange[1]+step, step)
+
+        nbins = len(binends)-1
+        self.histogram = np.zeros(nbins)
+        if vb: print("Calculating histogram: ", binends)
+        if self.truth is not None:
+            cdf = self.truth.cdf(binends)
+            for b in range(nbins):
+                self.histogram[b] = (cdf[b+1]-cdf[b])/(binends[b+1]-binends[b])
+        else:
+            print('New histograms can only be computed from a truth distribution in this version.')
+            return
+
+        self.histogram = (binends, self.histogram)
+        if vb: print("Result: ", self.histogram[1])
+        self.last = 'histogram'
+        return self.histogram
+
+    def interpolate(self, using=None, vb=True):
+        """
+        Constructs an `interpolator` function based on the parametrization.
+
+        Parameters
+        ----------
+        using: string
+            Parametrization on which to interpolate, currently supports 'quantiles', 'histogram'
         vb: boolean
             report on progress to stdout?
 
@@ -142,31 +211,47 @@ def interpolate(self, vb=True):
 
         Notes
         -----
-        The `self.interpolator` object is a function, that is used by the `approximate` method.
+        The `self.interpolator` object is a function that is used by the `approximate` method.
         """
-        # First find the quantiles if none exist:
-        if self.quantiles is None:
-            self.quantiles = self.quantize()
+        if using == 'truth':
+            print('The truth needs no interpolation.  Try converting to an approximate parametrization first.')
+            return
 
-        self.difs = self.quantiles[1:]-self.quantiles[:-1]
-        self.mids = (self.quantiles[1:]+self.quantiles[:-1])/2.
-        self.quantvals = (1.0/(len(self.quantiles)+1))/self.difs
+        if using == 'quantiles':
+            # First find the quantiles if none exist:
+            if self.quantiles is None:
+                self.quantiles = self.quantize()
 
-        if vb: print("Creating interpolator")
-        self.interpolator = spi.interp1d(self.mids, self.quantvals, fill_value="extrapolate")
+            self.difs = self.quantiles[1:]-self.quantiles[:-1]
+            self.mids = (self.quantiles[1:]+self.quantiles[:-1])/2.
+            self.vals = (1.0/(len(self.quantiles)+1))/self.difs
+
+        if using == 'histogram':
+            # First find the histogram if none exists:
+            if self.histogram is None:
+                self.histogram = self.histogramize()
+
+            self.mids = (self.histogram[0][1:]+self.histogram[0][:-1])/2.
+            self.vals = self.histogram[1]
+
+        if vb: print("Creating interpolator for "+using+' parametrization.')
+        self.interpolator = spi.interp1d(self.mids, self.vals, fill_value="extrapolate")
 
         return
 
-    def approximate(self, points, vb=True):
+    def approximate(self, points, using=None, vb=True):
         """
-        Interpolates between the quantiles to get an approximation to the density.
+        Interpolates the parametrization to get an approximation to the density.
 
         Parameters
         ----------
         number: int
             the number of points over which to interpolate, bounded by the quantile value endpoints
         points: ndarray
             the value(s) at which to evaluate the interpolated function
+        using: string
+            approximation parametrization, currently either 'quantiles'
+            or 'histogram'
         vb: boolean
             report on progress to stdout?
 
@@ -186,9 +271,8 @@ def approximate(self, points, vb=True):
             x, y = p.approximate(np.linspace(-1., 1., 100))
         """
 
-        # First construct interpolator function if it does not already exist.
         if self.interpolator is None:
-            self.interpolate()
+            self.interpolate(using=using)
         interpolated = self.interpolator(points)
         interpolated[interpolated<0.] = 0.
 
@@ -216,12 +300,16 @@ def plot(self, limits, points=None):
             plt.plot(x, self.truth.pdf(x), color='k', linestyle='-', lw=1.0, alpha=1.0, label='True PDF')
 
         if self.quantiles is not None:
-            y = [0., 1.]
-            plt.vlines(self.quantiles, y[0], y[1], color='k', linestyle='--', lw=1.0, alpha=1., label='Quantiles')
-
-        if points is not None:
-            (grid, interpolated) = self.approximate(points)
-            plt.plot(grid, interpolated, color='r', linestyle=':', lw=2.0, alpha=1.0, label='Interpolated PDF')
+            plt.vlines(self.quantiles, 0., 1., color='b', linestyle='--', lw=1.0, alpha=1., label='Quantiles')
+            if points is not None:
+                (grid, qinterpolated) = self.approximate(points, using='quantiles')
+                plt.plot(grid, qinterpolated, color='b', linestyle=':', lw=2.0, alpha=1.0, label='Quantile Interpolated PDF')
+
+        if self.histogram is not None:
+            plt.hlines(self.histogram[1], self.histogram[0][:-1], self.histogram[0][1:], color='r', linestyle='--', lw=1.0, alpha=1., label='Histogram')
+            if points is not None:
+                (grid, hinterpolated) = self.approximate(points, using='histogram')
+                plt.plot(grid, hinterpolated, color='r', linestyle=':', lw=2.0, alpha=1.0, label='Histogram Interpolated PDF')
 
         plt.legend()
         plt.xlabel('x')