% \begin{savequote}[75mm]
% Nulla facilisi. In vel sem. Morbi id urna in diam dignissim feugiat. Proin molestie tortor eu velit. Aliquam erat volutpat. Nullam ultrices, diam tempus vulputate egestas, eros pede varius leo.
% \qauthor{Quoteauthor Lastname}
% \end{savequote}
\chapter{Statistics}
Statistics is a branch of mathematics dealing with the collection, organization, analysis, interpretation and presentation of data.
\href{https://en.wikipedia.org/wiki/Statistics}{wikipedia}
\section{Probability}
TODO: Probability (general + simple), CDF, Variance, Markov-Property, etc.
\subsection{$L_p$-Space for Random Variables}
The $L_p$-norm of a random variable $X$ with $\mathbb{E}|X|^p < \infty$ is defined as:
\begin{align*}
||X||_p:=(\mathbb{E}[|X|^p])^{\frac{1}{p}}
\end{align*}
\href{http://www2.stat.duke.edu/courses/Fall18/sta711/lec/wk-05.pdf}{lecture}
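A small worked observation (added here, not from the linked lecture): for $p=2$ this is the root-mean-square norm, and on a probability space Jensen's inequality gives monotonicity in $p$:
\begin{align*}
||X||_2 = (\mathbb{E}[X^2])^{\frac{1}{2}}\text{,}\qquad ||X||_p \leq ||X||_q \quad\text{for } 1 \leq p \leq q.
\end{align*}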
\subsection{Jensen's Inequality for Random Variables}
If $\phi$ is a convex function and $X$ a random variable, then
\begin{align*}
\phi(\mathbb{E}X) \leq \mathbb{E}\phi(X)
\end{align*}
\href{https://en.wikipedia.org/wiki/Jensen%27s_inequality}{wikipedia}
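For example (an illustration added here): choosing $\phi(x) = x^2$ gives
\begin{align*}
(\mathbb{E}X)^2 \leq \mathbb{E}[X^2] \;\Leftrightarrow\; Var[X] = \mathbb{E}[X^2] - (\mathbb{E}X)^2 \geq 0.
\end{align*}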
\subsection{Fisher-Information}
For the parametric family $\mathcal{P} = \{ p_\theta \mid \theta \in \Theta_L\}$...TODO\\
If we assume $p_\theta(x,y)=p_\theta(y|x)p(x)$, the Fisher-Information Matrix $\mathcal{I}(\theta)$ becomes:
\begin{align*}
\mathcal{I}(\theta) = \mathbb{E}_{(X,Y)\sim {P}_\theta}[\nabla_{\theta}\log p_{\theta}(Y|X)\otimes\nabla_{\theta}\log p_{\theta}(Y|X)]\text{,}
\end{align*}
where $\otimes$ denotes the outer product.\\
\href{https://arxiv.org/abs/1711.01530}{paper}
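A classical example (added for illustration, not from the cited paper): for a univariate Gaussian model $p_\mu(y) = \mathcal{N}(y; \mu, \sigma^2)$ with known $\sigma^2$,
\begin{align*}
\nabla_\mu \log p_\mu(y) = \frac{y - \mu}{\sigma^2}
\qquad\Rightarrow\qquad
\mathcal{I}(\mu) = \mathbb{E}\left[\frac{(Y-\mu)^2}{\sigma^4}\right] = \frac{1}{\sigma^2}.
\end{align*}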
\section{Distributions}
In this section, $X$ denotes a random variable and $f$ the density function.\\
TODO: More common distributions
\subsection{Normal Distribution}
If $X \sim \mathcal{N}(\mu, \sigma^2)$ for $\mu \in \mathbb{R}$ and $\sigma^2 > 0$, then:
\begin{align*}
f(x) &= \frac{1}{\sqrt{2\pi\sigma^2}}\,e^{-\frac{(x-\mu)^2}{2\sigma^2}}\\
\mathbb{E}X &= \mu \\
Var[X] &= \sigma^2
\end{align*}
\href{https://en.wikipedia.org/wiki/Normal_distribution}{wikipedia}
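A consequence worth noting (added here, standard fact): affine transformations of a Gaussian stay Gaussian, which yields the usual standardization:
\begin{align*}
aX + b \sim \mathcal{N}(a\mu + b,\, a^2\sigma^2)
\qquad\Rightarrow\qquad
Z = \frac{X - \mu}{\sigma} \sim \mathcal{N}(0, 1).
\end{align*}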
\subsection{Normal Distribution (Multivariate)}
If $X \sim \mathcal{N}(\mu, \Sigma)$ for $\mu \in \mathbb{R}^k$ and $\Sigma \in \mathbb{R}^{k \times k}$ with $\Sigma$ being positive definite (so that $\Sigma^{-1}$ in the density exists), then:
\begin{align*}
f(x) &= \operatorname {det} (2\pi {\boldsymbol {\Sigma }})^{-{\frac {1}{2}}}\,e^{-{\frac {1}{2}}(\mathbf {x} -{\boldsymbol {\mu }})'{\boldsymbol {\Sigma }}^{-1}(\mathbf {x} -{\boldsymbol {\mu }})}\\
\mathbb{E}X &= \mu \\
Var[X] &= \Sigma
\end{align*}
\href{https://en.wikipedia.org/wiki/Multivariate_normal_distribution}{wikipedia}
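A quick special case (added as an illustration): for a diagonal covariance $\Sigma = \operatorname{diag}(\sigma_1^2, \dots, \sigma_k^2)$ the components are independent and the density factorizes into univariate normal densities:
\begin{align*}
f(x) = \prod_{i=1}^{k} \frac{1}{\sqrt{2\pi\sigma_i^2}}\, e^{-\frac{(x_i - \mu_i)^2}{2\sigma_i^2}}.
\end{align*}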
\subsection{Empirical Distribution}
For any sample $X'=(x_1, \cdots, x_n)$, the empirical distribution is defined as:
\begin{align*}
f(x) &= \hat{f}(x) =\frac{1}{n}\sum_{i=1}^{n}\delta(x - x_i)\text{, where $\delta$ is the Dirac delta function}\\
\mathbb{E}X &= \hat{\mathbb{E}}X = \frac{1}{n}\sum_{i=1}^{n}x_i \\
Var[X] &= \hat{Var}[X] =\frac{1}{n}\sum_{i=1}^{n}(x_i-\hat{\mathbb{E}}X)^2
\end{align*}
\href{http://www.stat.umn.edu/geyer/5102/slides/s1.pdf}{lecture}
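A tiny numerical example (added here): for the sample $(1, 2, 6)$,
\begin{align*}
\hat{\mathbb{E}}X = \frac{1 + 2 + 6}{3} = 3\text{,}\qquad
\hat{Var}[X] = \frac{(1-3)^2 + (2-3)^2 + (6-3)^2}{3} = \frac{14}{3}.
\end{align*}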
\section{Estimation}
TODO: ML, Score-Function, biased/unbiased, Cramér–Rao bound, confidence-interval
\section{Divergences}
Conventions for this section: $P$ and $Q$ are probability measures over a set $X$, and $P$ is absolutely continuous with respect to $Q$. $S$ denotes the space of all probability distributions with common support.
\subsection{Divergence}
A divergence on $S$ is a function $D: S \times S \rightarrow \mathbb{R}$ satisfying
\begin{enumerate}
\item $D(p || q) \geq 0 \;\;\forall p, q \in S$,
\item $D(p || q) = 0 \Leftrightarrow p = q$
\end{enumerate}
\textit{A divergence provides a ``sense'' of distance between two probability distributions. In general it is not a metric (it need not be symmetric or satisfy the triangle inequality), but a pre-metric.}\\
\href{https://en.wikipedia.org/wiki/Divergence_(statistics)}{wikipedia}
\subsection{f-Divergence}
\begin{enumerate}
\item Generalization of a whole family of divergences (see the special cases after this list)
\item For a convex function $f$ such that $f(1) = 0$, the f-divergence of $P$ from $Q$ is defined as:\\
$D_{f}(P\parallel Q)\equiv \int _{X}f\left({\frac{dP}{dQ}}\right)\,dQ$
\item \href{https://en.wikipedia.org/wiki/F-divergence}{wikipedia}
\end{enumerate}
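Two standard special cases (added for illustration; both follow directly from the definition):
\begin{align*}
f(t) = t\log t &\;\Rightarrow\; D_f(P \parallel Q) = D_{\mathrm{KL}}(P \parallel Q)\text{,}\\
f(t) = \tfrac{1}{2}|t - 1| &\;\Rightarrow\; D_f(P \parallel Q) = d_{TV}(P, Q)\text{ (total variation distance).}
\end{align*}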
\subsection{KL-Divergence}
\begin{enumerate}
\item The Kullback–Leibler divergence from $Q$ to $P$ is defined as\\
$D_{\mathrm {KL} }(P\|Q)=\int _{X}\log {\frac {dP}{dQ}}\,dP$, i.e.\ the f-divergence with $f(t) = t\log t$ (a closed-form example follows after this list).
\item Maximizing likelihood is equivalent to minimizing $D_{KL}(P(\cdot \vert \theta^{\ast}) \, \Vert \, P(\cdot \vert \theta))$ (the forward KL divergence), where $P(\cdot \vert \theta^{\ast})$ is the true distribution and $P(\cdot \vert \theta)$ is our estimate.
\item \href{https://en.wikipedia.org/wiki/Kullback–Leibler_divergence}{wikipedia}
\item TODO: Fisher-Matrix infinitesimal relationship
\end{enumerate}
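A closed-form example (added here; a standard result): the KL divergence between two univariate Gaussians is
\begin{align*}
D_{\mathrm{KL}}\big(\mathcal{N}(\mu_1, \sigma_1^2) \,\|\, \mathcal{N}(\mu_2, \sigma_2^2)\big)
= \log\frac{\sigma_2}{\sigma_1} + \frac{\sigma_1^2 + (\mu_1 - \mu_2)^2}{2\sigma_2^2} - \frac{1}{2}.
\end{align*}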
\subsection{Jensen–Shannon divergence}
The Jensen–Shannon divergence between $P$ and $Q$ is defined as
\begin{align*}
{{\rm {JSD}}}(P\parallel Q)={\frac {1}{2}}D(P\parallel M)+{\frac {1}{2}}D(Q\parallel M)
\end{align*}
where $M={\frac {1}{2}}(P+Q)$ and $D$ denotes the KL-Divergence.\\
\href{https://en.wikipedia.org/wiki/Jensen–Shannon_divergence}{wikipedia}
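Two properties worth noting (added here): unlike the KL divergence, the JSD is symmetric and bounded,
\begin{align*}
0 \leq {\rm JSD}(P \parallel Q) \leq \log 2\text{,}
\end{align*}
and its square root is a metric.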
\subsection{TODO: Wasserstein \& Wasserstein Dual}
\section{Information Geometry}
Information Geometry equips the probability distributions of a statistical model with the structure of a Riemannian manifold.\\
\subsection{Fisher-Rao Metric}
For the parametric family $\mathcal{P} = \{ p_\theta \mid \theta \in \Theta_L\}$ and every $\alpha, \beta \in \mathbb{R}^d$ with their tangent vectors $\bar{\alpha}=dp_{\theta + t\alpha}/dt|_{t=0}$ and $\bar{\beta}=dp_{\theta + t\beta}/dt|_{t=0}$, the local inner product is defined as follows:
\begin{align*}
\langle\bar{\alpha}, \bar{\beta}\rangle &:= \int_M\frac{\bar{\alpha}}{p_\theta}\frac{\bar{\beta}}{p_\theta}p_\theta\\
&=\langle\alpha, \mathcal{I}(\theta)\beta\rangle\text{,}
\end{align*}
where $\mathcal{I}(\theta)$ is the Fisher-Information Matrix.
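This metric is the second-order (infinitesimal) expansion of the KL divergence, a standard fact recorded here for reference (my addition):
\begin{align*}
D_{\mathrm{KL}}(p_\theta \,\|\, p_{\theta + d\theta}) = \frac{1}{2}\, d\theta^\top \mathcal{I}(\theta)\, d\theta + O(||d\theta||^3).
\end{align*}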
\subsection{Natural Gradient}
The natural gradient is the steepest-descent direction induced by the Fisher-Rao geometry of $\{p_\theta \}$.\\
\href{https://arxiv.org/abs/1711.01530}{paper}
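Concretely (a standard formulation added for completeness; $L(\theta)$ denotes a generic loss and $\eta$ a step size): the natural gradient preconditions the ordinary gradient with the inverse Fisher-Information Matrix,
\begin{align*}
\tilde{\nabla}_\theta L(\theta) = \mathcal{I}(\theta)^{-1} \nabla_\theta L(\theta)\text{,}
\qquad
\theta_{t+1} = \theta_t - \eta\, \tilde{\nabla}_\theta L(\theta_t).
\end{align*}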