---
title: "hypothesis test formula sheet"
output: pdf_document
classoption: landscape
geometry: margin=1.5cm
header-includes:
  - \usepackage{float}
  - \usepackage{makecell}
---
\renewcommand{\arraystretch}{2.5}
### One sample tests
\begin{figure}[H]
\begin{center}
\begin{tabular}{lccccc}
name & known parameters & null hypothesis & test statistic & null distribution & p-value (two sided) \\
\hline
one sample z-test &
$\mu$ unknown, $\sigma^2$ known &
$H_0: \mu = \mu_0$ &
$Z_\text{obs} = \displaystyle {\bar x - \mu_0 \over {\sigma \over \sqrt n}}$ &
N(0, 1) &
$2 \cdot P(Z < -|Z_\text{obs}|)$ \\
\thead{one sample z-test \\ for a proportion} &
$\pi$ unknown &
$H_0: \pi = \pi_0$ &
$Z_\text{obs} = \displaystyle {\hat \pi - \pi_0 \over \sqrt{\pi_0 (1 - \pi_0) \over n}}$ &
N(0, 1) &
$2 \cdot P(Z < -|Z_\text{obs}|)$ \\
one sample t-test &
$\mu, \sigma^2$ unknown &
$H_0: \mu = \mu_0$ & $T_\text{obs} = \displaystyle {\bar x - \mu_0 \over {s \over \sqrt n}}$ &
$t_{n-1}$ &
$2 \cdot P(t_{n-1} < -|T_\text{obs}|)$ \\
chi-square test for variance &
$\mu, \sigma^2$ unknown &
$H_0: \sigma^2 = \sigma^2_0$ &
$T_\text{obs} = \displaystyle {(n - 1) \cdot s^2 \over \sigma^2_0}$ &
$\chi^2_{n-1}$ &
$2 \cdot \min(P(\chi^2_{n-1} \le T_\text{obs}), P(\chi^2_{n-1} \ge T_\text{obs}))$ \\
sign test &
none &
$H_0: m = m_0$ &
$B_\text{obs} = \sum_i I_{x_i > m_0}$ &
Binomial($\sum_i I_{x_i \neq m_0}, \frac 12$) &
$2 \cdot \min(P(B \ge B_\text{obs}), P(B \le B_\text{obs}))$
\end{tabular}
\end{center}
\end{figure}
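
These can be run in R roughly as follows (a minimal sketch on simulated data; the sample `x`, the values `mu0` and `m0`, and the proportion counts are illustrative, and `prop.test` reports the equivalent chi-squared statistic $Z_\text{obs}^2$ rather than $Z_\text{obs}$). The chunks in this sheet are marked `eval=FALSE` so they do not change the rendered tables.

```{r one-sample-examples, eval=FALSE}
set.seed(1)
x <- rnorm(30, mean = 5, sd = 2)   # simulated sample
mu0 <- 4.5

# one sample t-test of H0: mu = mu0
t.test(x, mu = mu0)

# one sample z-test for a proportion: 18 successes out of n = 30, pi0 = 0.5
# (prop.test without continuity correction reports Z_obs^2 as X-squared)
prop.test(18, 30, p = 0.5, correct = FALSE)

# sign test of H0: m = m0, using the Binomial null distribution
m0 <- 4.5
binom.test(sum(x > m0), sum(x != m0), p = 0.5)
```
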
### Two sample tests
\begin{figure}[H]
\begin{center}
\begin{tabular}{lccccc}
name & known parameters & null hypothesis & test statistic & null distribution & p-value (two sided) \\
\hline
two sample z-test &
\thead{$\mu_1, \mu_2$ unknown \\ $\sigma_1^2, \sigma_2^2$ known} &
$H_0: \mu_1 - \mu_2 = \delta_0$ &
$Z_\text{obs} = \displaystyle {(\bar x - \bar y) - \delta_0 \over \sqrt{{\sigma_1^2 \over n_1} + {\sigma_2^2 \over n_2}}}$ &
N(0, 1) &
$2 \cdot P(Z < -|Z_\text{obs}|)$ \\
\thead{two sample z-test \\ for a proportion} &
$\pi_1, \pi_2$ unknown &
$H_0: \pi_1 = \pi_2$ &
$Z_\text{obs} = \displaystyle {\hat \pi_1 - \hat \pi_2 \over \sqrt{\hat \pi (1 - \hat \pi) \left({1 \over n_1} + {1 \over n_2} \right)}}$ &
N(0, 1) &
$2 \cdot P(Z < -|Z_\text{obs}|)$ \\
equal variance t-test &
\thead{$\mu_1, \mu_2$ unknown \\ $\sigma_1^2 = \sigma_2^2$ unknown} &
$H_0: \mu_1 - \mu_2 = \delta_0$ &
$T_\text{obs} = \displaystyle {(\bar x - \bar y) - \delta_0 \over
s_\text{pooled} \sqrt{{1 \over n_1} + {1 \over n_2}}}$ &
$t_{n_1 + n_2 - 2}$ &
$2 \cdot P(t_{n_1 + n_2 - 2} < -|T_\text{obs}|)$ \\
welch's t-test &
\thead{$\mu_1, \mu_2$ unknown \\ $\sigma_1^2 \neq \sigma_2^2$ unknown} &
$H_0: \mu_1 - \mu_2 = \delta_0$ &
$T_\text{obs} = \displaystyle {(\bar x - \bar y) - \delta_0 \over
\sqrt{{s_1^2 \over n_1} + {s_2^2 \over n_2}}}$ &
$t_\nu$ &
$2 \cdot P(t_\nu < -|T_\text{obs}|)$ \\
f-test for equal variance &
\thead{$\mu_1, \mu_2$ unknown \\ $\sigma_1^2, \sigma_2^2$ unknown} &
$H_0: \sigma_1^2 = \sigma_2^2$ &
$F_\text{obs} = \displaystyle {s_1^2 \over s_2^2}$ &
$F_{n_1 - 1, n_2 - 1}$ &
$2 \cdot \min(P(F_{n_1 - 1, n_2 - 1} \le F_\text{obs}), P(F_{n_1 - 1, n_2 - 1} \ge F_\text{obs}))$ \\
\end{tabular}
\end{center}
\end{figure}
where
\begin{align}
s_\text{pooled} &= \sqrt{{(n_1 - 1) \cdot s_1^2 + (n_2 - 1) \cdot s_2^2 \over n_1 + n_2 - 2}}
\qquad \qquad
\nu = { \left({s_1^2 \over n_1} + {s_2^2 \over n_2}\right)^2 \over
{\left({s_1^2 \over n_1} \right)^2 \over n_1 - 1} +
{\left({s_2^2 \over n_2} \right)^2 \over n_2 - 1}
}
\qquad \qquad
\hat \pi = {n_1 \hat \pi_1 + n_2 \hat \pi_2 \over n_1 + n_2}
\end{align}
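
In R these are one-liners (a sketch on simulated data; `t.test` computes $\nu$ for Welch's test automatically, and `prop.test` uses the pooled $\hat \pi$ and reports $Z_\text{obs}^2$ as a chi-squared statistic):

```{r two-sample-examples, eval=FALSE}
set.seed(2)
x <- rnorm(25, mean = 10, sd = 2)
y <- rnorm(30, mean = 11, sd = 3)

t.test(x, y, var.equal = TRUE)   # equal variance t-test with s_pooled
t.test(x, y)                     # Welch's t-test (default), df = nu above
var.test(x, y)                   # f-test of H0: sigma1^2 = sigma2^2

# two sample z-test for proportions: 12/25 vs 20/30, pooled pi hat
prop.test(c(12, 20), c(25, 30), correct = FALSE)
```
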
### Paired two-sample tests (incomplete, don't trust these yet)
Suppose $x_i$ is paired with $y_i$. Then let $d_i = x_i - y_i$.
\begin{figure}[H]
\begin{center}
\begin{tabular}{lccccc}
name & known parameters & null hypothesis & test statistic & null distribution & p-value (two sided) \\
\hline
paired z-test &
$\mu_1, \mu_2$ unknown, $\sigma_d^2$ known &
$H_0: \mu_1 - \mu_2 = \delta_0$ &
$Z_\text{obs} = \displaystyle {\bar d - \delta_0 \over {\sigma_d \over \sqrt n}}$ & N(0, 1) &
$2 \cdot P(Z < -|Z_\text{obs}|)$ \\
paired t-test &
$\mu_1, \mu_2, \sigma_d^2$ unknown &
$H_0: \mu_1 - \mu_2 = \delta_0$ &
$T_\text{obs} = \displaystyle {\bar d - \delta_0 \over {s_d \over \sqrt n}}$ & $t_{n-1}$ &
$2 \cdot P(t_{n-1} < -|T_\text{obs}|)$ \\
paired sign test &
none &
$H_0: m_1 - m_2 = \delta_0$ &
$B_\text{obs} = \sum_i I_{d_i > \delta_0}$ &
Binomial($\sum_i I_{d_i \neq \delta_0}, \frac 12$) &
$2 \cdot \min(P(B \ge B_\text{obs}), P(B \le B_\text{obs}))$
\end{tabular}
\end{center}
\end{figure}
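
A sketch of the paired versions in R (simulated paired measurements; the paired t-test is the same as a one sample t-test on the differences $d_i$):

```{r paired-examples, eval=FALSE}
set.seed(3)
x <- rnorm(20, mean = 5)
y <- x + rnorm(20, mean = -0.3)   # second measurement on the same units
d <- x - y

t.test(x, y, paired = TRUE)       # paired t-test of H0: mu1 - mu2 = 0
t.test(d, mu = 0)                 # identical: one sample t-test on d

# paired sign test on the differences
binom.test(sum(d > 0), sum(d != 0), p = 0.5)
```
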
### Related to linear regression and ANOVA
A one-way ANOVA table looks like
\begin{figure}[H]
\begin{center}
\begin{tabular}{lccccc}
source & df & SS & MS & F & p-value \\
\hline
treatment &
$k - 1$ &
$\text{SSTreatment}$ &
$\text{MSTreatment} = {\text{SSTreatment} \over k - 1}$ &
${\text{MSTreatment} \over \text{MSError}}$ &
$P(F_{k - 1, N - k} > F_\text{obs})$ \\
error &
$N - k$ &
$\text{SSError}$ &
$\text{MSError} = {\text{SSError} \over N - k}$ \\
total &
$N - 1$ &
$\text{SSTotal}$
\end{tabular}
\end{center}
\end{figure}
where
\begin{align}
\text{SSTreatment}
&= \sum_{i=1}^k \sum_{j=1}^{n_i} (\bar y_{i \cdot} - \bar y_{\cdot \cdot})^2
= \sum_{i=1}^k n_i (\bar y_{i \cdot} - \bar y_{\cdot \cdot})^2 \\
\text{SSError}
&= \sum_{i=1}^k \sum_{j=1}^{n_i} (y_{ij} - \bar y_{i \cdot})^2
= \sum_{i=1}^k (n_i - 1) s^2_i \\
\text{SSTotal}
&= \text{SSTreatment} + \text{SSError}
= \sum_{i=1}^k \sum_{j=1}^{n_i} (y_{ij} - \bar y_{\cdot \cdot})^2
\end{align}
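
The whole table can be produced in R with `aov` (a sketch on simulated groups; the group means used to simulate are illustrative):

```{r anova-example, eval=FALSE}
set.seed(4)
group <- factor(rep(c("a", "b", "c"), each = 10))   # k = 3 groups, N = 30
y <- rnorm(30, mean = rep(c(5, 6, 5.5), each = 10))

fit_aov <- aov(y ~ group)
summary(fit_aov)   # df, SS, MS, F, and p-value as in the table above
```
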
We use this to compare group means (i.e. a categorical predictor). When we have a continuous predictor we use simple linear regression, which gives the following tests:
\begin{figure}[H]
\begin{center}
\begin{tabular}{lccccc}
name & null hypothesis & test statistic & null distribution & p-value \\
\hline
t-test for $\beta_0$ &
$H_0: \beta_0 = c$ &
$T_\text{obs} = \displaystyle {\hat \beta_0 - c \over \hat \sigma \sqrt{\frac 1n + \bar x^2 / \sum_{i=1}^n (x_i - \bar x)^2 }}$ &
$t_{n-2}$ &
$2 \cdot P(T < -|T_\text{obs}|)$ \\
t-test for $\beta_1$ &
$H_0: \beta_1 = c$ &
$T_\text{obs} = \displaystyle {\hat \beta_1 - c \over \hat \sigma / \sqrt{\sum_{i=1}^n (x_i - \bar x)^2}}$ &
$t_{n-2}$ &
$2 \cdot P(T < -|T_\text{obs}|)$ \\
overall f-test for linear model &
$H_0: \beta_1 = 0$ &
$F_\text{obs} = \displaystyle {\sum_{i=1}^n (\hat y_i - \bar y)^2 \over \hat \sigma^2}$ &
$F_{1, n - 2}$ &
$P(F_{1, n - 2} > F_\text{obs})$ \\
\end{tabular}
\end{center}
\end{figure}
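
In R, `summary(lm(...))` reports these t-tests (with $c = 0$) together with the overall f-test (a sketch on simulated data; the true coefficients 1 and 0.5 are illustrative):

```{r slr-example, eval=FALSE}
set.seed(5)
x <- runif(40, 0, 10)
y <- 1 + 0.5 * x + rnorm(40)

fit_lm <- lm(y ~ x)
summary(fit_lm)   # t-tests of beta0 = 0 and beta1 = 0 on n - 2 df, plus the overall f-test
anova(fit_lm)     # the corresponding regression / residual sums of squares
```
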
Miscellaneous simple linear regression formulas (not from hypothesis tests):
\begin{align}
y_i &= \beta_0 + \beta_1 x_i + \varepsilon_i \qquad \varepsilon_i \sim \textrm{Normal}(0, \sigma^2) \\
\rho &= {
\sum_{i=1}^n (x_i - \bar x) (y_i - \bar y) \over
\sqrt{\sum_{i=1}^n (x_i - \bar x)^2}
\sqrt{\sum_{i=1}^n (y_i - \bar y)^2}}
= {s_{xy} \over s_x s_y} \\
\hat \beta_1 &= {
\sum_{i=1}^n (x_i - \bar x) (y_i - \bar y) \over
\sum_{i=1}^n (x_i - \bar x)^2}
= \rho \cdot {s_y \over s_x} \\
\hat \beta_0 &= \bar y - \hat \beta_1 \bar x \\
\hat y_i &= \hat \beta_0 + \hat \beta_1 x_i \\
e_i &= y_i - \hat y_i \\
\text{SSE} &= \sum_{i=1}^n (y_i - \hat y_i)^2 \\
\hat \sigma^2 &= \text{MSE} = {\sum_{i=1}^n (y_i - \hat y_i)^2 \over n - 2}
\end{align}
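
These closed forms can be checked by hand against `lm` (a sketch reusing `x`, `y`, and `fit_lm` from the chunk above):

```{r slr-by-hand, eval=FALSE}
beta1_hat  <- sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))^2)
beta0_hat  <- mean(y) - beta1_hat * mean(x)
y_hat      <- beta0_hat + beta1_hat * x
sigma2_hat <- sum((y - y_hat)^2) / (length(x) - 2)   # MSE

c(beta0_hat, beta1_hat)   # should match coef(fit_lm)
sigma2_hat                # should match sigma(fit_lm)^2
```
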
Confidence intervals for means and future predictions at $x^*$ are given by
\begin{align}
\hat \beta_0 + \hat \beta_1 x^* &\pm t_{n - 2,\, \alpha/2} \cdot \hat \sigma \sqrt{\frac 1n + {(x^* - \bar x)^2 \over \sum_{i=1}^n (x_i - \bar x)^2}} \qquad \text{(mean at $x^*$)} \\
\hat \beta_0 + \hat \beta_1 x^* &\pm t_{n - 2,\, \alpha/2} \cdot \hat \sigma \sqrt{1 + \frac 1n + {(x^* - \bar x)^2 \over \sum_{i=1}^n (x_i - \bar x)^2}} \qquad \text{(future prediction at $x^*$)}
\end{align}
### Post-hoc tests for ANOVA
\begin{figure}[H]
\begin{center}
\begin{tabular}{lccccc}
name & null hypothesis & test statistic & null distribution & p-value \\
\hline
linear contrast on group means &
TODO &
TODO &
TODO &
TODO \\
tukey's honestly significant differences &
TODO &
TODO &
TODO &
TODO \\
\end{tabular}
\end{center}
\end{figure}
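
In R, Tukey's HSD is available directly on an `aov` fit (a minimal sketch reusing `fit_aov` from the ANOVA chunk above; linear contrasts would typically go through an add-on package and are not shown here):

```{r posthoc-example, eval=FALSE}
TukeyHSD(fit_aov)   # all pairwise differences of group means, familywise-adjusted
```
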
### Nonparametric tests
- Kruskal-Wallis
- Wilcoxon rank-sum
- Wilcoxon signed-rank
- Bootstrap and permutation tests (parametric; one and two sample; paired), sketched below
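
A sketch of a few of these in R (simulated data; `wilcox.test` covers both the rank-sum and signed-rank tests, and the permutation test shuffles group labels to get a two sided p-value for a difference in means):

```{r nonparametric-examples, eval=FALSE}
set.seed(6)
g <- factor(rep(c("a", "b", "c"), each = 10))
z <- rnorm(30, mean = rep(c(0, 0.5, 1), each = 10))
x <- rexp(20)
y <- rexp(25, rate = 0.8)
d <- rnorm(15, mean = 0.3)      # paired differences

kruskal.test(z ~ g)             # Kruskal-Wallis: k group locations
wilcox.test(x, y)               # Wilcoxon rank-sum (Mann-Whitney)
wilcox.test(d, mu = 0)          # Wilcoxon signed-rank on paired differences

# two sample permutation test of equal means, shuffling group labels
obs  <- mean(x) - mean(y)
perm <- replicate(10000, {
  s <- sample(c(x, y))
  mean(s[1:20]) - mean(s[-(1:20)])
})
mean(abs(perm) >= abs(obs))     # two sided permutation p-value
```
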