% presentation.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Beamer Presentation
% LaTeX Template
% Version 1.0 (10/11/12)
%
% This template has been downloaded from:
% http://www.LaTeXTemplates.com
%
% License:
% CC BY-NC-SA 3.0 (http://creativecommons.org/licenses/by-nc-sa/3.0/)
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%----------------------------------------------------------------------------------------
% PACKAGES AND THEMES
%----------------------------------------------------------------------------------------
\documentclass{beamer}

% Settings in this group are applied in presentation mode.
\mode<presentation> {
  \usepackage[utf8]{inputenc} % UTF-8 input encoding
  %
  % --- Slide theme ------------------------------------------------------
  % Exactly one \usetheme line is active (Madrid). The others are kept as a
  % menu: uncomment one (and comment Madrid) to try a different layout.
  %\usetheme{default}
  %\usetheme{AnnArbor}
  %\usetheme{Antibes}
  %\usetheme{Bergen}
  %\usetheme{Berkeley}
  %\usetheme{Berlin}
  %\usetheme{Boadilla}
  %\usetheme{CambridgeUS}
  %\usetheme{Copenhagen}
  %\usetheme{Darmstadt}
  %\usetheme{Dresden}
  %\usetheme{Frankfurt}
  %\usetheme{Goettingen}
  %\usetheme{Hannover}
  %\usetheme{Ilmenau}
  %\usetheme{JuanLesPins}
  %\usetheme{Luebeck}
  \usetheme{Madrid}
  %\usetheme{Malmoe}
  %\usetheme{Marburg}
  %\usetheme{Montpellier}
  %\usetheme{PaloAlto}
  %\usetheme{Pittsburgh}
  %\usetheme{Rochester}
  %\usetheme{Singapore}
  %\usetheme{Szeged}
  %\usetheme{Warsaw}
  %
  % --- Color theme ------------------------------------------------------
  % Optional recoloring of the slide theme above; none is active.
  %\usecolortheme{albatross}
  %\usecolortheme{beaver}
  %\usecolortheme{beetle}
  %\usecolortheme{crane}
  %\usecolortheme{dolphin}
  %\usecolortheme{dove}
  %\usecolortheme{fly}
  %\usecolortheme{lily}
  %\usecolortheme{orchid}
  %\usecolortheme{rose}
  %\usecolortheme{seagull}
  %\usecolortheme{seahorse}
  %\usecolortheme{whale}
  %\usecolortheme{wolverine}
  %
  % --- Footer and navigation (all disabled) -----------------------------
  %\setbeamertemplate{footline} % removes the footer line on all slides
  %\setbeamertemplate{footline}[page number] % footer shows only a slide count
  %\setbeamertemplate{navigation symbols}{} % hides the navigation symbols
}

\usepackage{graphicx} % \includegraphics for images
\usepackage{booktabs} % \toprule, \midrule and \bottomrule for tables
%----------------------------------------------------------------------------------------
% TITLE PAGE
%----------------------------------------------------------------------------------------
% Title-page metadata. The bracketed short forms are the ones repeated at the
% bottom of every slide; the braced long forms are printed on the title page.
\title[Alignment-free tools]{Alignment-free tools for metagenomics-data analysis}
\author{Robert Deibel}
\institute[Universität Tübingen]
{
Eberhard-Karls Universität Tübingen \\ % Institution as shown on the title page
\medskip
% NOTE(review): the text below looks like a placeholder left by a web page's
% e-mail obfuscation, not a real address -- restore the actual address.
\textit{[email protected]}
}
\date{\today} % Date of compilation; replace with a fixed date if required
\begin{document}
\begin{frame}
  % First slide: title page assembled from \title, \author, \institute and \date.
  \titlepage
\end{frame}
% Outline slide; comment this frame out to drop the overview.
\begin{frame}{Overview}
  % Prints every \section and \subsection declared in the presentation.
  \tableofcontents
\end{frame}
%----------------------------------------------------------------------------------------
% PRESENTATION SLIDES
%----------------------------------------------------------------------------------------
%------------------------------------------------
\section{Metagenomics} % Sections and subsections are listed automatically on the overview slide
% Sectioning commands are placed between frames rather than inside one:
% they print nothing on the slide and only create table-of-contents and
% navigation entries. Both subsections are covered by the frame below.
\subsection{Metagenome}
\subsection{NGS and alignment}
%------------------------------------------------
% Introduces the term "metagenome" and the role of sequencing (NGS).
\begin{frame}
\frametitle{Metagenomics}
\begin{block}{Metagenome}
\begin{itemize}
\item A metagenome is the whole set of transcripts found in a sample.
\item Metagenomics is the study of those
\item $>90\%$ unculturable microorganisms
\item design of antibiotics, analysis of microorganismal life
\end{itemize}
\end{block}
\begin{block}{NGS and alignment}
\begin{itemize}
\item Advances in sequencing made metagenomics possible
\item NGS generates comparable reads
\end{itemize}
\end{block}
\end{frame}
%------------------------------------------------
% Second "Metagenomics" slide: a single block listing the goals.
\begin{frame}{Metagenomics}
  \begin{block}{Goals}
    \begin{itemize}
      \item insight in microorganismal life
      \item first evidence of origin and function
      \item independent from databases and coding regions
    \end{itemize}
  \end{block}
\end{frame}
%------------------------------------------------
\section{Alignment-based approach}
% Two-slide frame: the advantages are always visible, the disadvantages
% are uncovered on slide 2.
\begin{frame}
\frametitle{Alignment-based approach}
\begin{block}{Advantages}
\begin{itemize}
\item Align sequences against database
\item Profiles can be analyzed
\item BLAST $>80\%$ accuracy
\end{itemize}
\end{block}
\uncover<2>{\begin{block}{Disadvantages}
\begin{itemize}
\item Low speed
\item Dependent on databases
\item Unsequenced transcripts cannot be matched
\item Databases mostly consist of coding sequences
\end{itemize}
\end{block}}
\end{frame}
%-----------------------------------------------
\section{Alignment-free methods}
% Overview of the two alignment-free families covered in this section:
% statistics-based measures and machine learning.
\begin{frame}{Different approaches}
  \begin{block}{Statistics}
    \begin{itemize}
      \item Utilizes statistics differing in power
      \item based on $k$-tuple counts
      \item Have to be applied through (dis)similarity matrix
      \item Further analysis follows
    \end{itemize}
  \end{block}
  \begin{block}{Machine learning}
    \begin{itemize}
      \item Optimization of a function
      \item based on $k$-mer signature
      \item applies BH-SNE
      \item Visualizes data in scatter plots
    \end{itemize}
  \end{block}
\end{frame}
%------------------------------------------------
\subsection{Statistics as similarity measurement}
% Defines the D_2 statistic (sum over all words w of length k of the product
% of the two word counts); slide 2 adds the normalization problem.
\begin{frame}
\frametitle{Measuring similarity}
% \[ ... \] replaces the plain-TeX $$ ... $$ display.
\[
  D_2=\sum_{w\in \mathcal{A}^k}X_wY_w
\]
where:
\begin{itemize}
\item $X_w$, $Y_w$: number of occurrences of $w$ in $A$, $B$
\item $\mathcal{A}$: alphabet
\item $k$: length of $w$
\end{itemize}
\uncover<2>{\begin{block}{Problem}
$D_2$ is not normalized $\Rightarrow$ results vary with different factors
\end{block}}
\end{frame}
%-----------------------------------------------
% Five-slide frame: the standardized statistic D2z appears on slide 2, the
% remarks follow one per slide from slide 3 on.
\begin{frame}
\frametitle{Measuring similarity}
Assume sequences are generated through a Markov chain
% \mathit keeps the multi-letter name D2z from being set as a product of
% variables; \operatorname (amsmath) sets Var upright with operator spacing.
\uncover<2->{\[
  \mathit{D2z}(A,B)=\frac{D_2(A,B)-E(D_2)}{\sqrt{\operatorname{Var}(D_2)}}
\]}%
\begin{itemize}
% Overlay specifications on \item replace \uncover{\item ...}; the covered
% items still reserve their space.
\item<3-> compared to five other measures, $\mathit{D2z}$ outperformed them
\item<4-> needs two parameters, $k$ and $r$
\item<5-> $E(D_2)$ and $\operatorname{Var}(D_2)$ calculated with Markov chain in mind
\end{itemize}
\end{frame}
%-----------------------------------------------
% Five-slide frame: expected value of D_2 under a Markov chain (MC) of order
% zero (slides 1-3) and of order 1 (slides 4-5).
\begin{frame}
\frametitle{Calculating the expected value}
\begin{block}{MC of order zero}
Probability of $A=B$ or\dots\\
\uncover<2->{Sum of background probabilities $f_a^A$, $f_a^B$ to the power of $k$}
\uncover<3->{\[
  E(D_2)=\left(\sum_{a\in \mathcal{A}}f_a^Af_a^B\right)^k
\]}
\end{block}
\uncover<4->{\begin{block}{MC of order 1}
% No forced line break here: the display below already starts a new line.
Sum of probabilities for $\lvert w\rvert=k$, under consideration of MC
\uncover<5>{\[
  \sum_{\lvert w\rvert=k}p^A(w_1)p^A(w|w_1)p^B(w_1)p^B(w|w_1)
\]}
\end{block}}
\end{frame}
%------------------------------------------------
\subsection{CVTree}
% Four-slide frame: the Hao dissimilarity (slide 2), the correlation C it is
% built from (slide 3), and three explanatory items (slide 4).
\begin{frame}
\frametitle{Constructing phylogenetic trees -- CVTree}
\begin{itemize}
\item considers $(k-2)$-th order MC estimated by $E^X_w$
\uncover<2->{\[
  \mathit{Hao}=\frac{1}{2}(1-C)
\]}
\uncover<3->{\[
  C=\frac{\sum_w\left(\frac{X_w-E_w^X}{E_w^X}\right)\left(\frac{Y_w-E_w^Y}{E_w^Y}\right)}{\sqrt{\sum_w\left(\frac{X_w-E_w^X}{E_w^X}\right)^2\sum_w\left(\frac{Y_w-E_w^Y}{E_w^Y}\right)^2}}
\]}
\item<4> Observations in composition vector
\item<4> subtraction of background ``noise'' through MC
\item<4> $C$ is cosine between vectors
\end{itemize}
\end{frame}
% Figure-only slide: the image bilder/CVTree.png scaled to the full line width.
\begin{frame}
  \begin{figure}
    \includegraphics[width=\linewidth]{bilder/CVTree.png}
    \caption{Computed phylogenetic tree through application of neighbor joining dissimilarity matrix}
  \end{figure}
\end{frame}
%--------------------------------------------------
% Di-nucleotide relative frequency rho_ab and the dissimilarity delta
% obtained by summing absolute differences of rho_ab.
\begin{frame}
\frametitle{Nucleotide frequency}
\begin{itemize}
\item Related approach to $\mathit{Hao}$
\item Consider di-nucleotide frequency
\[
  \rho_{ab}(A)=\frac{f_{ab}}{f_af_b}
\]
\item Can be extended to tri- and tetra-nucleotides
\item $l_p$ norm as dissimilarity measure
% NOTE(review): the summation index reads "ab in A" as in the original;
% confirm whether A is the sequence or the alphabet \mathcal{A}.
\[
  \delta(A,B)=\sum_{ab\in A}\lvert\rho_{ab}(A)-\rho_{ab}(B)\rvert
\]
\end{itemize}
\end{frame}
%------------------------------------------------
% \texorpdfstring supplies a plain-text version of the title for the PDF
% bookmarks, which cannot contain math.
\subsection{\texorpdfstring{$D_2^S$, $D_2^*$ and their normalization}{D2S, D2* and their normalization}}
% Four-slide frame: definition of D_2^S (slide 2), remark on D_2^* (slide 3)
% and a block of conclusions (slide 4).
\begin{frame}
\frametitle{$D_2^S$, $D_2^*$ and their normalization}
\begin{itemize}
\item For two normal random variables $X$ and $Y$, $XY/\sqrt{X^2+Y^2}$ is also normally distributed
\uncover<2->{\[
  D_2^S=\sum_{w\in \mathcal{A}^k}\frac{\widetilde{X}_w\widetilde{Y}_w}{\sqrt{\widetilde{X}_w^2+\widetilde{Y}_w^2}}
\]}%
\item<3-> $D_2^*$ utilizes that number of occurrences is approximately Poisson; mean and variance are the same
\uncover<4>{\begin{block}{Conclusions}
\begin{enumerate}
\item $D_2^S$ and $D_2^*$ have higher power than $D_2$
\item $D_2^*$ has highest power when $k$ equals \emph{motif} length
\item $D_2^*$ has higher power for short sequences
\end{enumerate}
but again both not normalized
\end{block}}
\end{itemize}
\end{frame}
%------------------------------------------------
% The subsection is declared before the frame instead of inside it: the frame
% has two slides, and sectioning commands belong between frames.
\subsection{Consideration of mismatches}
% Two-slide frame: normalized statistics d_2^S and d_2^* (slide 1) and the
% mismatch neighborhood of a word (slide 2).
\begin{frame}
\frametitle{Normalization and Neighborhood}
\begin{block}{Normalization}
\begin{itemize}
\item Normalization to $d_2^S$ and $d_2^*$
\item 0 when sequences are the same and close to one if anti-correlated
\item Now applicable to metagenomic data or varying types of sequences
\end{itemize}
\end{block}
%-----------------------------------------------
\uncover<2>{\begin{block}{Consideration of mismatches}
\begin{itemize}
\item instead of $w$ the statistics should consider the neighborhood $\varsigma(w)$
\item If $w' \in \varsigma(w)$, then $w'$ has a certain number of mismatches with $w$
\item Reverse complement can be included similarly
\item statistics can then be modified
\end{itemize}
\end{block}}
\end{frame}
%-----------------------------------
% Two-slide frame: test setup for the mismatch model (slide 1) and the
% conclusions drawn from it (slide 2).
\begin{frame}
\frametitle{Performance under the mismatch model}
\begin{block}{Test parameters}
\begin{itemize}
\item sequences from mouse embryo
\item positive and negative set maximum of 30\% repetitions
\item Dissimilarity was calculated
\item Threshold was applied
\item prediction of dissimilarity lower than threshold resulted in positives
\item Predictions were compared to real data
\item Testing with different parameters for $k$, $r$ and mismatch weight
\end{itemize}
\end{block}
\uncover<2>{\begin{block}{Test conclusions}
\begin{itemize}
% \mathit sets the multi-letter name as one word instead of H times a times o.
\item $\mathit{Hao}$ performed worse than $d_2^S$ and $d_2^*$
\item $d_2^S$ and $d_2^*$ performed best with mismatch weight of 0.05 and $k=4$
\item Overall $d_2^S$ achieved best results in testing
\end{itemize}
\end{block}}
\end{frame}
\subsection{Machine learning -- BH-SNE}
% Summary of the BH-SNE approach: k-mer signatures are embedded so that they
% can be inspected in scatter plots.
\begin{frame}
\frametitle{Machine learning -- BH-SNE}
\begin{itemize}
\item Applies Barnes--Hut and vantage point trees
\item Utilizes species-specific oligonucleotide signatures as $k$-mers
\item $k$-mers are represented as vectors in high-dimensional Euclidean space
\item Transformation enables human interpretation in scatter plots
\end{itemize}
\end{frame}
%------------------------------------------------
\begin{frame}[fragile] % The fragile option is needed because the slide uses \verb
\frametitle{Citation}
An example of the \verb|\cite| command to cite within the presentation:

% The tie (~) keeps the citation on the same line as the preceding word.
This statement requires citation~\cite{p1}.
\end{frame}
%------------------------------------------------
% Reference list, written by hand with thebibliography.
\begin{frame}
\frametitle{References}
% \footnotesize is a declaration (it takes no argument); it stays in effect
% until the end of the frame.
\footnotesize
\begin{thebibliography}{99} % "99" is the widest label used to align the entries
\bibitem[Smith, 2012]{p1} John Smith (2012)
\newblock Title of the publication
\newblock \emph{Journal Name} 12(3), 45--678.
\end{thebibliography}
\end{frame}
%------------------------------------------------
% Closing slide: a single centered line in the largest font size.
\begin{frame}
% \centering and \Huge are declarations scoped to this frame; they replace
% the plain-TeX \centerline and the argument-style use of \Huge.
\centering
\Huge The End
\end{frame}
%----------------------------------------------------------------------------------------
\end{document}