%\documentclass[mathserif]{beamer}
\documentclass[handout]{beamer}
%\usetheme{Goettingen}
%\usetheme{Warsaw}
\usetheme{Singapore}
%\usetheme{Frankfurt}
%\usetheme{Copenhagen}
%\usetheme{Szeged}
%\usetheme{Montpellier}
%\usetheme{CambridgeUS}
%\usecolortheme{}
%\setbeamercovered{transparent}
\usepackage[english, activeacute]{babel}
\usepackage[utf8]{inputenc}
\usepackage{amsmath, amssymb}
\usepackage{dsfont}
\usepackage{graphics}
\usepackage{cases}
\usepackage{graphicx}
\usepackage{pgf}
\usepackage{epsfig}
\usepackage{amssymb}
\usepackage{multirow}
\usepackage{amstext}
\usepackage[ruled,vlined,lined]{algorithm2e}
\usepackage{amsmath}
\usepackage{epic}
\usepackage{epsfig}
\usepackage{fontenc}
\usepackage{framed,color}
\usepackage{palatino, url, multicol}
%\algsetup{indent=2em}
\newcommand{\factorial}{\ensuremath{\mbox{\sc Factorial}}}
\newcommand{\BIGOP}[1]{\mathop{\mathchoice%
{\raise-0.22em\hbox{\huge $#1$}}%
{\raise-0.05em\hbox{\Large $#1$}}{\hbox{\large $#1$}}{#1}}}
\newcommand{\bigtimes}{\BIGOP{\times}}
\vspace{-0.5cm}
\title{Natural Language Processing \\ Sequence to Sequence Models, Attention and the Transformer}
\vspace{-0.5cm}
\author[Felipe Bravo Márquez]{\footnotesize
%\author{\footnotesize
\textcolor[rgb]{0.00,0.00,1.00}{Felipe Bravo-Marquez}}
\date{\today}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
\begin{frame}{Language Models and Language Generation}
\begin{scriptsize}
\begin{itemize}
\item Language modeling is the task of assigning a probability to sentences in a language.
\item Example: what is the probability of seeing the sentence ``the lazy dog barked loudly''?
\item The task can be formulated as predicting the probability of seeing a word conditioned on the previous words:
\begin{displaymath}
P(w_i | w_1, w_2, \cdots, w_{i-1}) = \frac{P(w_1, w_2, \cdots, w_{i-1}, w_i)}{P(w_1, w_2, \cdots, w_{i-1})}
\end{displaymath}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Language Models and Language Generation}
\begin{scriptsize}
\begin{itemize}
\item RNNs can be used to train language models by tying the output at time $i$ with its input at time $i+1$.
\item This network can be used to generate sequences of words or random sentences.
\item Generation process: predict a probability distribution over the first word conditioned on the start symbol, and draw a random word according to the predicted
distribution.
\item Then predict a probability distribution over the second word conditioned on the first, and so on, until predicting the end-of-sequence $</$s$>$ symbol.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Language Models and Language Generation}
\begin{scriptsize}
\begin{itemize}
\item After predicting a distribution over the next output symbols $P(t_i = k | t_{1:i-1})$, a token $t_i$ is chosen and its corresponding embedding vector is fed as the input to the next step.
\begin{figure}[h]
\includegraphics[scale = 0.25]{pics/generator.png}
\end{figure}
\item Teacher forcing: during \textbf{training} the generator is fed the ground-truth previous word, even if its own prediction puts only a small probability mass on it.
\item At \textbf{test time}, the generator may well have produced a different word at that step (a minimal generation sketch follows on the next slide).
\end{itemize}
\end{scriptsize}
\end{frame}
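\begin{frame}[fragile]{Generating from an RNN Language Model (Code Sketch)}
\begin{scriptsize}
A minimal NumPy sketch of the generation loop just described, assuming a toy vocabulary and randomly initialized parameters that stand in for a trained RNN language model (the function \texttt{next\_step} is an illustrative stand-in, not a real library call). During training, the same loop would feed the ground-truth previous token instead (teacher forcing).
\end{scriptsize}
{\tiny
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
vocab = ["<s>", "</s>", "the", "lazy", "dog", "barked"]
V, H = len(vocab), 8

# Toy parameters standing in for a trained RNN language model.
E = rng.normal(size=(V, H))      # input embeddings
W = rng.normal(size=(H, H))      # recurrent weights
O = rng.normal(size=(H, V))      # output projection

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def next_step(h, tok):
    """One (toy) RNN step: new state and P(next token)."""
    h = np.tanh(E[tok] + h @ W)
    return h, softmax(h @ O)

# Generation: start at <s>, sample until </s> or a length limit.
h, tok, out = np.zeros(H), vocab.index("<s>"), []
for _ in range(10):
    h, p = next_step(h, tok)
    tok = rng.choice(V, p=p)     # draw from P(t_i | t_{1:i-1})
    if vocab[tok] == "</s>":
        break
    out.append(vocab[tok])
print(out)
\end{verbatim}
}
\end{frame}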
\begin{frame}{Sequence to Sequence Problems}
\begin{scriptsize}
Nearly any task in NLP can be formulated as a sequence-to-sequence (or conditioned generation) task, i.e., generating an output sequence from an input one. Input and output sequences can have different lengths.
\begin{itemize}
\item Machine Translation: source language to target language.
\item Summarization: long text to short text.
\item Dialogue (chatbots): previous utterances to next utterance.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Conditioned Generation}
\begin{scriptsize}
\begin{itemize}
\item While using the RNN as a generator is a cute exercise for demonstrating its strength, the power of the RNN generator is truly revealed when moving to a conditioned generation or encoder-decoder framework.
\item Core idea: using two RNNs.
\item Encoder: One RNN is used to encode the source input into a vector $\overrightarrow{c}$.
\item Decoder: Another RNN is used to decode the encoder's output and generate the target output.
\item At each stage of the generation process the context vector $\overrightarrow{c}$ is concatenated to the input $\hat{t}_j$ and the concatenation is fed into the RNN.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Encoder Decoder Framework}
\begin{figure}[h]
\includegraphics[scale = 0.32]{pics/seqseq.png}
\end{figure}
\end{frame}
\begin{frame}{Conditioned Generation}
\begin{scriptsize}
\begin{itemize}
\item This setup is useful for mapping sequences of length $n$ to sequences of length $m$.
\item The encoder summarizes the source sentence as a vector $\vec{c}$.
\item The decoder RNN is then used to predict (using a language modeling objective) the target sequence words conditioned on the previously predicted words as well as the encoded sentence $\vec{c}$.
\item The encoder and decoder RNNs are trained jointly.
\item The supervision happens only for the decoder RNN, but the gradients are propagated all the way back to the encoder RNN (a minimal sketch of this setup follows on the next slide).
\end{itemize}
\end{scriptsize}
\end{frame}
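\begin{frame}[fragile]{Encoder-Decoder in NumPy (Code Sketch)}
\begin{scriptsize}
A minimal sketch of the encoder-decoder idea above, assuming simple (Elman) RNN cells and randomly initialized parameters; all names and sizes are illustrative. The encoder's final state plays the role of $\vec{c}$, and each decoder step is conditioned on the concatenation of the previous target embedding and $\vec{c}$.
\end{scriptsize}
{\tiny
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)
H, D = 6, 4                      # hidden and embedding sizes
Tv = 5                           # toy target vocabulary size

def rnn_cell(p, x, h):
    """Simple (Elman) RNN step: h' = tanh(x W + h U + b)."""
    return np.tanh(x @ p["W"] + h @ p["U"] + p["b"])

# Hypothetical, randomly initialized parameters.
enc = dict(W=rng.normal(size=(D, H)),
           U=rng.normal(size=(H, H)), b=np.zeros(H))
dec = dict(W=rng.normal(size=(D + H, H)),
           U=rng.normal(size=(H, H)), b=np.zeros(H))
Out = rng.normal(size=(H, Tv))   # decoder output projection

def encode(src_embs):
    """Run the encoder RNN; its final state is the context c."""
    h = np.zeros(H)
    for x in src_embs:
        h = rnn_cell(enc, x, h)
    return h

def decode_step(c, prev_emb, s):
    """One decoder step conditioned on [prev target emb; c]."""
    s = rnn_cell(dec, np.concatenate([prev_emb, c]), s)
    return s, s @ Out            # new state, vocabulary scores

src = rng.normal(size=(3, D))    # three source "word embeddings"
c = encode(src)
s, scores = decode_step(c, rng.normal(size=D), np.zeros(H))
print(scores.shape)              # (5,)
\end{verbatim}
}
\end{frame}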
\begin{frame}{Sequence to Sequence Training Graph}
\begin{figure}[h]
\includegraphics[scale = 0.35]{pics/seq2se2train.png}
\end{figure}
\end{frame}
\begin{frame}{Neural Machine Translation}
\begin{figure}[h]
\includegraphics[scale = 0.25]{pics/mt.png}
\end{figure}
\end{frame}
\begin{frame}{Machine Translation BLEU progress over time}
\begin{figure}[h]
\includegraphics[scale = 0.35]{pics/nmt_progress.png}
\end{figure}
[Edinburgh En-De WMT]
\footnotetext{source: \url{http://www.meta-net.eu/events/meta-forum-2016/slides/09_sennrich.pdf}}
\end{frame}
\begin{frame}{Decoding Approaches}
\begin{scriptsize}
\begin{itemize}
\item The decoder aims to generate the output sequence with maximal score (or maximal probability), i.e., such that $\prod_{i=1}^{n}P(\hat{t}_i | \hat{t}_{1:i-1})$ (equivalently, the sum of log-probabilities) is maximized.
\item The non-Markovian nature of the RNN means that the probability function cannot be decomposed into factors that allow for exact search using standard dynamic programming.
\item Exact search: finding the optimum sequence requires evaluating every possible sequence (computationally prohibitive).
\item Thus, it only makes sense to solve the optimization problem above approximately.
\item Greedy search: choose the highest scoring prediction (word) at each step.
\item This may result in a sub-optimal overall probability: a greedily chosen prefix may only be followed by low-probability continuations.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Greedy Search}
\begin{figure}[h]
\includegraphics[scale = 0.3]{pics/greedysearch.png}
\end{figure}
\footnotetext{Source: \cite{cho2015natural}}
\end{frame}
\begin{frame}{Beam Search}
\begin{scriptsize}
\begin{itemize}
\item Beam search interpolates between the exact search and the greedy search by changing the size $K$ of hypotheses maintained throughout the search procedure \cite{cho2015natural}.
\item The Beam search algorithm works in stages.
\item We first pick the $K$ starting words with the highest probability.
\item At each step, each candidate sequence is expanded with all possible next words.
\item Each candidate is scored.
\item The $K$ sequences with the highest probabilities are retained and all other candidates are pruned.
\item The search process can halt for each candidate separately, either by reaching a maximum length, by reaching an end-of-sequence token, or by reaching a threshold likelihood.
\item The sentence with the highest overall probability is selected (a minimal sketch follows on the next slide).
\end{itemize}
\footnotetext{More info at: \url{https://machinelearningmastery.com/beam-search-decoder-natural-language-processing/}}
\end{scriptsize}
\end{frame}
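\begin{frame}[fragile]{Beam Search (Code Sketch)}
\begin{scriptsize}
A minimal sketch of beam search over a toy scoring function; \texttt{log\_prob\_next} is a hypothetical stand-in for the decoder's next-token log-probabilities. It keeps the $K$ best partial hypotheses, sets aside those that reach \texttt{</s>}, and returns the highest-scoring sequence.
\end{scriptsize}
{\tiny
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(2)
vocab = ["</s>", "a", "b", "c"]
table = rng.normal(size=(8, len(vocab)))   # toy "model"

def log_prob_next(prefix):
    """Hypothetical stand-in: log P(next token | prefix)."""
    z = table[len(prefix) % 8]
    return z - np.log(np.exp(z).sum())

def beam_search(K=3, max_len=6):
    beams = [([], 0.0)]          # (sequence, cumulative log-prob)
    finished = []
    for _ in range(max_len):
        cand = []
        for seq, score in beams:
            for tok, lp in enumerate(log_prob_next(seq)):
                cand.append((seq + [vocab[tok]], score + lp))
        # keep the K best; completed hypotheses are set aside
        cand.sort(key=lambda c: c[1], reverse=True)
        beams = []
        for seq, score in cand[:K]:
            target = finished if seq[-1] == "</s>" else beams
            target.append((seq, score))
        if not beams:
            break
    finished += beams            # include unfinished hypotheses
    return max(finished, key=lambda c: c[1])

print(beam_search())
\end{verbatim}
}
\end{frame}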
\begin{frame}{Conditioned Generation with Attention}
\begin{scriptsize}
\begin{itemize}
\item In the encoder-decoder networks the input sentence is encoded into a single vector, which is then used as a conditioning context for an RNN-generator.
\item This architecture forces the encoded vector $\vec{c}$ to contain all the information required for generation.
\item It doesn't work well for long sentences!
\item It also requires the generator to be able to extract this information from the fixed-length vector.
\item ``You can't cram the meaning of a whole \%\&!\$\# sentence into a single \$\&!\#* vector!'' -Raymond Mooney
\item This architecture can be substantially improved (in many cases) by the addition of an attention mechanism.
\item The attention mechanism attempts to solve this problem by allowing the decoder to ``look back'' at the encoder's hidden states based on its current state.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Conditioned Generation with Attention}
\begin{scriptsize}
\begin{itemize}
\item The input sentence (a length $n$ input sequence $\vec{x}_{1:n}$) is encoded using a biRNN as a sequence of vectors $\vec{c}_{1:n}$.
\item The decoder uses a soft attention mechanism in order to decide which parts of the encoded input it should focus on.
\item At each stage $j$ the decoder sees a weighted average of the vectors $\vec{c}_{1:n}$ , where the attention weights ($\vec{\alpha}^j$) are chosen by the attention mechanism.
\begin{displaymath}
\vec{c}^j = \sum_{i=1}^{n} \vec{\alpha}_{[i]}^{j}\cdot \vec{c}_i
\end{displaymath}
\item The elements of $\vec{\alpha}^j$ are all positive and sum to one.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Conditioned Generation with Attention}
\begin{scriptsize}
\begin{itemize}
\item Unnormalized attention weights ($\bar{\alpha}_{[i]}^j$) are produced taking into account the decoder state at time $j$ ($\vec{s}_j$) and each of the vectors $\vec{c}_i$.
\item They can be obtained in various ways, basically any differentiable function returning a scalar out of two vectors $\vec{s}_j$ and $\vec{c}_i$ could be employed.
\item The simplest approach is a dot product: $\bar{\alpha}_{[i]}^j = \vec{s}_j \cdot \vec{c}_i$.
\item The one we will use in these slides is Additive attention, which uses a Multilayer Perceptron: $\bar{\alpha}_{[i]}^j = MLP^{att}([\vec{s}_j;\vec{c}_i]) = \vec{v} \cdot \operatorname{tanh}([\vec{s}_j;\vec{c}_i]U +\vec{b})$
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Conditioned Generation with Attention}
\begin{scriptsize}
\begin{itemize}
\item These unnormalized weights are then normalized into a probability distribution using the softmax function.
\begin{figure}[h]
\includegraphics[scale = 0.35]{pics/atten_formula.png}
\end{figure}
\item The encoder, decoder, and attention mechanism are all trained jointly so that they play well with each other (a small numerical sketch of one attention step follows on the next slide).
\end{itemize}
\end{scriptsize}
\end{frame}
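\begin{frame}[fragile]{Additive Attention (Code Sketch)}
\begin{scriptsize}
A small numerical sketch of one attention step with the additive (MLP) scoring introduced above; $U$, $\vec{v}$, and $\vec{b}$ are hypothetical, randomly initialized parameters, and the encoder states and decoder state are random vectors. It computes the unnormalized scores, softmax-normalizes them, and forms the weighted average $\vec{c}^j$.
\end{scriptsize}
{\tiny
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(3)
n, d = 4, 6                      # source length, vector size

C = rng.normal(size=(n, d))      # biRNN encoder states c_1..c_n
s_j = rng.normal(size=d)         # current decoder state

# Hypothetical additive-attention parameters (the MLP^att above).
U = rng.normal(size=(2 * d, 8))
b = np.zeros(8)
v = rng.normal(size=8)

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

# Unnormalized scores: v . tanh([s_j; c_i] U + b) for every i.
scores = np.array([v @ np.tanh(np.concatenate([s_j, c_i]) @ U + b)
                   for c_i in C])
alpha = softmax(scores)          # attention weights, sum to one
context = alpha @ C              # c^j = sum_i alpha_i * c_i
print(alpha.round(3), context.shape)
\end{verbatim}
}
\end{frame}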
\begin{frame}{Attention}
\begin{figure}[h]
\includegraphics[scale = 0.35]{pics/encdecattention.png}
\end{figure}
\end{frame}
\begin{frame}{Conditioned Generation with Attention}
\begin{scriptsize}
The entire sequence-to-sequence generation with attention is given by:
\begin{figure}[h]
\includegraphics[scale = 0.3]{pics/attentionformula.png}
\end{figure}
\end{scriptsize}
\end{frame}
\begin{frame}{Conditioned Generation with Attention}
\begin{scriptsize}
\begin{itemize}
\item Why use the biRNN encoder to translate the conditioning sequence $\vec{x}_{1:n}$ into the context vectors $\vec{c}_{1:n}$?
\item Why don't we just attend directly to the inputs (word embeddings), i.e., $MLP^{att}([\vec{s}_j;\vec{x}_i])$?
\item We could, but we get important benefits from the encoding process.
\item First, the biRNN vectors $\vec{c}_i$ represent the items $\vec{x}_i$ in their sentential context.
\item Sentential context: a window focused around the input item $\vec{x}_i$ and not the item itself.
\item Second, by having a trainable encoding component that is trained jointly with the decoder, the encoder and decoder evolve together.
\item Hence, the network can learn to encode relevant properties of the input that are useful for decoding, and that may not be directly present in the source sequence $\vec{x}_{1:n}$.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Attention and Word Alignments}
\begin{scriptsize}
\begin{itemize}
\item In the context of machine translation, one can think of $MLP^{att}$ as computing a soft alignment between the current decoder state $\vec{s}_j$ (capturing the recently produced foreign words) and each of the source sentence components $\vec{c}_i$.
\end{itemize}
\begin{figure}[h]
\includegraphics[scale = 0.28]{pics/attention-alignment.png}
\caption{Source: \cite{cho2015describing}}
\end{figure}
\end{scriptsize}
\end{frame}
\begin{frame}{Other types of Attention}
\begin{scriptsize}
\begin{figure}[h]
\includegraphics[scale = 0.32]{pics/types_of_attention.png}
\caption{Source: \url{https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html}}
\end{figure}
\end{scriptsize}
\end{frame}
\begin{frame}{What's Wrong with RNNs?}
\begin{scriptsize}
\begin{itemize}
\item Given what we just learned above, it would seem like attention solves all the problems with RNNs and encoder-decoder architectures\footnote{The following material is based on: \url{http://mlexplained.com/2017/12/29/attention-is-all-you-need-explained/}}.
\item There are a few shortcomings of RNNs that another architecture called the \textbf{Transformer} tries to address.
\item The Transformer discards the recurrent component of the encoder-decoder architecture and relies purely on attention mechanisms \cite{vaswani2017attention}.
\item When we process a sequence using RNNs, each hidden state depends on the previous hidden state.
\item This becomes a major pain point on GPUs: GPUs have a lot of computational capability and they hate having to wait for data to become available.
\item Even with technologies like CuDNN, RNNs are painfully inefficient and slow on the GPU.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Dependencies in neural machine translations}
\begin{scriptsize}
In essence, there are three kinds of dependencies in neural machine translation:
\begin{enumerate}
\item Dependencies between the input and output tokens.
\item Dependencies between the input tokens themselves.
\item Dependencies between the output tokens themselves.
\end{enumerate}
The traditional attention mechanism largely solved the first dependency by giving the decoder access to the entire input sequence. The second and third dependencies were addressed by the RNNs.
\end{scriptsize}
\end{frame}
\begin{frame}{The Transformer}
\begin{scriptsize}
\begin{itemize}
\item The novel idea of the Transformer is to extend this mechanism to the processing of the input and output sentences as well.
\item The RNN processes input sequences sequentially.
\item The Transformer, on the other hand, allows the encoder and decoder to see the entire input sequence all at once.
\item This is done using attention.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{The Key Component: Multi-Head Attention}
\begin{scriptsize}
\begin{itemize}
\item The attention mechanism in the Transformer is interpreted as a way of computing the relevance of a set of \textbf{values} (information) based on some \textbf{keys} and \textbf{queries}.
\item The attention mechanism is used as a way for the model to \textbf{focus on relevant information} based on what it is currently processing.
\item In the RNN encoder-decoder architecture with attention:
\begin{enumerate}
\begin{scriptsize}
\item The attention weights represented the relevance of the encoder hidden states (values) when processing the decoder state (query).
\item These weights were calculated based on the encoder hidden states (keys) and the decoder hidden state (query).
\end{scriptsize}
\end{enumerate}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{The Key Component: Multi-Head Attention}
\begin{figure}[h]
\includegraphics[scale = 0.28]{pics/attention_concept.png}
\caption{In this example, the query is the word being decoded (a word meaning ``dog''), and both the keys and values are the source sentence. The attention score represents the relevance; in this case it is large for the word ``dog'' and small for the others.}
\end{figure}
\end{frame}
\begin{frame}{The Key Component: Multi-Head Attention}
\begin{scriptsize}
\begin{itemize}
\item When we think of attention this way, we can see that the keys, values, and queries could be anything.
\item They could even be the same.
\item For instance, the keys, values, and queries could all be the input embeddings (self-attention).
\item If we only computed a single attention weighted sum of the values, it would be difficult to capture various different aspects of the input.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{The Key Component: Multi-Head Attention}
\begin{scriptsize}
\begin{itemize}
\item For instance, in the sentence ``I like cats more than dogs'', you might want to capture the fact that the sentence compares two entities, while also retaining the actual entities being compared.
\item To solve this problem the Transformer uses the Multi-Head Attention block.
\item This block computes multiple attention weighted sums instead of a single attention pass over the values.
\item Hence the name ``Multi-Head'' Attention.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{The Key Component: Multi-Head Attention}
\begin{scriptsize}
\begin{itemize}
\item To learn diverse representations, the Multi-Head Attention applies different linear transformations to the values, keys, and queries for each “head” of attention.
\begin{figure}[h]
\includegraphics[scale = 0.48]{pics/multi_head_attention.png}
\end{figure}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{The Key Component: Multi-Head Attention}
\begin{scriptsize}
\begin{itemize}
\item A single attention head applies its own linear transformation to the input queries, keys, and values.
\item It then computes the attention score between each query and key.
\item The attention scores are used to weight the values and sum them up.
\item The Multi-Head Attention block simply runs multiple such heads in parallel, concatenates their outputs, and then applies a single linear transformation.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Scaled Dot Product Attention}
\begin{scriptsize}
\begin{itemize}
\item Transformer uses a particular form of attention called the “Scaled Dot-Product Attention”:
\begin{displaymath}
\text{Attention}(Q,K,V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
\end{displaymath}
\item where $Q$ is the matrix of queries packed together and $K$ and $V$ are the matrices of keys and values packed together.
\item $d_k$ denotes the dimensionality of the queries and keys.
\item The scaling by $\sqrt{d_k}$ rescales the dot products between queries and keys, since dot products tend to grow with the dimensionality (a NumPy sketch follows on the next slide).
\end{itemize}
\end{scriptsize}
\end{frame}
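\begin{frame}[fragile]{Scaled Dot-Product and Multi-Head Attention (Code Sketch)}
\begin{scriptsize}
A NumPy sketch of the formula above, plus a minimal multi-head wrapper in the spirit of the earlier slides; the sizes and random projection matrices are illustrative choices, not those of the paper.
\end{scriptsize}
{\tiny
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(4)

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def sdp_attention(Q, K, V):
    """softmax(Q K^T / sqrt(d_k)) V for packed matrices."""
    d_k = Q.shape[-1]
    return softmax(Q @ K.T / np.sqrt(d_k)) @ V

def multi_head(Q, K, V, heads, Wo):
    """Project per head, attend in parallel, concat, project."""
    outs = [sdp_attention(Q @ Wq, K @ Wk, V @ Wv)
            for Wq, Wk, Wv in heads]
    return np.concatenate(outs, axis=-1) @ Wo

d_model, d_k, h, n = 16, 4, 4, 5          # illustrative sizes
heads = [(rng.normal(size=(d_model, d_k)),
          rng.normal(size=(d_model, d_k)),
          rng.normal(size=(d_model, d_k))) for _ in range(h)]
Wo = rng.normal(size=(h * d_k, d_model))

X = rng.normal(size=(n, d_model))         # self-attention: Q = K = V
print(multi_head(X, X, X, heads, Wo).shape)   # (5, 16)
\end{verbatim}
}
\end{frame}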
\begin{frame}{The Transformer}
\begin{figure}[h]
\includegraphics[scale = 0.29]{pics/transformer.png}
\end{figure}
\end{frame}
\begin{frame}{The Transformer}
\begin{scriptsize}
\begin{itemize}
\item The Transformer still uses the basic encoder-decoder design of RNN neural machine translation systems.
\item The left-hand side is the encoder, and the right-hand side is the decoder.
\item The initial inputs to the encoder are the embeddings of the input sequence.
\item The initial inputs to the decoder are the embeddings of the outputs up to that point.
\item The encoder and decoder are composed of $N$ blocks (where $N = 6$ for both networks).
\item These blocks are composed of smaller blocks as well.
\item Let's look at each block in further detail.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{The Encoder}
\begin{scriptsize}
\begin{itemize}
\item The encoder contains self-attention layers.
\item In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder.
\item Each position in the encoder can attend to all positions in the previous layer of the encoder.
\item The encoder is composed of two blocks (which we will call sub-layers to distinguish from the $N$ blocks composing the encoder and decoder).
\item One is the Multi-Head Attention sub-layer over the inputs, mentioned above.
\item The other is a simple feed-forward network.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{The Encoder}
\begin{scriptsize}
\begin{itemize}
\item Between each sub-layer, there is a residual connection followed by a layer normalization.
\item A residual connection simply adds the input of a sub-layer to its output, and is a way of making deep networks easier to train.
\item Layer normalization is a normalization method in deep learning similar to batch normalization (a minimal sketch of this ``Add \& Norm'' step follows on the next slide).
\end{itemize}
\end{scriptsize}
\end{frame}
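\begin{frame}[fragile]{Residual Connection and Layer Normalization (Code Sketch)}
\begin{scriptsize}
A minimal sketch of the step just described: the sub-layer output is added to its input (residual connection) and the result is layer-normalized. The feed-forward sub-layer is one common form; the learned gain and bias of layer normalization are omitted for brevity.
\end{scriptsize}
{\tiny
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(5)

def layer_norm(x, eps=1e-6):
    """Normalize each position over its features (gain/bias omitted)."""
    mu = x.mean(axis=-1, keepdims=True)
    sd = x.std(axis=-1, keepdims=True)
    return (x - mu) / (sd + eps)

def feed_forward(x, W1, b1, W2, b2):
    """Position-wise feed-forward sub-layer: relu(x W1 + b1) W2 + b2."""
    return np.maximum(0.0, x @ W1 + b1) @ W2 + b2

d_model, d_ff, n = 8, 32, 5
W1, b1 = rng.normal(size=(d_model, d_ff)), np.zeros(d_ff)
W2, b2 = rng.normal(size=(d_ff, d_model)), np.zeros(d_model)

x = rng.normal(size=(n, d_model))
# Residual connection followed by layer normalization:
y = layer_norm(x + feed_forward(x, W1, b1, W2, b2))
print(y.shape)                    # (5, 8); each row has mean ~0, std ~1
\end{verbatim}
}
\end{frame}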
\begin{frame}{The Encoder}
\begin{figure}[h]
\includegraphics[scale = 0.39]{pics/transformerencoder.png}
\end{figure}
\end{frame}
\begin{frame}{The Encoder}
\begin{scriptsize}
\begin{itemize}
\item What each encoder block is doing is actually just a bunch of matrix multiplications followed by a couple of element-wise transformations.
\item This is why the Transformer is so fast: everything is just parallelizable matrix multiplications.
\item The point is that by stacking these transformations on top of each other, we can create a very powerful network.
\item The core of this is the attention mechanism which modifies and attends over a wide range of information.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{The Decoder}
\begin{scriptsize}
\begin{itemize}
\item The decoder has ``encoder-decoder attention'' layers in which the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder.
\item This allows every position in the decoder to attend over all positions in the input sequence.
\item This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models.
\item Self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position.
\item This plays a similar role to the decoder hidden state in RNN machine translation architectures.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{The Decoder}
\begin{scriptsize}
\begin{itemize}
\item When we train the Transformer, we want to process the whole target sentence at once (all positions in parallel).
\item If we did this while giving the decoder access to the entire target sentence, the model could simply copy the target sentence (in other words, it would not need to learn anything).
\item To prevent this from happening, the decoder masks the ``future'' tokens when decoding a certain word.
\item The masking is done by setting to $-\infty$ all values in the input of the softmax that correspond to illegal connections.
\item This is why the ``multi-head attention blocks'' in the decoder are referred to as ``masked'': the inputs to the decoder from future time steps are masked (a small sketch follows on the next slide).
\end{itemize}
\end{scriptsize}
\end{frame}
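\begin{frame}[fragile]{Masked Self-Attention (Code Sketch)}
\begin{scriptsize}
A small sketch of the masking step: scores at positions $j > i$ are set to $-\infty$ before the softmax, so each position can only attend to itself and earlier positions. The queries and keys here are random toy matrices.
\end{scriptsize}
{\tiny
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(6)

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

n, d_k = 4, 8
Q, K = rng.normal(size=(n, d_k)), rng.normal(size=(n, d_k))

scores = Q @ K.T / np.sqrt(d_k)
# Entry (i, j) with j > i is an "illegal" (future) connection.
mask = np.triu(np.ones((n, n), dtype=bool), k=1)
scores[mask] = -np.inf

A = softmax(scores)               # each row is a valid distribution
print(A.round(2))                 # upper triangle is exactly 0
\end{verbatim}
}
\end{frame}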
\begin{frame}{The Decoder}
\begin{figure}[h]
\includegraphics[scale = 0.29]{pics/transformerdecoder.png}
\end{figure}
\end{frame}
\begin{frame}{Positional Encodings}
\begin{scriptsize}
\begin{itemize}
\item Unlike recurrent networks, the multi-head attention network cannot naturally make use of the position of the words in the input sequence.
\item Without positional encodings, the output of the multi-head attention network would be the same for the sentences ``I like cats more than dogs'' and ``I like dogs more than cats''.
\item Positional encodings explicitly encode the relative/absolute positions of the inputs as vectors and are then added to the input embeddings.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Positional Encodings}
\begin{scriptsize}
\begin{itemize}
\item The paper uses the following equations to compute the positional encodings:
\begin{align*}
PE_{(pos,2i)} &= \sin\left(pos/10000^{2i/d_{model}}\right)\\
PE_{(pos,2i+1)} &= \cos\left(pos/10000^{2i/d_{model}}\right)
\end{align*}
\item where $pos$ is the position and $i$ is the dimension.
\item Basically, each dimension of the positional encoding is a sinusoid with a different frequency (a NumPy sketch follows on the next slide).
\end{itemize}
\end{scriptsize}
\end{frame}
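\begin{frame}[fragile]{Positional Encodings (Code Sketch)}
\begin{scriptsize}
A NumPy sketch of the equations above; \texttt{max\_len} and \texttt{d\_model} are illustrative values, and an even \texttt{d\_model} is assumed. The resulting matrix is added to the input embeddings.
\end{scriptsize}
{\tiny
\begin{verbatim}
import numpy as np

def positional_encoding(max_len, d_model):
    """PE[pos, 2i] = sin(pos / 10000^(2i/d_model)); cos for odd dims."""
    pos = np.arange(max_len)[:, None]          # (max_len, 1)
    i = np.arange(d_model // 2)[None, :]       # (1, d_model/2)
    angles = pos / np.power(10000.0, 2 * i / d_model)
    pe = np.zeros((max_len, d_model))
    pe[:, 0::2] = np.sin(angles)
    pe[:, 1::2] = np.cos(angles)
    return pe

pe = positional_encoding(max_len=50, d_model=16)
print(pe.shape)        # (50, 16) -- added to the input embeddings
\end{verbatim}
}
\end{frame}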
\begin{frame}{Conclusions}
\begin{scriptsize}
\begin{itemize}
\item The Transformer achieves better BLEU scores than previous state-of-the-art models for English-to-German and English-to-French translation at a fraction of the training cost.
\begin{figure}[h]
\includegraphics[scale = 0.29]{pics/transformerresults.png}
\end{figure}
\item The Transformer is a powerful and efficient alternative to recurrent neural networks, modeling dependencies using only attention mechanisms.
\item A very illustrative blog post about the Transformer: \url{http://jalammar.github.io/illustrated-transformer/}.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}
\frametitle{Questions?}
%\vspace{1.5cm}
\begin{center}\LARGE Thanks for your Attention!\\ \end{center}
\end{frame}
\begin{frame}[allowframebreaks]\scriptsize
\frametitle{References}
\bibliography{bio}
\bibliographystyle{apalike}
%\bibliographystyle{flexbib}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}