
Commit

fix
lena-voita committed Jun 9, 2020
1 parent 5da56cb commit c2eca49
Showing 6 changed files with 45 additions and 24 deletions.
4 changes: 2 additions & 2 deletions _data/papers/2018.yml
@@ -5,7 +5,7 @@
year: 2018
img: ctx_anaphora-min.png
title: "Context-Aware Neural Machine Translation Learns Anaphora Resolution"
authors: "<u>Elena Voita</u>, Pavel Serdyukov, Rico Sennrich and Ivan Titov"
authors: "<u>Elena Voita</u>, Pavel Serdyukov, Rico Sennrich, Ivan Titov"
doc-url: //aclweb.org/anthology/P18-1117/
arxiv: //arxiv.org/abs/1805.10163
appendix: //anthology.aclweb.org/attachments/P/P18/P18-1117.Notes.pdf
@@ -36,7 +36,7 @@
year: 2018
img: blank.png
title: "A Large-Scale Test Set for the Evaluation of Context-Aware Pronoun Translation in Neural Machine Translation"
authors: "Mathias M&uuml;ller, Annette Rios, <u>Elena Voita</u> and Rico Sennrich"
authors: "Mathias M&uuml;ller, Annette Rios, <u>Elena Voita</u>, Rico Sennrich"
doc-url: //aclweb.org/anthology/W18-6307
arxiv:
url: "http://www.aclweb.org/anthology/W18-6307"
18 changes: 10 additions & 8 deletions _data/papers/2019.yml
@@ -10,6 +10,7 @@
doc-url:
conf: acl20.png
url: "https://arxiv.org/abs/1910.13267"
abstract: "Subword segmentation is widely used to address the open vocabulary problem in machine translation. The dominant approach to subword segmentation is Byte Pair Encoding (BPE), which keeps the most frequent words intact while splitting the rare ones into multiple tokens. While multiple segmentations are possible even with the same vocabulary, BPE splits words into unique sequences; this may prevent a model from better learning the compositionality of words and being robust to segmentation errors. So far, the only way to overcome this BPE imperfection, its deterministic nature, was to create another subword segmentation algorithm (Kudo, 2018). In contrast, we show that BPE itself incorporates the ability to produce multiple segmentations of the same word. We introduce BPE-dropout - simple and effective subword regularization method based on and compatible with conventional BPE. It stochastically corrupts the segmentation procedure of BPE, which leads to producing multiple segmentations within the same fixed BPE framework. Using BPE-dropout during training and the standard BPE during inference improves translation quality up to 3 BLEU compared to BPE and up to 0.9 BLEU compared to the previous subword regularization."

- layout: paper
paper-type: inproceedings
@@ -21,6 +22,7 @@
doc-url:
conf: nips19.png
url: "https://arxiv.org/abs/1911.00176"
abstract: "The dominant approach to sequence generation is to produce a sequence in some predefined order, e.g. left to right. In contrast, we propose a more general model that can generate the output sequence by inserting tokens in any arbitrary order. Our model learns decoding order as a result of its training procedure. Our experiments show that this model is superior to fixed order models on a number of sequence generation tasks, such as Machine Translation, Image-to-LaTeX and Image Captioning."

-
layout: paper
@@ -29,10 +31,11 @@
year: 2019
img: emnlp19_evolution-min.png
title: "The Bottom-up Evolution of Representations in the Transformer: A Study with Machine Translation and Language Modeling Objectives"
authors: "<u>Elena Voita</u>, Rico Sennrich and Ivan Titov"
authors: "<u>Elena Voita</u>, Rico Sennrich, Ivan Titov"
doc-url:
conf: emnlp19.png
url: "http://arxiv.org/abs/1909.01380"
abstract: "We seek to understand how the representations of individual tokens and the structure of the learned feature space evolve between layers in deep neural networks under different learning objectives. We focus on the Transformers for our analysis as they have been shown effective on various tasks, including machine translation (MT), standard left-to-right language models (LM) and masked language modeling (MLM). Previous work used black-box probing tasks to show that the representations learned by the Transformer differ significantly depending on the objective. In this work, we use canonical correlation analysis and mutual information estimators to study how information flows across Transformer layers and how this process depends on the choice of learning objective. For example, as you go from bottom to top layers, information about the past in left-to-right language models gets vanished and predictions about the future get formed. In contrast, for MLM, representations initially acquire information about the context around the token, partially forgetting the token identity and producing a more generalized token representation. The token identity then gets recreated at the top MLM layers."

-
layout: paper
@@ -41,9 +44,10 @@
year: 2019
img: emnlp19_ctx-min.png
title: "Context-Aware Monolingual Repair for Neural Machine Translation"
authors: "<u>Elena Voita</u>, Rico Sennrich and Ivan Titov"
authors: "<u>Elena Voita</u>, Rico Sennrich, Ivan Titov"
conf: emnlp19.png
url: "http://arxiv.org/abs/1909.01383"
abstract: "Modern sentence-level NMT systems often produce plausible translations of isolated sentences. However, when put in context, these translations may end up being inconsistent with each other. We propose a monolingual DocRepair model to correct inconsistencies between sentence-level translations. DocRepair performs automatic post-editing on a sequence of sentence-level translations, refining translations of sentences in context of each other. For training, the DocRepair model requires only monolingual document-level data in the target language. It is trained as a monolingual sequence-to-sequence model that maps inconsistent groups of sentences into consistent ones. The consistent groups come from the original training data; the inconsistent groups are obtained by sampling round-trip translations for each isolated sentence. We show that this approach successfully imitates inconsistencies we aim to fix: using contrastive evaluation, we show large improvements in the translation of several contextual phenomena in an English-Russian translation task, as well as improvements in the BLEU score. We also conduct a human evaluation and show a strong preference of the annotators to corrected translations over the baseline ones. Moreover, we analyze which discourse phenomena are hard to capture using monolingual data only."

-
layout: paper
@@ -52,16 +56,15 @@
year: 2019
img: acl19_heads-min.png
title: "Analyzing Multi-Head Self-Attention: Specialized Heads Do the Heavy Lifting, the Rest Can Be Pruned"
authors: "<u>Elena Voita</u>, David Talbot, Fedor Moiseev, Rico Sennrich and Ivan Titov"
authors: "<u>Elena Voita</u>, David Talbot, Fedor Moiseev, Rico Sennrich, Ivan Titov"
doc-url:
conf: acl19.png
arxiv: //arxiv.org/abs/1905.09418
url: "http://www.aclweb.org/anthology/P19-1580"
appendix:
booktitle: "Proceedings of ACL"
slides:
abstract: >
Multi-head self-attention is a key component of the Transformer, a state-of-the-art architecture for neural machine translation. In this work we evaluate the contribution made by individual attention heads in the encoder to the overall performance of the model and analyze the roles played by them. We find that the most important and confident heads play consistent and often linguistically-interpretable roles. When pruning heads using a method based on stochastic gates and a differentiable relaxation of the L0 penalty, we observe that specialized heads are last to be pruned. Our novel pruning method removes the vast majority of heads without seriously affecting performance. For example, on the English-Russian WMT dataset, pruning 38 out of 48 encoder heads results in a drop of only 0.15 BLEU.
abstract: "Multi-head self-attention is a key component of the Transformer, a state-of-the-art architecture for neural machine translation. In this work we evaluate the contribution made by individual attention heads in the encoder to the overall performance of the model and analyze the roles played by them. We find that the most important and confident heads play consistent and often linguistically-interpretable roles. When pruning heads using a method based on stochastic gates and a differentiable relaxation of the L0 penalty, we observe that specialized heads are last to be pruned. Our novel pruning method removes the vast majority of heads without seriously affecting performance. For example, on the English-Russian WMT dataset, pruning 38 out of 48 encoder heads results in a drop of only 0.15 BLEU."
bibtex: >
}
@@ -73,16 +76,15 @@
year: 2019
img: acl19_ctx_scheme-min.png
title: "When a Good Translation is Wrong in Context: Context-Aware Machine Translation Improves on Deixis, Ellipsis, and Lexical Cohesion"
authors: "<u>Elena Voita</u>, Rico Sennrich and Ivan Titov"
authors: "<u>Elena Voita</u>, Rico Sennrich, Ivan Titov"
doc-url:
conf: acl19.png
arxiv: //arxiv.org/abs/1905.05979
url: "http://www.aclweb.org/anthology/P19-1116"
appendix:
booktitle: "Proceedings of ACL"
slides:
abstract: >
Though machine translation errors caused by the lack of context beyond one sentence have long been acknowledged, the development of context-aware NMT systems is hampered by several problems. Firstly, standard metrics are not sensitive to improvements in consistency in document-level translations. Secondly, previous work on context-aware NMT assumed that the sentence-aligned parallel data consisted of complete documents while in most practical scenarios such document-level data constitutes only a fraction of the available parallel data. To address the first issue, we perform a human study on an English-Russian subtitles dataset and identify deixis, ellipsis and lexical cohesion as three main sources of inconsistency. We then create test sets targeting these phenomena. To address the second shortcoming, we consider a set-up in which a much larger amount of sentence-level data is available compared to that aligned at the document level. We introduce a model that is suitable for this scenario and demonstrate major gains over a context-agnostic baseline on our new benchmarks without sacrificing performance as measured with BLEU.
abstract: "Though machine translation errors caused by the lack of context beyond one sentence have long been acknowledged, the development of context-aware NMT systems is hampered by several problems. Firstly, standard metrics are not sensitive to improvements in consistency in document-level translations. Secondly, previous work on context-aware NMT assumed that the sentence-aligned parallel data consisted of complete documents while in most practical scenarios such document-level data constitutes only a fraction of the available parallel data. To address the first issue, we perform a human study on an English-Russian subtitles dataset and identify deixis, ellipsis and lexical cohesion as three main sources of inconsistency. We then create test sets targeting these phenomena. To address the second shortcoming, we consider a set-up in which a much larger amount of sentence-level data is available compared to that aligned at the document level. We introduce a model that is suitable for this scenario and demonstrate major gains over a context-agnostic baseline on our new benchmarks without sacrificing performance as measured with BLEU."
bibtex: >
}
1 change: 1 addition & 0 deletions _data/papers/2020.yml
@@ -10,4 +10,5 @@
doc-url:
conf: no_conf.png
url: "https://arxiv.org/abs/2003.12298"
abstract: "To measure how well pretrained representations encode some linguistic property, it is common to use accuracy of a probe, i.e. a classifier trained to predict the property from the representations. Despite widespread adoption of probes, differences in their accuracy fail to adequately reflect differences in representations. For example, they do not substantially favour pretrained representations over randomly initialized ones. Analogously, their accuracy can be similar when probing for genuine linguistic labels and probing for random synthetic tasks. To see reasonable differences in accuracy with respect to these random baselines, previous work had to constrain either the amount of probe training data or its model size. Instead, we propose an alternative to the standard probes, information-theoretic probing with minimum description length (MDL). With MDL probing, training a probe to predict labels is recast as teaching it to effectively transmit the data. Therefore, the measure of interest changes from probe accuracy to the description length of labels given representations. In addition to probe quality, the description length evaluates 'the amount of effort' needed to achieve the quality. This amount of effort characterizes either (i) size of a probing model, or (ii) the amount of data needed to achieve the high quality. We consider two methods for estimating MDL which can be easily implemented on top of the standard probing pipelines: variational coding and online coding. We show that these methods agree in results and are more informative and stable than the standard probes."

6 changes: 3 additions & 3 deletions _includes/paper_lena.html
@@ -2,12 +2,12 @@
<a class="pull-right" >
<img src="/img/paper/{{ include.paper.conf }}" alt="" style="max-width:50px; height:auto;"/>
</a>
<a class="pull-left" {% if include.paper.doc-url %} href="{{ include.paper.url }}" {% endif %}>
<a class="pull-left" {% if include.paper.doc-url %} href="{{ include.paper.url }}" {% endif %}
{% if include.paper.abstract %} title="{{ include.paper.abstract }}"{% endif %}>
<img src="/img/paper/{{ include.paper.img }}" alt="" style="max-width:120px; height:auto;"/>
</a>
<div class="media-body">
<!-- citation -->
{% if include.paper.logo %}<img src="/img/papers/{{ include.paper.logo }}" alt=""/>{% endif %}

<a href="{{include.paper.url}}">{{ include.paper.title }}</a><br/>
{% if include.paper.authors %}{{ include.paper.authors }}.{% endif %}

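For reference, a minimal sketch of the markup this changed anchor could render for a single paper entry that has doc-url, url, img, and abstract set; the values below are illustrative placeholders, not taken from the repository:

<!-- illustrative output only: actual href, img, and title values come from _data/papers/*.yml -->
<a class="pull-left" href="https://arxiv.org/abs/0000.00000"
   title="One-paragraph abstract, shown by the browser as a hover tooltip over the thumbnail.">
  <img src="/img/paper/example-min.png" alt="" style="max-width:120px; height:auto;"/>
</a>

The net effect of this template change is that hovering over a paper's thumbnail now shows its abstract, provided the corresponding entry in _data/papers/*.yml defines an abstract field.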
13 changes: 12 additions & 1 deletion _layouts/photolist.html
@@ -16,13 +16,24 @@
<!--[if lt IE 9]>
<script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
<![endif]-->

<style>
.label {
color: darkblue;
padding: 3px;
font-family: Arial;
}
.video {background-color: #c7db4d;}
.slides {background-color: #f0d043;}
</style>
<!-- #f7e17e #cbe691 -->
</head>
{% include head.html %}
<body>
{% include header.html %}
<!-- JavaScript plugins (requires jQuery) -->
<script src="http://code.jquery.com/jquery.js"></script>
<!-- Include all compiled plugins (below), or include individual files as needed --!>
<!-- Include all compiled plugins (below), or include individual files as needed -->
<script src="js/bootstrap.min.js"></script>

<div class="page-content">
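The new .label, .video, and .slides rules presumably back small colored tags for talk materials; a hypothetical usage consistent with the class names (not part of this commit) might be:

<!-- hypothetical markup: class names match the new CSS above, hrefs are placeholders -->
<a class="label video" href="https://example.com/talk-recording">video</a>
<a class="label slides" href="https://example.com/slides.pdf">slides</a>

Here .label supplies the dark-blue text, padding, and Arial font, while .video and .slides only add their background colors.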
