Add merge and standalone noise modifiers

hplt-project · Nov 20, 2023 · 85ebe30 · 85ebe30
1 parent 452c4ea
commit 85ebe30
Show file tree

Hide file tree

Showing 10 changed files with 288 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -164,7 +164,28 @@ modifiers:
   random_space:  0.1 # Adds a random space in the string.
   repeated_char: 0.1 # Repeats a random word character.
   unichar:       0.1 # Replaces a random consecutive repeated letter with a single letter. 
-````
+```
+
+#### Merge
+Adds a modifier that merges up to `n` lines lines together. The idea is that sometimes we want to see longer sequences so that we are more robust. 
+
+```yaml
+modifiers:
+- Merge: 0.01
+  min_lines: 2 # Minimum lines to merge together
+  max_lines: 4 # Maximum lines to merge together
+```
+
+#### Noise
+Adds a noise modifier that inserts sentence pair containing identical random unicode noise on the source and target side. This is useful to teach the model to copy things it doesn't understand (IE notranslate).
+
+```yaml
+modifiers:
+- Noise: 0.01
+  min_word_length: 2 # Minimum word length for each word in the noisy sentence
+  max_word_length: 5 # Maximum word length for each word in the noisy sentence
+  max_words: 6 # Maximum number of words in each noisy sentence
+```
 
 #### Tags
 Adds a placeholder tag to the source sentence that can be used by the model to hint how it should translate that word. The word to hint is chosen at random from the target sentence. Only words with a 1-to-1 mapping between source and target are considered.
@@ -187,6 +208,28 @@ The `spm_vocab` option can be used to recompute the alignment info to match the
 
 The format for telling the translation model the intention to translate a word in a certain way can be controlled by `template`. Here `{src}` and `{trg}` are replaced by the selected words from the source and target side of the sentence pair.
 
+##### Replace
+Sometimes we want to just replace the source token with the target token directly, so during terminology inference the model doesn't try to think too hard what to do, but always places the hinted token on the target side. See `contrib/test_enzh_noise_config.yml` for example usage.
+
+```yml
+modifiers:
+  - Tags: 0.1
+    custom_detok_src: null # Null value for the src detokenizer
+    custom_detok_trg: zh
+    replace: 0.4 # 0.4 out of the time tags is triggered, instead replace the target token with random noise, and use that random noise to tag a corresponding source word.
+```
+
+##### Inline Noise
+If alignment information is present, we can augment the training data with inline unicode noise that appears at the appropriate location on both the source and the target. This is useful to teach the model to copy things it doesn't understand (IE notranslate). See `contrib/test_enzh_noise_config.yml` for example usage.
+
+```yml
+modifiers:
+  - Tags: 0.1
+    custom_detok_src: null # Null value for the src detokenizer
+    custom_detok_trg: zh
+    augment: 0.4 # 0.4 out of the time tags is triggered, instead augment the source and the target with random noise. If you want 100% only noise without tag functionality use augment: 1
+```
+
 **Note**: Due to how most modifiers are implemented, they will have a normalising effect on spaces. Sequences of spaces will be collapsed into a single space. This is also true for the *Tags* modifier.
 
 **Note**: Even if the probability of the *Tags* modifier is set to 0, it will apply detokenisation and optionally re-computation of the alignment on every sentence pair, regardless whether it was picked out to be modified or not.

diff --git a/src/opustrainer/alignments.py b/src/opustrainer/alignments.py
@@ -1,4 +1,4 @@
-from typing import Optional, List
+from typing import Optional, List, Union
 from opustrainer.types import Pair, TokenList
 
 

diff --git a/src/opustrainer/modifiers/merge.py b/src/opustrainer/modifiers/merge.py
@@ -0,0 +1,69 @@
+# This file contains merge modifier and noise modifier
+import random
+from opustrainer.modifiers import Modifier
+
+import random
+from typing import List, Sequence, Union
+from opustrainer.modifiers import Modifier
+from opustrainer.alignments import format_alignments, parse_alignments, Pair
+
+def merge_sents(inputs: List[str]) -> str:
+    """Merges n sentences together, fixing up their alignments"""
+    srcs: List[List[str]] = [x.split('\t')[0].split() for x in inputs]
+    trgs: List[List[str]] = [x.split('\t')[1].split() for x in inputs]
+    align_txt: Union[str, None] = None
+    if len(inputs[0].split('\t')) > 2:
+        aligns: List[List[Pair]] = [parse_alignments(x.split('\t')[2].strip()) for x in inputs]
+
+        add_src = len(srcs[0])
+        add_trg = len(trgs[0])
+        for i in range(1, len(srcs)):
+            for j in range(len(aligns[i])):
+                aligns[i][j] = Pair(aligns[i][j][0] + add_src, aligns[i][j][1] + add_trg)
+            add_src = add_src + len(srcs[i])
+            add_trg = add_trg + len(trgs[i])
+
+        align_txt = format_alignments([item for sublist in aligns for item in sublist])
+
+    srcs_txt: str = " ".join([x.split('\t')[0] for x in inputs])
+    trgs_txt: str = " ".join([x.split('\t')[1] for x in inputs])
+
+    if align_txt is not None:
+        return srcs_txt + '\t' + trgs_txt + '\t' + align_txt
+    else:
+        return srcs_txt + '\t' + trgs_txt
+
+class MergeModifier(Modifier):
+    """Randomly merges up to n lines into one
+    
+        Usage:
+       ```yaml
+       modifiers:
+       - Merge: 0.01
+         min_lines: 2
+         max_lines: 4
+        ```
+    """
+    min_lines_merge: int
+    max_lines_merge: int
+    def __init__(self, probability: float=0.0, min_lines_merge: int=2, max_lines_merge: int=4):
+        super().__init__(probability)
+        self.min_lines_merge = min_lines_merge
+        self.max_lines_merge = max_lines_merge
+
+    def __call__(self, batch:List[str]) -> Sequence[str]:
+        newbatch: List[str] = []
+        # Identify merging candidates and their lengths
+        prev_end = -1
+        for i in range(len(batch)):
+            if i < prev_end:
+                continue
+            elif self.probability > random.random():
+                merge_end = i + random.randint(self.min_lines_merge, self.max_lines_merge)
+                prev_end = merge_end
+                merge_batch: str = merge_sents(batch[i:merge_end])
+                newbatch.append(merge_batch)
+            else:
+                newbatch.append(batch[i])
+
+        return newbatch
diff --git a/src/opustrainer/modifiers/noise.py b/src/opustrainer/modifiers/noise.py
@@ -0,0 +1,54 @@
+# This file contains merge modifier and noise modifier
+import random
+from opustrainer.modifiers import Modifier
+from opustrainer.modifiers.placeholders import get_random_unicode_words
+
+import random
+from typing import List, Sequence
+from opustrainer.modifiers import Modifier
+
+class NoiseModifier(Modifier):
+    """Adds noise during training. Nonsensitcal string on the source and on the target
+
+       Usage:
+       ```yaml
+       modifiers:
+       - Noise: 0.01
+         min_word_length: 2
+         max_word_length: 5
+         max_words: 6
+        ```
+    """
+    min_word_length: int
+    max_word_length: int
+    max_words: int
+
+    def __init__(self, probability: float=0.0, min_word_legnth: int=2,
+        max_word_length: int=5, max_words: int=6):
+        super().__init__(probability)
+        self.min_word_length = min_word_legnth
+        self.max_word_length = max_word_length
+        self.max_words = max_words
+
+    def __call__(self, batch:List[str]) -> Sequence[str]:
+        """Generates a random noise line"""
+        # The only problem is that we don't know if the dataset is supposed to have an alignment field
+        # or not... A tradeoff is to look at the previous line and see if it has alignment info and then follow that
+        # it's not ideal as we might hit a defective line, but oh well...
+        ret_batch: List[str] = []
+        for line in batch:
+            if self.probability > random.random():
+                newline: str =  " ".join(get_random_unicode_words(self.min_word_length, self.max_word_length, self.max_words))
+                # Check if we have a 3rd field, which we assume is alignment
+                if line.count('\t') == 2:
+                    # Generate alignments, just in case
+                    alignments: str = ""
+                    myrange = range(newline.count(' ') + 1)
+                    for j in myrange:
+                        alignments = alignments + str(j) + '-' + str(j) + " "
+                    alignments = alignments[:-1] # remove final space
+                    ret_batch.append(newline +'\t' + newline + '\t' + alignments)
+                else:
+                    ret_batch.append(newline +'\t' + newline)
+            ret_batch.append(line)
+        return ret_batch
diff --git a/src/opustrainer/modifiers/placeholders.py b/src/opustrainer/modifiers/placeholders.py
@@ -14,7 +14,7 @@
 
 def random_weighted_choice(options:Iterable[Tuple[T,float]]) -> T:
     choice = random.random()
-    cumsum = 0
+    cumsum = 0.0
     for option, prob in options:
         cumsum += prob
         if choice < cumsum:

diff --git a/src/opustrainer/modifiers/surface.py b/src/opustrainer/modifiers/surface.py
@@ -1,5 +1,5 @@
 import random
-from typing import Callable, Type, List, Iterable
+from typing import List, Iterable
 
 from opustrainer.modifiers import Modifier
 

diff --git a/src/opustrainer/trainer.py b/src/opustrainer/trainer.py
@@ -12,14 +12,16 @@
 import time
 
 from dataclasses import dataclass
-from typing import List, Tuple, Dict, Any, Optional, Union, Type, TextIO, cast, Iterable, Iterable, Callable, TypeVar, get_type_hints, get_args, get_origin
+from typing import List, Tuple, Dict, Any, Optional, Union, Type, TextIO, cast, Iterable, Iterable, TypeVar, get_type_hints, get_args, get_origin
 from tempfile import TemporaryFile
 from itertools import islice
 from pathlib import Path
 
 import yaml
 
 from opustrainer.modifiers import Modifier
+from opustrainer.modifiers.merge import MergeModifier
+from opustrainer.modifiers.noise import NoiseModifier
 from opustrainer.modifiers.prefix import PrefixModifier
 from opustrainer.modifiers.surface import UpperCaseModifier, TitleCaseModifier
 from opustrainer.modifiers.placeholders import PlaceholderTagModifier
@@ -39,6 +41,8 @@ def ignore_sigint():
 # Available batch modifiers
 # TODO: Import these lazy, on demand?
 MODIFIERS = {
+    'Merge': MergeModifier,
+    'Noise': NoiseModifier,
     'UpperCase': UpperCaseModifier,
     'TitleCase': TitleCaseModifier,
     'Tags': PlaceholderTagModifier,

diff --git a/tests/test_merge.py b/tests/test_merge.py
@@ -0,0 +1,79 @@
+from doctest import Example
+import random
+import unittest
+
+from opustrainer.modifiers.merge import MergeModifier, merge_sents
+
+class TestMerge(unittest.TestCase):
+  def setUp(self):
+    random.seed(1)
+
+    # Set up examples
+    self.example = [
+      '429 运输 中队 ( 429 野牛) , 使用 CC - 177	429 Transport Squadron (429 Bison Squadron) - Flying the CC-177	0-0 1-1 2-2 3-3 4-3 5-4 5-5 7-5 8-5 9-6 8-7 9-8 10-9',
+      "微生物 检验 与 食品 安全 控制 .	Food Poisoning and Food Hygiene.	3-0 0-1 1-1 2-1 2-2 3-3 4-3 5-4 6-4"
+    ]*10
+
+    self.example_noalign = ["\t".join(a.split('\t')[:-1]) for a in self.example]
+
+    # counts
+    self.psn_cnt = " ".join(self.example).count('Poisoning') # 10
+    self.num_cnt = " ".join(self.example).count('429') # 40 because it appears once in src and trg
+
+  def test_merge(self):
+    merged = merge_sents(self.example_noalign[0:3])
+    expected = '429 运输 中队 ( 429 野牛) , 使用 CC - 177 微生物 检验 与 食品 安全 控制 . 429 运输 中队 ( 429 野牛) , 使用 CC - 177\t429 Transport Squadron (429 Bison Squadron) - Flying the CC-177 Food Poisoning and Food Hygiene. 429 Transport Squadron (429 Bison Squadron) - Flying the CC-177'
+    self.assertEqual(merged, expected)
+
+    # Expected based on counts 
+    lensrc = sum([len(a.split('\t')[0].split()) for a in self.example_noalign[0:3]])
+    lentrg = sum([len(a.split('\t')[1].split()) for a in self.example_noalign[0:3]])
+
+    lenmrgsrc = len(merged.split('\t')[0].split())
+    lenmrgtrg = len(merged.split('\t')[1].split())
+    self.assertEqual(lensrc, lenmrgsrc)
+    self.assertEqual(lentrg, lenmrgtrg)
+
+  def test_merge_align(self):
+    merged = merge_sents(self.example[0:3])
+    expected = '429 运输 中队 ( 429 野牛) , 使用 CC - 177 微生物 检验 与 食品 安全 控制 . 429 运输 中队 ( 429 野牛) , 使用 CC - 177\t429 Transport Squadron (429 Bison Squadron) - Flying the CC-177 Food Poisoning and Food Hygiene. 429 Transport Squadron (429 Bison Squadron) - Flying the CC-177\t0-0 1-1 2-2 3-3 4-3 5-4 5-5 7-5 8-5 9-6 8-7 9-8 10-9 14-10 11-11 12-11 13-11 13-12 14-13 15-13 16-14 17-14 18-15 19-16 20-17 21-18 22-18 23-19 23-20 25-20 26-20 27-21 26-22 27-23 28-24'
+    self.assertEqual(merged, expected)
+
+    # Expected based on counts 
+    lensrc = sum([len(a.split('\t')[0].split()) for a in self.example[0:3]])
+    lentrg = sum([len(a.split('\t')[1].split()) for a in self.example[0:3]])
+
+    lenmrgsrc = len(merged.split('\t')[0].split())
+    lenmrgtrg = len(merged.split('\t')[1].split())
+    self.assertEqual(lensrc, lenmrgsrc)
+    self.assertEqual(lentrg, lenmrgtrg)
+
+    # Test alignment based on final letter
+    len_srcalign_final = len(merged.split('\t')[0].split())
+    len_trgalign_final = len(merged.split('\t')[1].split())
+    self.assertEqual(len_srcalign_final, 29)
+    self.assertEqual(len_trgalign_final, 25)
+
+  def test_merge_full(self):
+    merger = MergeModifier(0.8)
+    merged = merger(self.example_noalign)
+
+    psn_cnt = " ".join(merged).count('Poisoning')
+    num_cnt = " ".join(merged).count('429')
+
+    self.assertNotEqual(len(merged), len(self.example_noalign)) # Assert it being activated
+    self.assertEqual(self.psn_cnt, psn_cnt)
+    self.assertEqual(self.num_cnt, num_cnt)
+
+  def test_merge_full_align(self):
+    merger = MergeModifier(0.8)
+    merged = merger(self.example)
+
+    psn_cnt = " ".join(merged).count('Poisoning')
+    num_cnt = " ".join(merged).count('429')
+
+    self.assertNotEqual(len(merged), len(self.example)) # Assert it being activated
+    self.assertEqual(self.psn_cnt, psn_cnt)
+    self.assertEqual(self.num_cnt, num_cnt)
+
+
diff --git a/tests/test_noise.py b/tests/test_noise.py
@@ -0,0 +1,34 @@
+from doctest import Example
+import enum
+import random
+import unittest
+
+from opustrainer.modifiers.noise import NoiseModifier
+
+class TestMerge(unittest.TestCase):
+  def setUp(self):
+    random.seed(1)
+
+    # Set up examples
+    self.example = [
+      '429 运输 中队 ( 429 野牛) , 使用 CC - 177	429 Transport Squadron (429 Bison Squadron) - Flying the CC-177	0-0 1-1 2-2 3-3 4-3 5-4 5-5 7-5 8-5 9-6 8-7 9-8 10-9',
+      "微生物 检验 与 食品 安全 控制 .	Food Poisoning and Food Hygiene.	3-0 0-1 1-1 2-1 2-2 3-3 4-3 5-4 6-4"
+    ]*10
+
+    self.example_noalign = ["\t".join(a.split('\t')[:-1]) for a in self.example]
+
+    # With 20% prob this is triggered 3 times we check one of the matches. We expect new length to be 23
+    self.num_nine_noise = "쑥맜\t쑥맜"
+    self.num_nine_noise_align = "쑥맜\t쑥맜\t0-0"
+
+  def test_noise(self):
+    noiser = NoiseModifier(0.2)
+    noised = noiser(self.example_noalign)
+    self.assertEqual(noised[9], self.num_nine_noise)
+    self.assertEqual(len(noised), 23)
+
+  def test_noise_align(self):
+    noiser = NoiseModifier(0.2)
+    noised = noiser(self.example)
+    self.assertEqual(noised[9], self.num_nine_noise_align)
+    self.assertEqual(len(noised), 23)
diff --git a/tests/test_placeholders.py b/tests/test_placeholders.py
@@ -1,12 +1,10 @@
 import random
 import unittest
-import tempfile
 
 from textwrap import dedent
 
 from opustrainer.modifiers.placeholders import PlaceholderTagModifier
 from opustrainer.trainer import CurriculumLoader
-from opustrainer import logger
 
 
 def first(it):