Skip to content

Commit

Permalink
Add merge and standalone noise modifiers
Browse files Browse the repository at this point in the history
  • Loading branch information
XapaJIaMnu committed Nov 20, 2023
1 parent 452c4ea commit 85ebe30
Show file tree
Hide file tree
Showing 10 changed files with 288 additions and 7 deletions.
45 changes: 44 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,28 @@ modifiers:
random_space: 0.1 # Adds a random space in the string.
repeated_char: 0.1 # Repeats a random word character.
unichar: 0.1 # Replaces a random consecutive repeated letter with a single letter.
````
```

#### Merge
Adds a modifier that merges up to `n` lines together. The idea is that sometimes we want the model to see longer sequences during training so that it becomes more robust.

```yaml
modifiers:
- Merge: 0.01
min_lines_merge: 2 # Minimum lines to merge together
max_lines_merge: 4 # Maximum lines to merge together
```

#### Noise
Adds a noise modifier that inserts a sentence pair containing identical random unicode noise on the source and target side. This is useful to teach the model to copy things it doesn't understand (i.e. notranslate).

```yaml
modifiers:
- Noise: 0.01
min_word_length: 2 # Minimum word length for each word in the noisy sentence
max_word_length: 5 # Maximum word length for each word in the noisy sentence
max_words: 6 # Maximum number of words in each noisy sentence
```

#### Tags
Adds a placeholder tag to the source sentence that can be used by the model to hint how it should translate that word. The word to hint is chosen at random from the target sentence. Only words with a 1-to-1 mapping between source and target are considered.
Expand All @@ -187,6 +208,28 @@ The `spm_vocab` option can be used to recompute the alignment info to match the

The format for telling the translation model the intention to translate a word in a certain way can be controlled by `template`. Here `{src}` and `{trg}` are replaced by the selected words from the source and target side of the sentence pair.

##### Replace
Sometimes we want to just replace the source token with the target token directly, so that during terminology inference the model doesn't try to think too hard about what to do, but always places the hinted token on the target side. See `contrib/test_enzh_noise_config.yml` for example usage.

```yml
modifiers:
- Tags: 0.1
custom_detok_src: null # Null value for the src detokenizer
custom_detok_trg: zh
replace: 0.4 # In 40% of the cases where Tags is triggered, replace the target token with random noise instead, and use that random noise to tag the corresponding source word.
```

##### Inline Noise
If alignment information is present, we can augment the training data with inline unicode noise that appears at the appropriate location on both the source and the target. This is useful to teach the model to copy things it doesn't understand (i.e. notranslate). See `contrib/test_enzh_noise_config.yml` for example usage.

```yml
modifiers:
- Tags: 0.1
custom_detok_src: null # Null value for the src detokenizer
custom_detok_trg: zh
augment: 0.4 # In 40% of the cases where Tags is triggered, augment the source and the target with random noise instead. If you want 100% only noise without tag functionality use augment: 1
```

**Note**: Due to how most modifiers are implemented, they will have a normalising effect on spaces. Sequences of spaces will be collapsed into a single space. This is also true for the *Tags* modifier.

**Note**: Even if the probability of the *Tags* modifier is set to 0, it will apply detokenisation and optionally re-computation of the alignment on every sentence pair, regardless of whether it was picked out to be modified or not.
Expand Down
2 changes: 1 addition & 1 deletion src/opustrainer/alignments.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, List
from typing import Optional, List, Union
from opustrainer.types import Pair, TokenList


Expand Down
69 changes: 69 additions & 0 deletions src/opustrainer/modifiers/merge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# This file contains the merge modifier
import random
from opustrainer.modifiers import Modifier

import random
from typing import List, Sequence, Union
from opustrainer.modifiers import Modifier
from opustrainer.alignments import format_alignments, parse_alignments, Pair

def merge_sents(inputs: List[str]) -> str:
    """Concatenate several tab-separated sentence pairs into one line.

    Every entry of ``inputs`` is ``source<TAB>target`` with an optional third
    alignment column. Sources and targets are each joined with a single
    space. Only the first entry is inspected to decide whether alignments are
    present: if it has a third column, the alignments of every entry are
    parsed, shifted by the number of whitespace-separated source/target
    tokens that precede that entry, and emitted as the third column of the
    result.
    """
    columns = [line.split('\t') for line in inputs]
    merged_src = " ".join(fields[0] for fields in columns)
    merged_trg = " ".join(fields[1] for fields in columns)

    # No alignment column on the first line: the result has two columns only.
    if len(columns[0]) <= 2:
        return merged_src + '\t' + merged_trg

    shifted = []
    src_offset = 0
    trg_offset = 0
    for fields in columns:
        for pair in parse_alignments(fields[2].strip()):
            shifted.append(Pair(pair[0] + src_offset, pair[1] + trg_offset))
        # Token indices of the next sentence start after this one's tokens.
        src_offset += len(fields[0].split())
        trg_offset += len(fields[1].split())

    return merged_src + '\t' + merged_trg + '\t' + format_alignments(shifted)

class MergeModifier(Modifier):
    """Randomly merges runs of consecutive lines into a single line.

    Every line that is not already part of a merge starts a new merge with
    probability ``probability``. The merge covers that line and the ones
    following it, for a total of ``random.randint(min_lines_merge,
    max_lines_merge)`` lines (fewer if the batch ends first). The lines are
    combined with ``merge_sents``.

    Usage:
    ```yaml
    modifiers:
    - Merge: 0.01
      min_lines_merge: 2
      max_lines_merge: 4
    ```

    The shorter keys ``min_lines`` / ``max_lines`` are accepted as aliases
    for ``min_lines_merge`` / ``max_lines_merge``; when an alias is given it
    takes precedence over the long form.
    """
    min_lines_merge: int
    max_lines_merge: int

    def __init__(self, probability: float=0.0, min_lines_merge: int=2, max_lines_merge: int=4,
                 *, min_lines: Union[int, None]=None, max_lines: Union[int, None]=None):
        super().__init__(probability)
        # `min_lines` / `max_lines` are the option names used in the usage
        # examples; accept them so those examples work as written.
        self.min_lines_merge = min_lines_merge if min_lines is None else min_lines
        self.max_lines_merge = max_lines_merge if max_lines is None else max_lines

    def __call__(self, batch: List[str]) -> Sequence[str]:
        """Return a new batch in which some runs of lines have been merged."""
        newbatch: List[str] = []
        i = 0
        while i < len(batch):
            if self.probability > random.random():
                n_lines = random.randint(self.min_lines_merge, self.max_lines_merge)
                # Slicing past the end of the batch simply yields fewer lines.
                newbatch.append(merge_sents(batch[i:i + n_lines]))
                # Always move forward by at least one line, even for a
                # degenerate (non-positive) draw.
                i = max(i + n_lines, i + 1)
            else:
                newbatch.append(batch[i])
                i += 1

        return newbatch
54 changes: 54 additions & 0 deletions src/opustrainer/modifiers/noise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# This file contains the noise modifier
import random
from opustrainer.modifiers import Modifier
from opustrainer.modifiers.placeholders import get_random_unicode_words

import random
from typing import List, Sequence
from opustrainer.modifiers import Modifier

class NoiseModifier(Modifier):
    """Adds noise during training: a nonsensical string that is identical on
    the source and on the target side.

    For every line of the batch, with probability ``probability`` an extra
    noise line is inserted in front of it. The original line is always kept.

    Usage:
    ```yaml
    modifiers:
    - Noise: 0.01
      min_word_length: 2
      max_word_length: 5
      max_words: 6
    ```

    The misspelled keyword ``min_word_legnth`` is still accepted for
    backward compatibility; when given it takes precedence over
    ``min_word_length``.
    """
    min_word_length: int
    max_word_length: int
    max_words: int

    def __init__(self, probability: float=0.0, min_word_length: int=2,
                 max_word_length: int=5, max_words: int=6, *, min_word_legnth=None):
        super().__init__(probability)
        # Earlier versions spelled the parameter `min_word_legnth`; keep
        # honouring that spelling so existing keyword callers do not break.
        self.min_word_length = min_word_length if min_word_legnth is None else min_word_legnth
        self.max_word_length = max_word_length
        self.max_words = max_words

    def __call__(self, batch: List[str]) -> Sequence[str]:
        """Return the batch with random noise lines inserted."""
        # We don't know whether the dataset is supposed to have an alignment
        # field or not. As a tradeoff, look at the line the noise is inserted
        # in front of and follow its format. It's not ideal, as that line
        # might be defective.
        ret_batch: List[str] = []
        for line in batch:
            if self.probability > random.random():
                noise: str = " ".join(get_random_unicode_words(self.min_word_length, self.max_word_length, self.max_words))
                fields: List[str] = [noise, noise]
                # Two tabs means three fields; the third is assumed to be alignment info.
                if line.count('\t') == 2:
                    # Source and target are identical, so word j aligns to word j.
                    n_words = noise.count(' ') + 1
                    fields.append(" ".join(f"{j}-{j}" for j in range(n_words)))
                ret_batch.append('\t'.join(fields))
            ret_batch.append(line)
        return ret_batch
2 changes: 1 addition & 1 deletion src/opustrainer/modifiers/placeholders.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

def random_weighted_choice(options:Iterable[Tuple[T,float]]) -> T:
choice = random.random()
cumsum = 0
cumsum = 0.0
for option, prob in options:
cumsum += prob
if choice < cumsum:
Expand Down
2 changes: 1 addition & 1 deletion src/opustrainer/modifiers/surface.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import random
from typing import Callable, Type, List, Iterable
from typing import List, Iterable

from opustrainer.modifiers import Modifier

Expand Down
6 changes: 5 additions & 1 deletion src/opustrainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,16 @@
import time

from dataclasses import dataclass
from typing import List, Tuple, Dict, Any, Optional, Union, Type, TextIO, cast, Iterable, Iterable, Callable, TypeVar, get_type_hints, get_args, get_origin
from typing import List, Tuple, Dict, Any, Optional, Union, Type, TextIO, cast, Iterable, Iterable, TypeVar, get_type_hints, get_args, get_origin
from tempfile import TemporaryFile
from itertools import islice
from pathlib import Path

import yaml

from opustrainer.modifiers import Modifier
from opustrainer.modifiers.merge import MergeModifier
from opustrainer.modifiers.noise import NoiseModifier
from opustrainer.modifiers.prefix import PrefixModifier
from opustrainer.modifiers.surface import UpperCaseModifier, TitleCaseModifier
from opustrainer.modifiers.placeholders import PlaceholderTagModifier
Expand All @@ -39,6 +41,8 @@ def ignore_sigint():
# Available batch modifiers
# TODO: Import these lazy, on demand?
MODIFIERS = {
'Merge': MergeModifier,
'Noise': NoiseModifier,
'UpperCase': UpperCaseModifier,
'TitleCase': TitleCaseModifier,
'Tags': PlaceholderTagModifier,
Expand Down
79 changes: 79 additions & 0 deletions tests/test_merge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from doctest import Example
import random
import unittest

from opustrainer.modifiers.merge import MergeModifier, merge_sents

class TestMerge(unittest.TestCase):
    """Tests for `merge_sents` and `MergeModifier`."""
    def setUp(self):
        # Fixed seed so the random decisions made by MergeModifier are repeatable.
        random.seed(1)

        # Two sentence pairs (source, target, alignments), repeated 10 times.
        # NOTE(review): the code under test splits fields on '\t', so the
        # columns inside these literals are expected to be tab-separated;
        # confirm the literals contain real tab characters.
        self.example = [
            '429 运输 中队 ( 429 野牛) , 使用 CC - 177 429 Transport Squadron (429 Bison Squadron) - Flying the CC-177 0-0 1-1 2-2 3-3 4-3 5-4 5-5 7-5 8-5 9-6 8-7 9-8 10-9',
            "微生物 检验 与 食品 安全 控制 . Food Poisoning and Food Hygiene. 3-0 0-1 1-1 2-1 2-2 3-3 4-3 5-4 6-4"
        ]*10

        # Same examples with the last tab-separated field (the alignments) dropped.
        self.example_noalign = ["\t".join(a.split('\t')[:-1]) for a in self.example]

        # Reference substring counts; merging must neither lose nor duplicate text.
        self.psn_cnt = " ".join(self.example).count('Poisoning') # 10: once per copy of the second example
        self.num_cnt = " ".join(self.example).count('429') # 40: twice in src and twice in trg of each copy of the first example

    def test_merge(self):
        # Merge three lines that carry no alignment column.
        merged = merge_sents(self.example_noalign[0:3])
        expected = '429 运输 中队 ( 429 野牛) , 使用 CC - 177 微生物 检验 与 食品 安全 控制 . 429 运输 中队 ( 429 野牛) , 使用 CC - 177\t429 Transport Squadron (429 Bison Squadron) - Flying the CC-177 Food Poisoning and Food Hygiene. 429 Transport Squadron (429 Bison Squadron) - Flying the CC-177'
        self.assertEqual(merged, expected)

        # The merged line must have as many tokens as the three inputs combined.
        lensrc = sum([len(a.split('\t')[0].split()) for a in self.example_noalign[0:3]])
        lentrg = sum([len(a.split('\t')[1].split()) for a in self.example_noalign[0:3]])

        lenmrgsrc = len(merged.split('\t')[0].split())
        lenmrgtrg = len(merged.split('\t')[1].split())
        self.assertEqual(lensrc, lenmrgsrc)
        self.assertEqual(lentrg, lenmrgtrg)

    def test_merge_align(self):
        # Merge three lines with alignments; the indices of the 2nd and 3rd
        # sentence must be shifted by the token counts of the preceding ones.
        merged = merge_sents(self.example[0:3])
        expected = '429 运输 中队 ( 429 野牛) , 使用 CC - 177 微生物 检验 与 食品 安全 控制 . 429 运输 中队 ( 429 野牛) , 使用 CC - 177\t429 Transport Squadron (429 Bison Squadron) - Flying the CC-177 Food Poisoning and Food Hygiene. 429 Transport Squadron (429 Bison Squadron) - Flying the CC-177\t0-0 1-1 2-2 3-3 4-3 5-4 5-5 7-5 8-5 9-6 8-7 9-8 10-9 14-10 11-11 12-11 13-11 13-12 14-13 15-13 16-14 17-14 18-15 19-16 20-17 21-18 22-18 23-19 23-20 25-20 26-20 27-21 26-22 27-23 28-24'
        self.assertEqual(merged, expected)

        # The merged line must have as many tokens as the three inputs combined.
        lensrc = sum([len(a.split('\t')[0].split()) for a in self.example[0:3]])
        lentrg = sum([len(a.split('\t')[1].split()) for a in self.example[0:3]])

        lenmrgsrc = len(merged.split('\t')[0].split())
        lenmrgtrg = len(merged.split('\t')[1].split())
        self.assertEqual(lensrc, lenmrgsrc)
        self.assertEqual(lentrg, lenmrgtrg)

        # Absolute token counts of the merged source (11 + 7 + 11) and target
        # (10 + 5 + 10); the last alignment pair above, 28-24, is the final
        # token index on each side.
        len_srcalign_final = len(merged.split('\t')[0].split())
        len_trgalign_final = len(merged.split('\t')[1].split())
        self.assertEqual(len_srcalign_final, 29)
        self.assertEqual(len_trgalign_final, 25)

    def test_merge_full(self):
        # Run the modifier itself over the alignment-free batch.
        merger = MergeModifier(0.8)
        merged = merger(self.example_noalign)

        psn_cnt = " ".join(merged).count('Poisoning')
        num_cnt = " ".join(merged).count('429')

        self.assertNotEqual(len(merged), len(self.example_noalign)) # A changed line count shows at least one merge happened
        # No content may be lost or duplicated by merging.
        self.assertEqual(self.psn_cnt, psn_cnt)
        self.assertEqual(self.num_cnt, num_cnt)

    def test_merge_full_align(self):
        # Same as test_merge_full, but on the batch that includes alignments.
        merger = MergeModifier(0.8)
        merged = merger(self.example)

        psn_cnt = " ".join(merged).count('Poisoning')
        num_cnt = " ".join(merged).count('429')

        self.assertNotEqual(len(merged), len(self.example)) # A changed line count shows at least one merge happened
        # No content may be lost or duplicated by merging.
        self.assertEqual(self.psn_cnt, psn_cnt)
        self.assertEqual(self.num_cnt, num_cnt)


34 changes: 34 additions & 0 deletions tests/test_noise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from doctest import Example
import enum
import random
import unittest

from opustrainer.modifiers.noise import NoiseModifier

# NOTE(review): this class exercises NoiseModifier; the name `TestMerge` looks
# copy-pasted from the merge tests. Consider renaming it to `TestNoise`.
class TestMerge(unittest.TestCase):
    """Tests for `NoiseModifier`."""
    def setUp(self):
        # Fixed seed so the random decisions made by NoiseModifier are repeatable.
        random.seed(1)

        # Two sentence pairs (source, target, alignments), repeated 10 times.
        # NOTE(review): NoiseModifier counts '\t' characters to detect the
        # alignment column, so the columns inside these literals are expected
        # to be tab-separated; confirm the literals contain real tab characters.
        self.example = [
            '429 运输 中队 ( 429 野牛) , 使用 CC - 177 429 Transport Squadron (429 Bison Squadron) - Flying the CC-177 0-0 1-1 2-2 3-3 4-3 5-4 5-5 7-5 8-5 9-6 8-7 9-8 10-9',
            "微生物 检验 与 食品 安全 控制 . Food Poisoning and Food Hygiene. 3-0 0-1 1-1 2-1 2-2 3-3 4-3 5-4 6-4"
        ]*10

        # Same examples with the last tab-separated field (the alignments) dropped.
        self.example_noalign = ["\t".join(a.split('\t')[:-1]) for a in self.example]

        # With probability 0.2 and seed 1 the tests expect three inserted noise
        # lines (20 input lines -> 23 output lines). One of them is expected at
        # output index 9; its content is pinned below.
        self.num_nine_noise = "쑥맜\t쑥맜"
        self.num_nine_noise_align = "쑥맜\t쑥맜\t0-0"

    def test_noise(self):
        # Input without alignments: the noise line must have two columns.
        noiser = NoiseModifier(0.2)
        noised = noiser(self.example_noalign)
        self.assertEqual(noised[9], self.num_nine_noise)
        self.assertEqual(len(noised), 23)

    def test_noise_align(self):
        # Input with alignments: the noise line must also carry an alignment column.
        noiser = NoiseModifier(0.2)
        noised = noiser(self.example)
        self.assertEqual(noised[9], self.num_nine_noise_align)
        self.assertEqual(len(noised), 23)
2 changes: 0 additions & 2 deletions tests/test_placeholders.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import random
import unittest
import tempfile

from textwrap import dedent

from opustrainer.modifiers.placeholders import PlaceholderTagModifier
from opustrainer.trainer import CurriculumLoader
from opustrainer import logger


def first(it):
Expand Down

0 comments on commit 85ebe30

Please sign in to comment.