Skip to content

Commit

Permalink
Make custom_detok settings backward compatible
Browse files (browse the repository at this point in the history)
  • Loading branch information
eu9ene committed Dec 18, 2024
1 parent ea357e8 commit 633da72
Show file tree
Hide file tree
Showing 11 changed files with 33 additions and 26 deletions.
4 changes: 2 additions & 2 deletions contrib/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def benchmark(name, config, args):
"modifiers": [
{
"Tags": 0.1,
"custom_detok_trg": "moses:zh"
"custom_detok_trg": "zh"
}
]
}
Expand All @@ -85,7 +85,7 @@ def benchmark(name, config, args):
"modifiers": [
{
"Tags": 0.1,
"custom_detok_trg": "moses:zh",
"custom_detok_trg": "zh",
"spm_vocab": os.path.join(root, "test-data/vocab.zhen.spm")
}
]
Expand Down
4 changes: 2 additions & 2 deletions contrib/test_enzh_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ modifiers:
- TitleCase: 0.2
- Typos: 0.2
- Tags: 0.1
custom_detok_src: "moses:null" # Null value for the src detokenizer
custom_detok_trg: "moses:zh"
custom_detok_src: null # Null value for the src detokenizer
custom_detok_trg: zh
template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
# We STRONGLY DISCOURAGE the modification of this line and in fact it shouldn't be included in the config
# unless you really know what you are doing.
Expand Down
4 changes: 2 additions & 2 deletions contrib/test_enzh_noise_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ modifiers:
- UpperCase: 0.2
- TitleCase: 0.2
- Tags: 0.1
custom_detok_src: "moses:null" # Null value for the src detokenizer
custom_detok_trg: "moses:zh"
custom_detok_src: null # Null value for the src detokenizer
custom_detok_trg: zh
augment: 0.4 # 0.4 out of the time tags is triggered, instead augment the source and the target with random noise
replace: 0.4 # =====//=====, instead replace the target token with random noise, and use that random noise to tag a corresponding source word
# template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
Expand Down
8 changes: 4 additions & 4 deletions contrib/test_enzh_tags_advanced_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ start:
- until clean 1
modifiers:
- Tags: 0.5
custom_detok_src: "moses:null"
custom_detok_trg: "moses:null"
custom_detok_src: null
custom_detok_trg: null
template: "{src} __target__ {trg} __done__"
- *modifiers

Expand All @@ -26,8 +26,8 @@ end:
- until clean 1
modifiers:
- Tags: 0.5
custom_detok_src: "moses:null"
custom_detok_trg: "moses:zh"
custom_detok_src: null
custom_detok_trg: zh
template: "{src} __target__ {trg} __done__"
- *modifiers

Expand Down
8 changes: 4 additions & 4 deletions contrib/test_enzh_tags_stage_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ start:
- until clean 1
modifiers:
- Tags: 0.5
custom_detok_src: "moses:null"
custom_detok_trg: "moses:null"
custom_detok_src: null
custom_detok_trg: null
template: "{src} __target__ {trg} __done__"

end:
Expand All @@ -22,8 +22,8 @@ end:
- until clean 1
modifiers:
- Tags: 0.5
custom_detok_src: "moses:null"
custom_detok_trg: "moses:zh"
custom_detok_src: null
custom_detok_trg: zh
template: "{src} __target__ {trg} __done__"

seed: 1111
Expand Down
4 changes: 2 additions & 2 deletions contrib/test_full_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ modifiers:
repeated_char: 0.1 # Repeats a random word character.
unichar: 0.1 # Replaces a random consecutive repeated letter with a single letter.
- Tags: 0.08
custom_detok_src: "moses:null"
custom_detok_trg: "moses:zh"
custom_detok_src: null
custom_detok_trg: zh
template: "__source__ {src} __target__ {trg} __done__"

seed: 1111
Expand Down
2 changes: 1 addition & 1 deletion contrib/test_zhen_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ start:
modifiers:
# No UpperCase or TitleCase modifier when the source is Chinese as we can't upper or lowercase Chinese
- Tags: 0.1
custom_detok_src: "moses:zh"
custom_detok_src: zh

seed: 1111
trainer: cat
4 changes: 2 additions & 2 deletions contrib/train_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ modifiers:
- UpperCase: 0.05
- TitleCase: 0.05
#- Tags: 0.08 # Requires dataset augmented with alignment info
# custom_detok_src: "moses:null" # Null value for the src detokenizer
# custom_detok_trg: "moses:zh"
# custom_detok_src: null # Null value for the src detokenizer
# custom_detok_trg: zh
# template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
# We STRONGLY DISCOURAGE the modification of this line and in fact it shouldn't be included in the config
# unless you really know what you are doing.
Expand Down
7 changes: 7 additions & 0 deletions src/opustrainer/modifiers/placeholders.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,13 @@ def __init__(self, probability: float=0.0, custom_detok_src: Optional[str]=None,
super().__init__(probability)

self.template = template

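# Backward compatibility: a value without a "<tokenizer>:" prefix (e.g. "zh") is prefixed with "moses:", so the Moses detokenizer is the default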
if custom_detok_src and ':' not in custom_detok_src:
custom_detok_src = f'moses:{custom_detok_src}'
if custom_detok_trg and ':' not in custom_detok_trg:
custom_detok_trg = f'moses:{custom_detok_trg}'

self.custom_detok_src = custom_detok_src
self.custom_detok_trg = custom_detok_trg

Expand Down
4 changes: 2 additions & 2 deletions src/opustrainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,8 +504,8 @@ def _load_modifiers(self, ymldata:dict, basepath:str) -> List[Modifier]:
- TitleCase: 0.05
- Tags: 0.02
num_tags: 6
custom_detok_src: "moses:null"
custom_detok_trg: "moses:zh"
custom_detok_src: null
custom_detok_trg: zh
```
"""
modifiers = [
Expand Down
10 changes: 5 additions & 5 deletions tests/test_placeholders.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ def test_retokenize(self):
retokenize the input, and update the alignments accordingly."""
tagger = PlaceholderTagModifier(
probability=0.25,
custom_detok_src='moses:en',
custom_detok_trg='moses:zh',
custom_detok_src='en',
custom_detok_trg='zh',
spm_vocab='contrib/test-data/vocab.zhen.spm') # type: ignore Path vs String type issue

output = tagger(['\t'.join([
Expand Down Expand Up @@ -124,8 +124,8 @@ def test_retokenize_on_non_trigger(self):
retokenize the input, even if probability is 0."""
tagger = PlaceholderTagModifier(
probability=0.0,
custom_detok_src='moses:en',
custom_detok_trg='moses:zh',
custom_detok_src='en',
custom_detok_trg='zh',
spm_vocab='contrib/test-data/vocab.zhen.spm') # type: ignore Path vs String type issue

output = tagger(['\t'.join([
Expand All @@ -148,7 +148,7 @@ def test_mode(self):
multiple modes are enabled."""
tagger = PlaceholderTagModifier(
probability=1.0,
custom_detok_src='moses:zh',
custom_detok_src='zh',
augment=0.33,
replace=0.33,
# tag=0.33 is implicit
Expand Down

0 comments on commit 633da72

Please sign in to comment.