Alignment passthrough #26

Merged
merged 38 commits into main from alignment-passthrough on Aug 14, 2023
Changes from 25 commits
Commits
38 commits
4ea23bb
wip
XapaJIaMnu Apr 3, 2023
a144ae2
Initial implementation of noise augmenters
XapaJIaMnu Apr 3, 2023
be4ebe5
Simplify code a bit
jelmervdl Apr 4, 2023
7dcc2a4
Fix tests
jelmervdl Apr 4, 2023
7a1189f
Fix possible bug in test
jelmervdl Apr 4, 2023
6da73ff
Add specific tests for the three modes
jelmervdl Apr 4, 2023
55b443c
Add alignment info to the simple tests
jelmervdl Apr 4, 2023
42b850e
Make placeholder modifier produce (corrected) alignment pairs
jelmervdl Apr 4, 2023
831d564
Make sure `get_placeholding_candidates` returns the original instances
jelmervdl Apr 5, 2023
2e3d29f
update tests
jelmervdl Apr 5, 2023
38e54de
Merge branch 'main' into alignment-passthrough
jelmervdl Jul 6, 2023
723760e
Attempt to improve the alignment fix-up
jelmervdl Jul 6, 2023
17733d4
Fix unit tests
jelmervdl Jul 7, 2023
5c768e4
Implement retokenize modifier
jelmervdl Jul 24, 2023
e0adec6
Merge remote-tracking branch 'origin/main' into alignment-passthrough
jelmervdl Jul 25, 2023
294a18d
Let PlaceholderModifier use Retokenizer implementation for now
jelmervdl Jul 27, 2023
d8b1b10
Add unittest for spm retokenize in placeholders
jelmervdl Jul 27, 2023
704bd65
Add test to confirm that even when no placeholder is added, retokeniz…
jelmervdl Jul 27, 2023
38a3cae
Efficiency: don't bother calculating candidates if prob = 0.
jelmervdl Jul 27, 2023
0c4868f
Add tests covering spaces tokenizer
jelmervdl Jul 27, 2023
aab72a4
Document the `spm_vocab` option of the `Tags` modifier
jelmervdl Jul 27, 2023
973906a
Be nicer about issues with the alignment info
jelmervdl Jul 28, 2023
6b4abe0
Explain the `StopIteration` bit
jelmervdl Jul 28, 2023
c200c9c
Remove unreachable else
jelmervdl Jul 28, 2023
126587d
Remove debug code
jelmervdl Jul 28, 2023
106d832
Document and rename methods
jelmervdl Jul 28, 2023
b9ad9f6
Skip trainer backtrace test for now
jelmervdl Jul 28, 2023
b822e8c
Only print alignment info when spm_vocab is passed in
jelmervdl Aug 7, 2023
ef3c780
Make `retokenize` a little less `O(n^2)`
jelmervdl Aug 7, 2023
7069872
Replace placeholder-specific end-to-end tests with specific test for …
jelmervdl Aug 7, 2023
6b62198
Use `Path` in type signature of modifiers to resolve relative paths
jelmervdl Aug 9, 2023
7a80d2c
Rewrite end-to-end tests
jelmervdl Aug 9, 2023
9603208
Rewrite DatasetReader to not always produce n+1 lines
jelmervdl Aug 9, 2023
a2248ad
Add option for batch size
jelmervdl Aug 9, 2023
2f72e76
Add some comments to the tests
jelmervdl Aug 9, 2023
4779dd6
Fix missing sentencepiece dependency
jelmervdl Aug 9, 2023
a17af46
Fix other pyproject.toml entries while we're at it
jelmervdl Aug 9, 2023
2479f09
Make trainer skip lines that can't be processed by modifier
jelmervdl Aug 14, 2023
9 changes: 8 additions & 1 deletion README.md
@@ -165,21 +165,28 @@ modifiers:
#### Tags
Adds a placeholder tag to the source sentence that can be used by the model to hint how it should translate that word. The word to hint is chosen at random from the target sentence. Only words with a 1-to-1 mapping between source and target are considered.

This modifier needs a third column in the training data with per-word alignment information.
This modifier needs a third column in the training data with per-word (technically: space separated token) alignment information.
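For illustration, a training line with this alignment column might look like the following (made-up sentence pair and alignments; the columns are tab-separated):

```
This is a test .	Dies ist ein Test .	0-0 1-1 2-2 3-3 4-4
```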

```yaml
- Tags: 0.05
custom_detok_src: null
custom_detok_trg: zh
spm_vocab: path/to/vocab.enzh.spm
template: "__source__ {src} __target__ {trg} __done__"
```

All options are optional.

You can specify custom detokenizer languages using `custom_detok_src` and `custom_detok_trg` if the dataset you're reading from has been tokenized by the Moses tokenizer. This can be helpful for languages that do not use spaces to delimit words. The default tokenisation strategy is splitting/joining by spaces.

The `spm_vocab` option can be used to recompute the alignment info to match the tokenisation from the sentencepiece vocabulary. This is mostly useful for Marian, which takes untokenised input but expects the alignment info to match the sentencepiece tokenisation it performs.

The format for telling the translation model the intention to translate a word in a certain way can be controlled by `template`. Here `{src}` and `{trg}` are replaced by the selected words from the source and target side of the sentence pair.

**Note**: Due to how most modifiers are implemented, they will have a normalising effect on spaces. Sequences of spaces will be collapsed into a single space. This is also true for the *Tags* modifier.

**Note**: Even if the probability of the *Tags* modifier is set to 0, it will still apply detokenisation (and, if `spm_vocab` is given, re-computation of the alignment) to every sentence pair, regardless of whether that pair was picked to be modified.

#### Prefix
Prepends a random subsection of the target sentence before the source sentence.

Binary file added contrib/test-data/vocab.zhen.spm
Binary file not shown.
25 changes: 25 additions & 0 deletions src/opustrainer/alignments.py
@@ -0,0 +1,25 @@
from typing import Optional, List
from opustrainer.types import Pair, TokenList


def parse_alignments(pairs:str, src_tokens:Optional[TokenList]=None, trg_tokens:Optional[TokenList]=None) -> List[Pair]:
pairs = [
Pair(int(a), int(b)) for a, b in
(pair.split('-', maxsplit=1) for pair in pairs.split())
]

if src_tokens is not None and trg_tokens is not None:
invalid_pairs = [
pair
for pair in pairs
if pair.src < 0 or pair.src >= len(src_tokens)
or pair.trg < 0 or pair.trg >= len(trg_tokens)
]
if invalid_pairs:
raise ValueError('Out-of-bound alignment pairs: ' + ' '.join(map(repr, invalid_pairs)))
Contributor:

We don't want to raise during training; let's have all of those be a warn_once. I had a few training runs fail because a one-off error in the alignments will kill a whole training run. We should have a fallback for when the alignments are not accurate. (And for some reason, sometimes, fast align did produce invalid alignments...)

Contributor Author:

Hmm, I agree. What should we do with lines that have out-of-bound pairs? I think in that case we should just throw out all alignment pairs for that sentence, because there's no guarantee that the rest of them are not off-by-one or something.

That would automatically also exclude them from the Tags modifier, and their alignment info wouldn't be passed on to Marian, so Marian would not be trained with broken data (or worse, bork because of an index error).

Contributor:

I would throw out the lines, if we can do it cleanly. If alignment wasn't produced, this likely means the sentence is bad.

Contributor Author:

Okay, euhm. I can do that, but it has to happen at the modifier-is-applied-to-batch level in trainer.py because that's the only place we can remove lines at the moment. I'll change that.

There's a discussion somewhere with Graeme about changing modifiers to return a list of lines, which could then just add and remove lines. But that's probably too much to also add into this pull request.

Contributor:

or maybe then merge as is and start another PR aiming at sanitizing input.


return pairs


def format_alignments(pairs:List[Pair]) -> str:
return ' '.join(f'{pair.src}-{pair.trg}' for pair in pairs)
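For illustration, a minimal usage sketch of these two helpers (the token lists and pairs are made up):

```python
from opustrainer.alignments import parse_alignments, format_alignments

src_tokens = ['Hello', ',', 'world']
trg_tokens = ['Hallo', ',', 'Welt']

# With both token lists given, any out-of-bound pair raises a ValueError.
pairs = parse_alignments('0-0 1-1 2-2', src_tokens, trg_tokens)

# Round-trips back into the "src-trg src-trg ..." string format.
assert format_alignments(pairs) == '0-0 1-1 2-2'
```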
17 changes: 1 addition & 16 deletions src/opustrainer/modifiers/__init__.py
@@ -1,16 +1 @@
from abc import ABC, abstractmethod
from typing import Dict, Any, List


class Modifier(ABC):
probability: float

def __init__(self, probability:float, **kwargs:Dict[str,Any]):
self.probability = probability

def validate(self, context:List['Modifier']) -> None:
pass

@abstractmethod
def __call__(self, line: str) -> str:
pass
from opustrainer.types import Modifier
225 changes: 138 additions & 87 deletions src/opustrainer/modifiers/placeholders.py

Large diffs are not rendered by default.

76 changes: 76 additions & 0 deletions src/opustrainer/modifiers/retokenize.py
@@ -0,0 +1,76 @@
from typing import List, Protocol, Dict, NamedTuple, TypeVar, Callable, Union, Tuple, Optional, Any
from itertools import count

from opustrainer.types import Pair, TokenList, TokenMapping, Tokenizer, Detokenizer
from opustrainer.alignments import parse_alignments, format_alignments
from opustrainer.tokenizers import make_tokenizer, make_detokenizer
from opustrainer.modifiers import Modifier
from opustrainer import logger


def overlaps(r1:slice, r2:slice) -> bool:
"""True if slice 1 (partially or fully) overlaps with slice 2."""
# (a,b), (x,y) = r1, r2
# [a b] | a < y | x < b
# [x y] = F | F | T
# [x y] = T | T | T
# [x y] = T | T | T
# [x y] = T | T | T
# [x y] = F | T | F
return r1.start < r2.stop and r2.start < r1.stop


class Retokenizer(NamedTuple):
detokenizer: Detokenizer
tokenizer: Tokenizer

def retokenize(self, tokens:TokenList) -> Tuple[str,TokenList,TokenMapping]:
detokenized, old_token_spans = self.detokenizer.detokenize(tokens)
new_tokens, new_token_spans = self.tokenizer.tokenize(detokenized)

old_to_new_mapping = [[] for _ in range(len(old_token_spans))]

#TODO: This can be done much more efficiently
for i, old_token_span in enumerate(old_token_spans):
for j, new_token_span in enumerate(new_token_spans):
if overlaps(old_token_span, new_token_span):
old_to_new_mapping[i].append(j)

return detokenized, new_tokens, old_to_new_mapping


def make_retokenizer(spec:Dict[str,str]) -> Retokenizer:
return Retokenizer(
detokenizer=make_detokenizer(spec.get('detokenize', 'spaces')),
tokenizer=make_tokenizer(spec.get('tokenize', 'spaces'))
)


def compute_mapping(src_mapping:TokenMapping, trg_mapping:TokenMapping, alignments:List[Pair]) -> List[Pair]:
remapped = set()
for old_src_idx, old_trg_idx in alignments:
for src_idx in src_mapping[old_src_idx]:
for trg_idx in trg_mapping[old_trg_idx]:
remapped.add(Pair(src_idx, trg_idx))
return sorted(remapped)


class RetokenizeModifier(Modifier):
src: Retokenizer
trg: Retokenizer

def __init__(self, probability: float=0.0, src:dict=dict(), trg:dict=dict()):
super().__init__(probability) # probability is very much ignored lol.
self.src = make_retokenizer(src)
self.trg = make_retokenizer(trg)

def __call__(self, line:str) -> str:
src, trg, alignments = line.split('\t')
src_tokens = src.split()
trg_tokens = trg.split()
pairs = parse_alignments(alignments, src_tokens, trg_tokens)
new_src, new_src_tokens, src_mapping = self.src.retokenize(src_tokens)
new_trg, new_trg_tokens, trg_mapping = self.trg.retokenize(trg_tokens)
remapped_pairs = compute_mapping(src_mapping, trg_mapping, pairs)
return '\t'.join((new_src, new_trg, format_alignments(remapped_pairs)))
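A hedged sketch of using the modifier on its own, assuming Moses-tokenised English-Chinese input and a placeholder vocab path (neither appears in this diff):

```python
from opustrainer.modifiers.retokenize import RetokenizeModifier

# Detokenise the Moses tokens back to plain text, then re-tokenise with the
# sentencepiece vocab so the alignment indices match what Marian will see.
retok = RetokenizeModifier(
    src={'detokenize': 'moses:en', 'tokenize': 'spm:path/to/vocab.zhen.spm'},
    trg={'detokenize': 'moses:zh', 'tokenize': 'spm:path/to/vocab.zhen.spm'},
)

line = 'Hello world\t你好 世界\t0-0 1-1'
# Returns "<detokenised src>\t<detokenised trg>\t<alignments remapped to spm pieces>".
new_line = retok(line)
```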

118 changes: 118 additions & 0 deletions src/opustrainer/tokenizers.py
@@ -0,0 +1,118 @@
import re
from typing import Tuple, List, TypeVar, Callable, Union, Dict, Optional

import sacremoses
from sentencepiece import SentencePieceProcessor

from opustrainer.types import TokenList, Tokenizer, Detokenizer


DETOKENIZERS = {
'moses': lambda lang: MosesDetokenizer(lang),
'spaces': lambda: SpaceDetokenizer(),
}

TOKENIZERS = {
'moses': lambda lang: MosesTokenizer(lang),
'spm': lambda vocab: SentencePieceTokenizer(vocab),
'spaces': lambda: SpaceTokenizer(),
}


class SpaceTokenizer:
def tokenize(self, text:str) -> Tuple[TokenList, List[slice]]:
tokens: TokenList = []
spans: List[slice] = []
for match in re.finditer(r'[^\s]+', text):
tokens.append(match.group(0))
spans.append(slice(match.start(0), match.end(0)))
return tokens, spans


class SpaceDetokenizer:
def detokenize(self, tokens:TokenList) -> Tuple[str,List[slice]]:
spans = []
offset = 0
for token in tokens:
spans.append(slice(offset, offset + len(token)))
offset += len(token) + 1 # space
return ' '.join(tokens), spans


class MosesTokenizer:
tokenizer: sacremoses.MosesTokenizer

def __init__(self, lang:str, custom_nonbreaking_prefixes:Optional[str]=None):
self.tokenizer = sacremoses.MosesTokenizer(lang, custom_nonbreaking_prefixes)

def tokenize(self, text:str) -> Tuple[TokenList, List[slice]]:
tokens = self.tokenizer.tokenize(text, escape=False)
spans: List[slice] = []
offset = 0
for token in tokens:
offset = text.find(token, offset)
if offset == -1:
raise RuntimeError(f"Could not find token '{token}' in original text")
spans.append(slice(offset, offset + len(token)))
offset += len(token)
return tokens, spans


class MosesDetokenizer:
detokenizer: sacremoses.MosesDetokenizer

def __init__(self, lang:str):
self.detokenizer = sacremoses.MosesDetokenizer(lang)

def detokenize(self, tokens:TokenList) -> Tuple[str,List[slice]]:
text = self.detokenizer.detokenize(tokens)
spans = []
offset = 0
for token in tokens:
offset = text.find(token, offset)
if offset == -1:
raise RuntimeError(f"Could not find token '{token}' in detokenized text")
spans.append(slice(offset, offset + len(token)))
offset += len(token)
return text, spans


class SentencePieceTokenizer:
spm: SentencePieceProcessor

def __init__(self, vocab:str):
self.spm = SentencePieceProcessor(vocab)

def tokenize(self, text:str) -> Tuple[TokenList,List[slice]]:
# interestingly, piece.begin and piece.end are unicode offsets, not byte
# offsets as the documentation would suggest. When byte-fallback happens,
# there will be pieces where piece.begin and piece.end are the same value
# but they are technically necessary to encode the following pieces.
# e.g:
# > x.encode('🤣', out_type='immutable_proto').pieces
# { piece: "▁" id: 275 surface: "" begin: 0 end: 0 }
# { piece: "<0xF0>" id: 247 surface: "" begin: 0 end: 0 }
# { piece: "<0x9F>" id: 166 surface: "" begin: 0 end: 0 }
# { piece: "<0xA4>" id: 171 surface: "" begin: 0 end: 0 }
# { piece: "<0xA3>" id: 170 surface: "🤣" begin: 0 end: 1 }
# > x.decode([247,166,171,170])
# '🤣'
spans = [
slice(piece.begin, piece.end)
for piece in self.spm.encode(text.encode(), out_type='immutable_proto').pieces
]
tokens = [text[span] for span in spans]
return tokens, spans


T = TypeVar('T', bound=Union[Tokenizer,Detokenizer])

def _make(implementations: Dict[str,Callable[...,T]], spec:str) -> T:
name, *args = spec.split(':')
return implementations[name](*args)

def make_detokenizer(spec:str) -> Detokenizer:
return _make(DETOKENIZERS, spec)

def make_tokenizer(spec:str) -> Tokenizer:
return _make(TOKENIZERS, spec)
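For illustration, the spec strings accepted by these factories are a name optionally followed by `:argument` (the vocab path below is a placeholder):

```python
from opustrainer.tokenizers import make_tokenizer, make_detokenizer

tok = make_tokenizer('moses:en')     # MosesTokenizer('en')
detok = make_detokenizer('spaces')   # SpaceDetokenizer()
spm = make_tokenizer('spm:path/to/vocab.zhen.spm')  # SentencePieceTokenizer(...)

# Both interfaces return tokens/text plus character spans into the text;
# retokenize.py uses the spans to map old token indices onto new ones.
tokens, spans = tok.tokenize('Hello, world!')
text, spans = detok.detokenize(tokens)
```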
14 changes: 7 additions & 7 deletions src/opustrainer/trainer.py
@@ -26,6 +26,7 @@
from opustrainer.modifiers.surface import UpperCaseModifier, TitleCaseModifier
from opustrainer.modifiers.placeholders import PlaceholderTagModifier
from opustrainer.modifiers.typos import TypoModifier
from opustrainer.modifiers.retokenize import RetokenizeModifier
from opustrainer import logger

def ignore_sigint():
@@ -36,18 +37,15 @@ def ignore_sigint():
signal.signal(signal.SIGINT, signal.SIG_IGN)


# Path to something that can shuffle data. Called with seed, output-path, input-files
# TODO: Ideally this also deduplicates the src side of the sentence pairs it shuffles ;)
PATH_TO_SHUFFLE = os.path.dirname(os.path.realpath(__file__)) + "/shuffle.py"

# Available batch modifiers
# TODO: Import these lazy, on demand?
MODIFIERS = {
'UpperCase': UpperCaseModifier,
'TitleCase': TitleCaseModifier,
'Tags': PlaceholderTagModifier,
'Typos': TypoModifier,
'Prefix': PrefixModifier
'Prefix': PrefixModifier,
'Retokenize': RetokenizeModifier,
}
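For context, a hedged sketch of how the newly registered `Retokenize` modifier might be configured in a training YAML, assuming its extra keys are passed through as keyword arguments in the same way as for `Tags` (this diff does not document it in the README):

```yaml
modifiers:
  - Retokenize: 0.0
    src:
      detokenize: moses:en
      tokenize: spm:path/to/vocab.zhen.spm
    trg:
      detokenize: moses:zh
      tokenize: spm:path/to/vocab.zhen.spm
```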

@dataclass(frozen=True)
@@ -171,7 +169,8 @@ def _open(self):
# feasible to just write to a named pipe (or even stdout) instead of
# a temporary file, and let the trainer read directly from that. Not
# sure if that has any performance or stability benefits/drawbacks.
subprocess.check_call([sys.executable, PATH_TO_SHUFFLE,
subprocess.check_call([sys.executable,
'-m', 'opustrainer.shuffle',
*(['--temporary-directory', self.tmpdir] if self.tmpdir else []),
*([] if self.shuffle else ['--no-shuffle']),
str(self.seed),
@@ -247,7 +246,8 @@ def _open_async(self, seed:int):
self._pending = ShuffledFile(
seed=seed,
file=cast(TextIO, fh),
proc=subprocess.Popen([sys.executable, PATH_TO_SHUFFLE,
proc=subprocess.Popen([sys.executable,
'-m', 'opustrainer.shuffle',
*(['--temporary-directory', self.tmpdir] if self.tmpdir else []),
*([] if self.shuffle else ['--no-shuffle']),
str(seed),
56 changes: 56 additions & 0 deletions src/opustrainer/types.py
@@ -0,0 +1,56 @@
from abc import ABC, abstractmethod
from typing import NamedTuple, Dict, List, Tuple, Optional, Any, Protocol


TokenList = List[str] # todo: bytes?

TokenMapping = List[List[int]]


class Tokenizer(Protocol):
"""Turns a string into a list of tokens"""
def tokenize(self, text:str) -> Tuple[TokenList,List[slice]]:
...


class Detokenizer(Protocol):
"""Turns a list of tokens into a string"""
def detokenize(self, tokens:TokenList) -> Tuple[str, List[slice]]:
...


class Pair(NamedTuple):
"""Alignment pair between source and target token indices"""
src:int
trg:int


class SentencePair(NamedTuple):
"""Semantic representation of a single line from a data source."""
src: TokenList
trg: TokenList

# alignments is an empty list if alignment data is available in the dataset
# but there are no aligned tokens in this pair. It is None if this dataset
# does not have alignment info.
alignments: Optional[List[Pair]]


class Modifier(ABC):
"""Line modifier"""
probability: float

def __init__(self, probability:float, **kwargs:Dict[str,Any]):
self.probability = probability

def validate(self, context:List['Modifier']) -> None:
"""Opportunity for the modifier to see where in the modifier list it is
placed and flag any issues to the logger. E.g. if you place a modifier that
inserts special tokens before an UpperCase modifier, the latter might
modify those special tokens as well. Here you can shout about that.
"""
pass

@abstractmethod
def __call__(self, line: str) -> str:
pass
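
For illustration, a minimal sketch of a modifier built on this ABC (not part of the PR; whether the roll against `probability` happens in the trainer or inside the modifier is not shown in this excerpt):

```python
from opustrainer.types import Modifier

class LowerCaseModifier(Modifier):
    """Hypothetical modifier that lowercases the whole line."""

    def __call__(self, line: str) -> str:
        # Transform unconditionally; self.probability is stored by the base
        # class for whichever component decides when to apply the modifier.
        return line.lower()

mod = LowerCaseModifier(0.05)
assert mod('HELLO\tWORLD') == 'hello\tworld'
```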