-
Notifications
You must be signed in to change notification settings - Fork 0
/
easyBPE.py
147 lines (114 loc) · 4.22 KB
/
easyBPE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
EXTRACTED AND MODIFIED FROM apply_bpe.py
Use operations learned with learn_bpe.py to encode a new text.
The text will not be smaller, but use only a fixed vocabulary, with rare words
encoded as variable-length sequences of subword units.
Reference:
Rico Sennrich, Barry Haddow and Alexandra Birch (2015). Neural Machine Translation of Rare Words with Subword Units.
Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
"""
from __future__ import unicode_literals, division
import sys
import os
import inspect
import codecs
import io
import re
import warnings
def applyBPE(self, word):
    """Segment a single word into BPE subword units.

    Runs ``encode`` on ``word`` using the merge table, reverse table,
    separator, version and cache stored on ``self``.  Every unit except the
    last has ``self.separator`` appended to mark that the word continues.

    Returns:
        A list of strings, one per subword unit.
    """
    units = encode(
        word,
        self.bpe_codes,
        self.bpe_codes_reverse,
        self.separator,
        self.version,
        self.cache,
    )
    # Non-final units carry the continuation marker; the final one stays bare.
    segmented = [unit + self.separator for unit in units[:-1]]
    segmented.append(units[-1])
    return segmented
def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    ``word`` is a sequence of symbols (normally a tuple of variable-length
    strings).  Each element of the result is a ``(left, right)`` tuple of two
    neighbouring symbols.  A word with fewer than two symbols has no
    neighbours, so the result is an empty set.
    """
    # Zipping the word with itself shifted by one yields every neighbouring
    # pair; zip stops at the shorter input, so empty and one-symbol words
    # produce nothing instead of failing on word[0].
    return set(zip(word, word[1:]))
def encode(orig, bpe_codes, bpe_codes_reverse, separator, version, cache):
    """Encode a word by applying BPE merge operations in priority order.

    Args:
        orig: the word to segment (a string).
        bpe_codes: dict mapping a ``(left, right)`` symbol pair to its merge
            rank; among the pairs present in the word, the one with the
            lowest rank is merged first.
        bpe_codes_reverse: not used by this function; kept so the signature
            stays compatible with existing callers.
        separator: not used by this function; kept for the same reason.
        version: ``(0, 1)`` appends the end-of-word marker ``</w>`` as its own
            symbol; ``(0, 2)`` attaches it to the final character.
        cache: dict mapping already-encoded words to their segmentation; it is
            consulted first and updated with the result.

    Returns:
        A tuple of subword strings with the end-of-word marker removed.  When
        the word offers no pair to merge (it is empty or forms a single
        symbol), ``orig`` itself is returned unchanged and nothing is cached.

    Raises:
        NotImplementedError: if ``version`` is neither ``(0, 1)`` nor
            ``(0, 2)``.
    """
    if orig in cache:
        return cache[orig]

    end_of_word = '</w>'
    if version == (0, 1):
        word = tuple(orig) + (end_of_word,)
    elif version == (0, 2):  # more consistent handling of word-final segments
        if not orig:
            # There is no final character to attach the marker to; return the
            # empty word as-is, exactly as version (0, 1) ends up doing.
            return orig
        word = tuple(orig[:-1]) + (orig[-1] + end_of_word,)
    else:
        raise NotImplementedError

    # Set of neighbouring symbol pairs currently present in the word.
    pairs = set(zip(word, word[1:]))
    if not pairs:
        return orig

    while True:
        # Pairs without a learned merge rank sort last.
        bigram = min(pairs, key=lambda pair: bpe_codes.get(pair, float('inf')))
        if bigram not in bpe_codes:
            break
        first, second = bigram

        # Rebuild the word, fusing every occurrence of (first, second).
        new_word = []
        i = 0
        while i < len(word):
            try:
                j = word.index(first, i)
            except ValueError:
                # No further occurrence of `first`: copy the tail verbatim.
                new_word.extend(word[i:])
                break
            new_word.extend(word[i:j])
            i = j
            # word[i] is `first` here; merge only when `second` follows it.
            if i < len(word) - 1 and word[i + 1] == second:
                new_word.append(first + second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1

        word = tuple(new_word)
        if len(word) == 1:
            break
        pairs = set(zip(word, word[1:]))

    # don't print end-of-word symbols
    if word[-1] == end_of_word:
        word = word[:-1]
    elif word[-1].endswith(end_of_word):
        # Strip only the trailing marker so that a literal '</w>' occurring
        # earlier in the final unit is preserved.
        word = word[:-1] + (word[-1][:-len(end_of_word)],)

    cache[orig] = word
    return word
class BPE(object):
    """Merge table and settings needed to apply learned BPE codes."""

    def __init__(self, codesFile, merges=-1, separator='@@'):
        """Load merge operations from a BPE codes file.

        Args:
            codesFile: path to a UTF-8 codes file.  An optional first line
                starting with ``#version:`` gives the format version; every
                other line holds one merge as two subword units separated by
                a single space.
            merges: number of merge operations to read from the file, or
                ``-1`` (the default) to read all of them.
            separator: string stored as ``self.separator``; defaults to
                ``'@@'``.

        Sets ``version``, ``bpe_codes`` (pair -> rank), ``bpe_codes_reverse``
        (merged string -> pair), ``separator`` and an empty ``cache``.

        Writes an error to stderr and exits with status 1 if a line does not
        consist of exactly two units.
        """
        # The context manager closes the file handle on every exit path.
        with codecs.open(codesFile, encoding='utf-8') as codes:
            # 1-based line number of the first merge line, for error messages.
            offset = 1

            # check version information
            firstline = codes.readline()
            if firstline.startswith('#version:'):
                # Trailing ".0" components are dropped before parsing.
                version_text = re.sub(r'(\.0+)*$', '', firstline.split()[-1])
                self.version = tuple(int(x) for x in version_text.split("."))
                offset += 1
            else:
                self.version = (0, 1)
                # The first line was a merge, not a header: rewind to reread it.
                codes.seek(0)

            merge_list = [
                tuple(item.strip('\r\n ').split(' '))
                for (n, item) in enumerate(codes)
                if (n < merges or merges == -1)
            ]

        for i, item in enumerate(merge_list):
            if len(item) != 2:
                sys.stderr.write('Error: invalid line {0} in BPE codes file: {1}\n'.format(i+offset, ' '.join(item)))
                sys.stderr.write('The line should exist of exactly two subword units, separated by whitespace\n')
                sys.exit(1)

        # some hacking to deal with duplicates (only consider first instance):
        # walking the list backwards lets the earliest rank overwrite later ones
        self.bpe_codes = {
            code: i for (i, code) in reversed(list(enumerate(merge_list)))
        }
        self.bpe_codes_reverse = {
            pair[0] + pair[1]: pair for pair in self.bpe_codes
        }
        self.separator = separator
        self.cache = {}
# Initialisation
if __name__ == '__main__':
    # The path to the BPE codes file is taken from the first command-line
    # argument; without it there is nothing to load.
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: {0} <bpe_codes_file>\n'.format(sys.argv[0]))
        sys.exit(1)
    codesFile = sys.argv[1]
    bpe = BPE(codesFile)