-
Notifications
You must be signed in to change notification settings - Fork 20
/
correction_basic.py
72 lines (59 loc) · 2.36 KB
/
correction_basic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# @Author:sunshine
# @Time : 2020/5/11 上午9:12
"""
利用语言模型提供的置信度+字音和字形生成的相似度,加权输出一个分数,选取分数最高的候选字符
"""
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
import numpy as np
from tools.char_sim import CharFuncs
# Locations of the pretrained Chinese BERT files: model config, checkpoint
# weights and vocabulary.
# NOTE(review): these are absolute, machine-specific paths — adjust them to
# wherever the pretrained model lives in your environment.
config_path = '/home/chenbing/pretrain_models/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/home/chenbing/pretrain_models/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
vocab_path = '/home/chenbing/pretrain_models/bert/chinese_L-12_H-768_A-12/vocab.txt'
# Number of highest-probability candidate tokens kept per position in
# text_correction.
topk = 3
class OurTokenizer(Tokenizer):
    """Tokenizer that emits exactly one token per input character."""

    def _tokenize(self, text):
        """Return a list with one token for every character of ``text``.

        A character found in the vocabulary is kept as-is, a whitespace
        character becomes ``'[unused1]'`` and anything else becomes
        ``'[UNK]'``.
        """
        def to_token(ch):
            # Vocabulary membership is checked first, so a whitespace
            # character that is itself in the vocabulary is left untouched.
            if ch in self._token_dict:
                return ch
            return '[unused1]' if self._is_space(ch) else '[UNK]'

        return [to_token(ch) for ch in text]
# Module-level setup: everything below runs once, when the module is loaded.
tokenizer = OurTokenizer(vocab_path)
model = build_transformer_model(
    config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True
)  # Build the model and load the weights
model.summary()
# Character-similarity helper used by text_correction; the metadata file is
# given as a relative path, so it is resolved against the working directory.
C = CharFuncs('data/char_meta.txt')
def text_correction(text):
    """Return ``text`` with characters replaced by better-scoring candidates.

    The text is tokenized with the module-level ``tokenizer`` and passed
    through the module-level masked-LM ``model``. For every position the
    ``topk`` most probable vocabulary entries are taken as candidates. When
    the most probable candidate differs from the current character, each
    candidate is scored as ``0.6 * probability + 0.4 * similarity`` (the
    similarity comes from ``C.similarity``) and the best-scoring candidate
    replaces the character. Otherwise the character is kept.

    Args:
        text: The string to correct.

    Returns:
        The corrected string, built from the tokens between the first and
        the last token of the tokenized input.
    """
    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.tokens_to_ids(tokens)
    segment_ids = [0] * len(token_ids)

    # The first and last rows are dropped to line up with tokens[1:-1]
    # (presumably the start/end markers added by the tokenizer — confirm).
    probs = model.predict([[token_ids], [segment_ids]])[0][1:-1]
    # Negating the probabilities makes argsort order them from high to low.
    topk_probs_index = np.argsort(-probs, axis=1)[:, :topk]

    corrected = []
    for candidate_probs, candidate_probs_index, char in zip(
            probs, topk_probs_index, tokens[1:-1]):
        candidate = tokenizer.decode(candidate_probs_index)

        # Keep the character when the model's top choice already agrees with
        # it, or when decoding produced no usable candidate at all.
        if not candidate or candidate[0] == char:
            corrected.append(char)
            continue

        candidate_prob_topk = candidate_probs[candidate_probs_index]
        # NOTE(review): this pairing assumes every candidate id decodes to
        # exactly one character — confirm for non-Chinese vocabulary entries.
        scores = []
        for c, b in zip(candidate, candidate_prob_topk):
            sim = C.similarity(char, c, weights=(0.8, 0.2, 0.0))
            # Store the candidate itself; storing the original character here
            # would make every "correction" a no-op.
            scores.append((0.6 * b + 0.4 * sim, c))

        # max() returns the first entry with the highest score, which is the
        # same pick a stable descending sort would put first.
        corrected.append(max(scores, key=lambda item: item[0])[1])

    return ''.join(corrected)
if __name__ == '__main__':
    # Small demo: run the corrector on one sample sentence and print the
    # result.
    sample = '专家公步虎门大桥涡振原因'
    corrected = text_correction(sample)
    print(corrected)