Skip to content

Commit

Permalink
tools: 增加簡詞計算
Browse files Browse the repository at this point in the history
  • Loading branch information
ksqsf committed Jan 3, 2024
1 parent 055e3c9 commit 699ff91
Showing 1 changed file with 68 additions and 15 deletions.
83 changes: 68 additions & 15 deletions tools/schemagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# schemagen.py -- 雙拼+輔助碼 Rime 方案生成工具
#
# Copyright (c) 2023 ksqsf
# Copyright (c) 2023-2024 ksqsf
#
# License: GPLv3, with the exception that the copyright of any
# generated output belongs to the user. (所生成碼表本身的著作權歸本程
Expand Down Expand Up @@ -217,15 +217,22 @@ def initialize_charset():
charset.append(line[0])


def encode_fixed_word(word, pinyin=None):
def encode_fixed_word(word, pinyin=None, short=False):
assert len(word) > 1
if ',' in word:
word = word.replace(',', '')
if not pinyin:
pinyin = word_to_pinyin(word)
double_pinyin = to_double_pinyin(pinyin).split()
if len(word) == 2:
return ''.join(double_pinyin)
if not short:
return ''.join(double_pinyin)
else:
return double_pinyin[0][0] + double_pinyin[1][0]
elif len(word) == 3:
if args.aabc:
if short:
return double_pinyin[0][0] + double_pinyin[1][0] + double_pinyin[2][0]
elif args.aabc:
return double_pinyin[0] + double_pinyin[1][0] + double_pinyin[2][0]
else:
return double_pinyin[0][0] + double_pinyin[1][0] + double_pinyin[2]
Expand All @@ -252,12 +259,10 @@ def handle_gen_fixed():

table = defaultdict(list)
encoded = defaultdict(list)
def put_into_dict(word, code, max_len=4):

def put_into_dict_char(word, code, py, max_len=4):
nonlocal table, encoded
# 詞總是使用四碼
if len(word) > 1:
table[code].append(word)
return
assert len(word) == 1
# 這個字有多個編碼,但如果某個已有編碼是當前編碼的前綴,則不再添加該額外編碼
for existing_code in encoded[word]:
if code.startswith(existing_code):
Expand All @@ -272,31 +277,76 @@ def put_into_dict(word, code, max_len=4):
# 一簡到三簡已經全部用完
table[code].append(word)

def put_into_dict_word(word, code, pinyin, max_len=4):
    """Place a multi-character word into the code table.

    Without ``args.short_word`` the word is always appended under its
    full ``code``.  With it, shorter candidate codes are tried first,
    in this order:

    1. the first character of ``code``;
    2. the abbreviation returned by ``encode_fixed_word(word, pinyin, True)``,
       kept only when it is shorter than four characters;
    3. for two-character words, the first three characters of ``code``.

    The word goes into the first candidate whose bucket holds fewer
    entries than the limit that ``args.tolerance`` assigns to that
    candidate's length; if none has room, it falls back to ``code``.

    ``max_len`` is accepted for signature parity with the
    single-character routine and is not used here.
    """
    assert len(word) > 1

    if args.short_word:
        # Collect the candidate short codes, shortest strategy first.
        candidates = [code[0]]
        abbreviation = encode_fixed_word(word, pinyin, True)
        if len(abbreviation) < 4:
            candidates.append(abbreviation)
        if len(word) == 2:
            candidates.append(code[:3])

        # Per-length capacity: the comma-separated values of --tolerance
        # are paired with code lengths 1, 2 and 3.
        limits = {
            length: int(field)
            for length, field in zip((1, 2, 3), args.tolerance.split(','))
        }
        for candidate in candidates:
            bucket = table[candidate]
            if len(bucket) < limits[len(candidate)]:
                bucket.append(word)
                return

    # Short words disabled, or no short slot had room: use the full code.
    table[code].append(word)

def put_into_dict(word, code, py, max_len=4):
    """Dispatch an entry to the placement routine matching its length.

    Entries of length one go to ``put_into_dict_char``; everything else
    goes to ``put_into_dict_word``.  All arguments are forwarded as-is.
    """
    handler = put_into_dict_char if len(word) == 1 else put_into_dict_word
    handler(word, code, py, max_len)

# 放入單字
words = []
for c in charset:
for py in pinyin_table[c].keys():
for ac in to_auxiliary_codes(c):
try:
w = pinyin_weight(c, py)
words.append((w, c, to_double_pinyin(py)+ac))
words.append((w, c, to_double_pinyin(py)+ac, py))
except:
traceback.print_exc()

# 再放入詞語
for (word, pinyin, weight) in read_input_dict():
if len(word) > 1:
try:
words.append((pinyin_weight(word, pinyin), word, encode_fixed_word(word, pinyin)))
# for code in encode_fixed_word_sunshine_strategy(word, pinyin):
# words.append((pinyin_weight(word, pinyin), word, code))
code = encode_fixed_word(word, pinyin, False)
assert len(code) == 4
words.append((pinyin_weight(word, pinyin),
word,
code,
pinyin))
except:
traceback.print_exc()
pass

# 降序將所有字詞放入碼表
words.sort(key=itemgetter(0), reverse=True)
for (w, word, code) in words:
put_into_dict(word, code)
for (w, word, code, py) in words:
put_into_dict(word, code, py)

# 輸出碼表
print_table(table)
Expand Down Expand Up @@ -432,13 +482,16 @@ def handle_update_char_weight():
# Options of the `gen-fixed` subcommand.
# --format: layout of the emitted code table (one of the four listed choices).
gen_fixed.add_argument('--format', choices=['code-words', 'code-word', 'word-code', 'word-codes'], help='輸出碼表的格式', default='code-words')
# --tolerance: comma-separated candidate limits per short-code level;
# put_into_dict_word pairs the values with code lengths 1, 2 and 3.
gen_fixed.add_argument('--tolerance', help='每級簡碼最多可以容納多少候選', default='1,1,1')
# --aabc: for three-character words, encode_fixed_word emits the full first
# syllable followed by the first code of the second and third syllables.
gen_fixed.add_argument('--aabc', action='store_true', default=False, help='三碼字使用 AABC 方式編碼')
# --short-word: lets put_into_dict_word try abbreviated codes before the full code.
gen_fixed.add_argument('--short-word', action='store_true', help='生成簡詞', default=False)

# `update-compact-dict` subcommand: takes a Rime-format dictionary via --rime-dict (required).
update_compact_dict = subparsers.add_parser('update-compact-dict', help='更新 *compact* 詞庫中的輔助碼爲新輔助碼')
update_compact_dict.add_argument('--rime-dict', help='輸入rime格式詞庫(無frontmatter)', required=True)

# `update-char-weight` subcommand: takes a Rime-format dictionary via --rime-dict (required).
update_char_weight = subparsers.add_parser('update-char-weight', help='更新 chars 詞庫中的詞頻')
update_char_weight.add_argument('--rime-dict', help='輸入rime格式詞庫', required=True)

# Parse a fixed ['gen-fixed'] argv at module level so the global `args` is
# always bound; the __main__ guard below re-parses the real command line.
# NOTE(review): presumably this exists so functions that read `args` work
# when the module is imported rather than run as a script — confirm.
args = parser.parse_args(['gen-fixed'])

if __name__ == '__main__':
args = parser.parse_args()
if args.command == 'gen-chars':
Expand Down

0 comments on commit 699ff91

Please sign in to comment.