diff --git a/tools/schemagen.py b/tools/schemagen.py index 975e21b..ced53af 100644 --- a/tools/schemagen.py +++ b/tools/schemagen.py @@ -2,7 +2,7 @@ # schemagen.py -- 雙拼+輔助碼 Rime 方案生成工具 # -# Copyright (c) 2023 ksqsf +# Copyright (c) 2023-2024 ksqsf # # License: GPLv3, with the exception that the copyright of any # generated output belongs to the user. (所生成碼表本身的著作權歸本程 @@ -217,15 +217,22 @@ def initialize_charset(): charset.append(line[0]) -def encode_fixed_word(word, pinyin=None): +def encode_fixed_word(word, pinyin=None, short=False): assert len(word) > 1 + if ',' in word: + word = word.replace(',', '') if not pinyin: pinyin = word_to_pinyin(word) double_pinyin = to_double_pinyin(pinyin).split() if len(word) == 2: - return ''.join(double_pinyin) + if not short: + return ''.join(double_pinyin) + else: + return double_pinyin[0][0] + double_pinyin[1][0] elif len(word) == 3: - if args.aabc: + if short: + return double_pinyin[0][0] + double_pinyin[1][0] + double_pinyin[2][0] + elif args.aabc: return double_pinyin[0] + double_pinyin[1][0] + double_pinyin[2][0] else: return double_pinyin[0][0] + double_pinyin[1][0] + double_pinyin[2] @@ -252,12 +259,10 @@ def handle_gen_fixed(): table = defaultdict(list) encoded = defaultdict(list) - def put_into_dict(word, code, max_len=4): + + def put_into_dict_char(word, code, py, max_len=4): nonlocal table, encoded - # 詞總是使用四碼 - if len(word) > 1: - table[code].append(word) - return + assert len(word) == 1 # 這個字有多個編碼,但如果某個已有編碼是當前編碼的前綴,則不再添加該額外編碼 for existing_code in encoded[word]: if code.startswith(existing_code): @@ -272,6 +277,47 @@ def put_into_dict(word, code, max_len=4): # 一簡到三簡已經全部用完 table[code].append(word) + def put_into_dict_word(word, code, pinyin, max_len=4): + nonlocal table + assert len(word) > 1 + + # 不使用簡詞時,詞總是四碼 + if not args.short_word: + table[code].append(word) + return + + # 使用簡詞時,詞語嘗試多種編碼方式 + # - 一簡 + # - n字詞嘗試n簡 + # - 二字詞嘗試取前三碼 + # - fallback: 全碼 + short_codes = [] # all(c < 4 for c in short_codes) + # 1. 一簡 + short_codes.append(code[0]) + # 2. 字數匹配的簡碼 + short_code = encode_fixed_word(word, pinyin, True) + if len(short_code) < 4: + short_codes.append(short_code) + # 3. 前三碼 + if len(word) == 2: + short_codes.append(code[:3]) + + # 放入簡碼 + tolerance = dict(zip([1,2,3], (int(s) for s in args.tolerance.split(',')))) + for c in short_codes: + if len(table[c]) < tolerance[len(c)]: + table[c].append(word) + return + + # 沒放進去,只能放到全碼位上 + table[code].append(word) + + def put_into_dict(word, code, py, max_len=4): + if len(word) == 1: + put_into_dict_char(word, code, py, max_len) + else: + put_into_dict_word(word, code, py, max_len) + # 放入單字 words = [] for c in charset: @@ -279,7 +325,7 @@ def put_into_dict(word, code, max_len=4): for ac in to_auxiliary_codes(c): try: w = pinyin_weight(c, py) - words.append((w, c, to_double_pinyin(py)+ac)) + words.append((w, c, to_double_pinyin(py)+ac, py)) except: traceback.print_exc() @@ -287,16 +333,20 @@ def put_into_dict(word, code, max_len=4): for (word, pinyin, weight) in read_input_dict(): if len(word) > 1: try: - words.append((pinyin_weight(word, pinyin), word, encode_fixed_word(word, pinyin))) - # for code in encode_fixed_word_sunshine_strategy(word, pinyin): - # words.append((pinyin_weight(word, pinyin), word, code)) + code = encode_fixed_word(word, pinyin, False) + assert len(code) == 4 + words.append((pinyin_weight(word, pinyin), + word, + code, + pinyin)) except: traceback.print_exc() + pass # 降序將所有字詞放入碼表 words.sort(key=itemgetter(0), reverse=True) - for (w, word, code) in words: - put_into_dict(word, code) + for (w, word, code, py) in words: + put_into_dict(word, code, py) # 輸出碼表 print_table(table) @@ -432,6 +482,7 @@ def handle_update_char_weight(): gen_fixed.add_argument('--format', choices=['code-words', 'code-word', 'word-code', 'word-codes'], help='輸出碼表的格式', default='code-words') gen_fixed.add_argument('--tolerance', help='每級簡碼最多可以容納多少候選', default='1,1,1') gen_fixed.add_argument('--aabc', action='store_true', default=False, help='三碼字使用 AABC 方式編碼') +gen_fixed.add_argument('--short-word', action='store_true', help='生成簡詞', default=False) update_compact_dict = subparsers.add_parser('update-compact-dict', help='更新 *compact* 詞庫中的輔助碼爲新輔助碼') update_compact_dict.add_argument('--rime-dict', help='輸入rime格式詞庫(無frontmatter)', required=True) @@ -439,6 +490,8 @@ def handle_update_char_weight(): update_char_weight = subparsers.add_parser('update-char-weight', help='更新 chars 詞庫中的詞頻') update_char_weight.add_argument('--rime-dict', help='輸入rime格式詞庫', required=True) +args = parser.parse_args(['gen-fixed']) + if __name__ == '__main__': args = parser.parse_args() if args.command == 'gen-chars':