tools: 增加簡詞計算

rimeinn · Jan 3, 2024 · 699ff91 · 699ff91
1 parent 055e3c9
commit 699ff91
Showing 1 changed file with 68 additions and 15 deletions.
diff --git a/tools/schemagen.py b/tools/schemagen.py
@@ -2,7 +2,7 @@
 
 # schemagen.py -- 雙拼+輔助碼 Rime 方案生成工具
 #
-# Copyright (c) 2023 ksqsf
+# Copyright (c) 2023-2024 ksqsf
 #
 # License: GPLv3, with the exception that the copyright of any
 # generated output belongs to the user. (所生成碼表本身的著作權歸本程
@@ -217,15 +217,22 @@ def initialize_charset():
             charset.append(line[0])
 
 
-def encode_fixed_word(word, pinyin=None):
+def encode_fixed_word(word, pinyin=None, short=False):
     assert len(word) > 1
+    if '，' in word:
+        word = word.replace('，', '')
     if not pinyin:
         pinyin = word_to_pinyin(word)
     double_pinyin = to_double_pinyin(pinyin).split()
     if len(word) == 2:
-        return ''.join(double_pinyin)
+        if not short:
+            return ''.join(double_pinyin)
+        else:
+            return double_pinyin[0][0] + double_pinyin[1][0]
     elif len(word) == 3:
-        if args.aabc:
+        if short:
+            return double_pinyin[0][0] + double_pinyin[1][0] + double_pinyin[2][0]
+        elif args.aabc:
             return double_pinyin[0] + double_pinyin[1][0] + double_pinyin[2][0]
         else:
             return double_pinyin[0][0] + double_pinyin[1][0] + double_pinyin[2]
@@ -252,12 +259,10 @@ def handle_gen_fixed():
 
     table = defaultdict(list)
     encoded = defaultdict(list)
-    def put_into_dict(word, code, max_len=4):
+
+    def put_into_dict_char(word, code, py, max_len=4):
         nonlocal table, encoded
-        # 詞總是使用四碼
-        if len(word) > 1:
-            table[code].append(word)
-            return
+        assert len(word) == 1
         # 這個字有多個編碼，但如果某個已有編碼是當前編碼的前綴，則不再添加該額外編碼
         for existing_code in encoded[word]:
             if code.startswith(existing_code):
@@ -272,31 +277,76 @@ def put_into_dict(word, code, max_len=4):
         # 一簡到三簡已經全部用完
         table[code].append(word)
 
+    def put_into_dict_word(word, code, pinyin, max_len=4):
+        nonlocal table
+        assert len(word) > 1
+        # Place a multi-character word into `table`; `max_len` is not read in this body.
+        # Without --short-word, the word is stored under the full `code` only.
+        if not args.short_word:
+            table[code].append(word)
+            return
+
+        # With --short-word, several candidate codes are tried in this order:
+        # - the first letter of the full code
+        # - the short code from encode_fixed_word(word, pinyin, True), if under 4 letters
+        # - for two-character words, the first three letters of the full code
+        # - fallback: the full code
+        short_codes = []   # invariant: all(len(c) < 4 for c in short_codes)
+        # 1. One-letter code
+        short_codes.append(code[0])
+        # 2. Short code matching the number of characters in the word
+        short_code = encode_fixed_word(word, pinyin, True)
+        if len(short_code) < 4:
+            short_codes.append(short_code)
+        # 3. First three letters of the full code
+        if len(word) == 2:
+            short_codes.append(code[:3])
+
+        # Use the first candidate with room; --tolerance 'a,b,c' caps entries for code lengths 1, 2, 3.
+        tolerance = dict(zip([1,2,3], (int(s) for s in args.tolerance.split(','))))
+        for c in short_codes:
+            if len(table[c]) < tolerance[len(c)]:  # NOTE(review): KeyError if --tolerance has fewer values than len(c)
+                table[c].append(word)
+                return
+
+        # No short slot had room, so the word goes under its full code.
+        table[code].append(word)
+
+    def put_into_dict(word, code, py, max_len=4):  # Dispatch on len(word); all arguments are forwarded unchanged.
+        if len(word) == 1:  # single character
+            put_into_dict_char(word, code, py, max_len)
+        else:  # anything else; put_into_dict_word itself asserts len(word) > 1
+            put_into_dict_word(word, code, py, max_len)        
+
     # 放入單字
     words = []
     for c in charset:
         for py in pinyin_table[c].keys():
             for ac in to_auxiliary_codes(c):
                 try:
                     w = pinyin_weight(c, py)
-                    words.append((w, c, to_double_pinyin(py)+ac))
+                    words.append((w, c, to_double_pinyin(py)+ac, py))
                 except:
                     traceback.print_exc()
 
     # 再放入詞語
     for (word, pinyin, weight) in read_input_dict():
         if len(word) > 1:
             try:
-                words.append((pinyin_weight(word, pinyin), word, encode_fixed_word(word, pinyin)))
-                # for code in encode_fixed_word_sunshine_strategy(word, pinyin):
-                #     words.append((pinyin_weight(word, pinyin), word, code))
+                code = encode_fixed_word(word, pinyin, False)
+                assert len(code) == 4
+                words.append((pinyin_weight(word, pinyin),
+                              word,
+                              code,
+                              pinyin))
             except:
                 traceback.print_exc()
+                pass
 
     # 降序將所有字詞放入碼表
     words.sort(key=itemgetter(0), reverse=True)
-    for (w, word, code) in words:
-        put_into_dict(word, code)
+    for (w, word, code, py) in words:
+        put_into_dict(word, code, py)
 
     # 輸出碼表
     print_table(table)
@@ -432,13 +482,16 @@ def handle_update_char_weight():
 gen_fixed.add_argument('--format', choices=['code-words', 'code-word', 'word-code', 'word-codes'], help='輸出碼表的格式', default='code-words')
 gen_fixed.add_argument('--tolerance', help='每級簡碼最多可以容納多少候選', default='1,1,1')
 gen_fixed.add_argument('--aabc', action='store_true', default=False, help='三碼字使用 AABC 方式編碼')
+gen_fixed.add_argument('--short-word', action='store_true', help='生成簡詞', default=False)
 
 update_compact_dict = subparsers.add_parser('update-compact-dict', help='更新 *compact* 詞庫中的輔助碼爲新輔助碼')
 update_compact_dict.add_argument('--rime-dict', help='輸入rime格式詞庫（無frontmatter）', required=True)
 
 update_char_weight = subparsers.add_parser('update-char-weight', help='更新 chars 詞庫中的詞頻')
 update_char_weight.add_argument('--rime-dict', help='輸入rime格式詞庫', required=True)
 
+args = parser.parse_args(['gen-fixed'])  # NOTE(review): runs at import time; presumably gives the global `args` read by functions such as encode_fixed_word a default before the __main__ block reassigns it - confirm
+
 if __name__ == '__main__':
     args = parser.parse_args()
     if args.command == 'gen-chars':