diff --git a/converter.py b/converter.py index 9b7eaa8..07dd171 100644 --- a/converter.py +++ b/converter.py @@ -248,32 +248,33 @@ class ChineseConverter(BaseConverter): 中文转换器类。 """ - def __init__(self, data: Ldata, rep: Ldata = rep_zh) -> None: + def __init__(self, data: Ldata, rep: Ldata = rep_zh, auto_cut: bool = True) -> None: """ 初始化转换器。 Args: data (Ldata): 输入的语言数据 rep (Ldata, optional): 替换的格式内容,默认为rep_zh + auto_cut (bool, optional): 是否使用自动分词,默认为True """ super().__init__(data, rep) self.data = data self.rep = rep + self.auto_cut = auto_cut - def segment_str(self, text: str, auto_cut: bool = True) -> List[str]: + def segment_str(self, text: str) -> List[str]: """ 根据设置分词或者直接拆分字符串。 Args: text (str): 需要分割的字符串 - auto_cut (bool, optional): 是否使用自动分词,默认为True Returns: List[str]: 分割后的字符串列表 """ - return jieba.lcut(text) if auto_cut else text.split() + return jieba.lcut(text) if self.auto_cut else text.split() def to_harmonic(self, text: str) -> str: """ diff --git a/fix_data.py b/fix_data.py index c5f601e..5847d79 100644 --- a/fix_data.py +++ b/fix_data.py @@ -4,57 +4,51 @@ from base import load_json from converter import ( save_to_json, - convert, - to_pinyin, - to_mps2, - to_tongyong, - to_yale, - to_wadegiles, - to_romatzyh, - to_cyrillic, - to_xiaojing, + ChineseConverter ) rep = {"!:(": "! :(", ",": ", ", "-!": "!"} fixed_zh_source = load_json("fixed_zh_source") +conv = ChineseConverter(fixed_zh_source, rep, False) + save_to_json( - convert(fixed_zh_source, to_pinyin, auto_cut=False, rep=rep), + conv.convert(conv.to_pinyin), "fixed_zh_py", "data", ) save_to_json( - convert(fixed_zh_source, to_mps2, auto_cut=False, rep=rep), + conv.convert(conv.to_mps2), "fixed_zh_mps2", "data", ) save_to_json( - convert(fixed_zh_source, to_tongyong, auto_cut=False, rep=rep), + conv.convert(conv.to_tongyong), "fixed_zh_ty", "data", ) save_to_json( - convert(fixed_zh_source, to_yale, auto_cut=False, rep=rep), + conv.convert(conv.to_yale), "fixed_zh_yale", "data", ) save_to_json( - convert(fixed_zh_source, to_wadegiles, auto_cut=False, rep=rep), + conv.convert(conv.to_wadegiles), "fixed_zh_wg", "data", ) save_to_json( - convert(fixed_zh_source, to_romatzyh, auto_cut=False, rep=rep), + conv.convert(conv.to_romatzyh), "fixed_zh_gr", "data", ) save_to_json( - convert(fixed_zh_source, to_cyrillic, auto_cut=False, rep=rep), + conv.convert(conv.to_cyrillic), "fixed_zh_cy", "data", ) save_to_json( - convert(fixed_zh_source, to_xiaojing, auto_cut=False, rep=rep), + conv.convert(conv.to_xiaojing), "fixed_zh_xj", "data", )