-
Notifications
You must be signed in to change notification settings - Fork 1
/
pre_build.py
119 lines (102 loc) · 4.24 KB
/
pre_build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import re
import os
import shutil
# Cyrillic letters carrying diacritics that have plain ("russified") counterparts.
rxDiacritics = re.compile('[ӧё]')
# Captures (1) the ' stem:' label and (2) the rest of the line holding the stem variants.
rxDiaPartsStem = re.compile('( stem:)( *[^\r\n]+)')
# Captures (1) the '-flex:' label and (2) the rest of the line holding the inflection variants.
rxDiaPartsFlex = re.compile('(-flex:)( *[^\r\n]+)')
# One stem variant: a maximal run of characters other than space, '|' and '/'.
rxStemVariants = re.compile('[^ |/]+')
# One inflection variant: a maximal run of characters other than space and '/'.
rxFlexVariants = re.compile('[^ /]+')
# Maps each diacritic letter to its diacriticless replacement.
dictDiacritics = {
    'ӧ': 'о',
    'ё': 'е'
}
def collect_lemmata(dirName):
    """
    Gather lexeme and lexical-rule descriptions from the text files
    in dirName: every kpv_lexemes_*.txt file contributes lexemes,
    every kpv_lexrules*.txt file contributes lexical rules.

    Return a tuple (lemmata, lexrules): lemmata is a deduplicated,
    sorted concatenation of all '-lexeme' entries; lexrules is the
    raw concatenation of all lexical-rule files.
    """
    lemmataParts = []
    lexrulesParts = []
    for fname in os.listdir(dirName):
        if not fname.endswith('.txt'):
            continue
        if fname.startswith('kpv_lexemes_'):
            target = lemmataParts
        elif fname.startswith('kpv_lexrules'):
            target = lexrulesParts
        else:
            continue
        # utf-8-sig strips a possible BOM left by Windows editors.
        # (Original opened without a context manager, leaking the handle
        # on error; join() replaces quadratic string concatenation.)
        with open(os.path.join(dirName, fname), 'r', encoding='utf-8-sig') as fIn:
            target.append(fIn.read() + '\n')
    lemmata = ''.join(lemmataParts)
    lexrules = ''.join(lexrulesParts)
    # Each entry is a '-lexeme' header line plus its indented continuation
    # lines. (The original passed re.DOTALL, which is a no-op here: the
    # pattern contains no '.'.)
    lemmataSet = set(re.findall('-lexeme\n(?: [^\r\n]*\n)+', lemmata))
    lemmata = '\n'.join(sorted(lemmataSet))
    return lemmata, lexrules
def add_diacriticless(morph):
    """
    Regex callback over a single stem/inflection variant: return the
    variant itself and, if it contains any tracked diacritic letter,
    append '//' plus a copy with those letters replaced by their plain
    counterparts.
    """
    variant = morph.group(0)
    if not rxDiacritics.search(variant):
        return variant
    plain = rxDiacritics.sub(lambda m: dictDiacritics[m.group(0)], variant)
    return variant + '//' + plain
def process_diacritics_stem(line):
    """
    Regex callback for a line that contains stems: keep the ' stem:'
    label and add diacriticless variants to every stem alternative.
    """
    label, stems = line.group(1), line.group(2)
    return label + rxStemVariants.sub(add_diacriticless, stems)
def process_diacritics_flex(line):
    """
    Regex callback for a line that contains inflections: keep the
    '-flex:' label and add diacriticless variants to every inflection
    alternative.
    """
    label, inflections = line.group(1), line.group(2)
    return label + rxFlexVariants.sub(add_diacriticless, inflections)
def russify(text):
    """
    Add diacriticless ("russified") variants for all stems and
    inflections found in a grammar text and return the result.
    """
    withStems = rxDiaPartsStem.sub(process_diacritics_stem, text)
    return rxDiaPartsFlex.sub(process_diacritics_flex, withStems)
def prepare_files():
    """
    Put all the lemmata to lexemes.txt. Put all the lexical
    rules to lexical_rules.txt. Create separate versions of
    relevant files for diacriticless texts.
    Put all grammar files to ../uniparser_komi_zyrian/data_strict/
    (original version) or ../uniparser_komi_zyrian/data_nodiacritics/
    (diacriticless version).
    """
    def write_file(path, text):
        # Write text to path; the context manager guarantees the handle
        # is closed even on error (the original leaked handles on
        # exception and repeated this boilerplate six times).
        with open(path, 'w', encoding='utf-8') as fOut:
            fOut.write(text)

    lemmata, lexrules = collect_lemmata('.')
    write_file('uniparser_komi_zyrian/data_strict/lexemes.txt', lemmata)
    write_file('uniparser_komi_zyrian/data_nodiacritics/lexemes.txt',
               russify(lemmata))
    # utf-8-sig tolerates a BOM at the start of the source file.
    with open('paradigms.txt', 'r', encoding='utf-8-sig') as fInParadigms:
        paradigms = fInParadigms.read()
    write_file('uniparser_komi_zyrian/data_strict/paradigms.txt', paradigms)
    write_file('uniparser_komi_zyrian/data_nodiacritics/paradigms.txt',
               russify(paradigms))
    # NOTE(review): lex_rules are written un-russified to BOTH directories,
    # unlike lexemes and paradigms — confirm this is intended.
    write_file('uniparser_komi_zyrian/data_strict/lex_rules.txt', lexrules)
    write_file('uniparser_komi_zyrian/data_nodiacritics/lex_rules.txt', lexrules)
    shutil.copy2('bad_analyses.txt', 'uniparser_komi_zyrian/data_strict/')
    shutil.copy2('bad_analyses.txt', 'uniparser_komi_zyrian/data_nodiacritics/')
    # shutil.copy2('komi_zyrian_disambiguation.cg3', 'uniparser_komi_zyrian/data_strict/')
    # shutil.copy2('komi_zyrian_disambiguation.cg3', 'uniparser_komi_zyrian/data_nodiacritics/')
def parse_wordlists():
    """
    Analyze wordlists/wordlist_main.csv with the strict analyzer,
    writing analyzed and unanalyzed items to separate files in the
    wordlists directory.
    """
    from uniparser_komi_zyrian import KomiZyrianAnalyzer
    analyzer = KomiZyrianAnalyzer(mode='strict')
    analyzer.analyze_wordlist(
        freqListFile='wordlists/wordlist_main.csv',
        parsedFile='wordlists/wordlist_analyzed.txt',
        unparsedFile='wordlists/wordlist_unanalyzed.txt',
        verbose=True,
    )
if __name__ == '__main__':
    # Script entry point: rebuild the analyzer data files, then run the
    # analyzer over the wordlist.
    prepare_files()
    parse_wordlists()