-
Notifications
You must be signed in to change notification settings - Fork 3
/
cadet.py
178 lines (141 loc) · 5.91 KB
/
cadet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import spacy
from pathlib import Path
def create_model(language, model_path):
    """Build a blank spaCy pipeline for ``language`` and save it to disk.

    A fresh instance of the language class is created, the default set of
    components is added in a fixed order, and the result is serialized to
    ``model_path``. The saved pipeline is then loaded back once as a sanity
    check.

    Args:
        language: Language code handed to ``spacy.util.get_lang_class``.
        model_path: Destination passed to ``nlp.to_disk`` and ``spacy.load``.

    Raises:
        Any exception raised by spaCy while creating components, writing the
        pipeline, or loading it back.
    """
    component_names = ("tagger", "parser", "ner", "sentencizer", "entity_linker")

    lang_cls = spacy.util.get_lang_class(language)  # e.g. the English class
    nlp = lang_cls()
    for component_name in component_names:
        nlp.add_pipe(nlp.create_pipe(component_name))

    nlp.to_disk(model_path)

    # Round-trip check. This used to live inside an ``assert``, which is
    # stripped under ``python -O`` and would silently skip the verification;
    # calling it unconditionally keeps the check in every run mode.
    spacy.load(model_path)
def create_stop_words(path):
    """Write an empty ``stop_words.py`` skeleton into the directory ``path``.

    The directory is created first (including missing parents). The file
    defines ``STOP_WORDS`` as a set built from an empty multi-line string,
    ready to be filled in by hand.

    Args:
        path: ``pathlib.Path`` of the language directory.
    """
    skeleton = '''
# coding: utf8
from __future__ import unicode_literals
STOP_WORDS = set(
"""
""".split()
)
'''
    path.mkdir(parents=True, exist_ok=True)
    target = path / 'stop_words.py'
    target.write_text(skeleton, encoding="utf-8")
# Source text of a blank ``examples.py`` module: it only declares an empty
# ``sentences`` list. create_examples() writes this string out verbatim.
blank_examples="""
# coding: utf8
from __future__ import unicode_literals
sentences = []
"""
def create_examples(path):
    """Write the blank ``examples.py`` template into the directory ``path``.

    The directory (and any missing parents) is created first, so this
    function can be called on its own in the same way as
    ``create_stop_words``; previously a missing directory raised
    ``FileNotFoundError``.

    Args:
        path: ``pathlib.Path`` of the language directory.
    """
    path.mkdir(parents=True, exist_ok=True)
    (path / 'examples.py').write_text(blank_examples, encoding="utf-8")
def add_entrypoint(lang_id, path):
    """Write a ``setup.py`` that registers ``lang_id`` as a spaCy language.

    The generated file declares a ``spacy_languages`` entry point of the form
    ``"<lang_id> = <lang_id>:<Lang_id capitalized>"``.

    The target directory (and any missing parents) is created first, in the
    same way ``create_stop_words`` does, so the function also works when it is
    the first scaffolding step to run; previously a missing directory raised
    ``FileNotFoundError``.

    Args:
        lang_id: Language identifier; used as the package name, the module
            name and (capitalized) the class name in the entry point.
        path: ``pathlib.Path`` of the directory that receives ``setup.py``.
    """
    path.mkdir(parents=True, exist_ok=True)
    setup_source = f'''from setuptools import setup
setup(
name="{lang_id}",
entry_points={{
"spacy_languages": ["{lang_id} = {lang_id}:{lang_id.capitalize()}"],
}}
)
'''
    (path / 'setup.py').write_text(setup_source, encoding="utf-8")
def create_spacy_language(language:str):
    """Scaffold a new language package under ``lang/<language>``.

    Steps:
      1. Create ``lang/<language>`` (the ``lang`` directory itself must
         already exist).
      2. Symlink ``<spacy package>/lang/<language>`` to that directory so the
         package is importable as ``spacy.lang.<language>``.
      3. Write an ``__init__.py`` defining ``<Language>Defaults`` and
         ``<Language>`` classes, where ``<Language>`` is
         ``language.capitalize()``. The generated module imports
         ``STOP_WORDS`` from ``spacy.lang.<language>.stop_words``, so a
         ``stop_words.py`` is expected alongside it.

    Args:
        language: Language identifier used for the directory name, the
            ``lang`` attribute and (capitalized) the class names.

    Raises:
        FileNotFoundError: If the ``lang`` parent directory does not exist.
        FileExistsError: If ``spacy/lang/<language>`` already exists and is
            not a symlink to this package directory.
    """
    path = Path('lang/' + language)
    path.mkdir(parents=False, exist_ok=True)

    # A relative symlink target is interpreted relative to the directory that
    # contains the link (inside the spaCy installation), not the current
    # working directory, so the link must point at the absolute location.
    target = path.resolve()
    spacy_lang = Path(spacy.__file__).parent / 'lang' / language
    already_linked = spacy_lang.is_symlink() and spacy_lang.resolve() == target
    if not already_linked:
        # Reruns are tolerated by mkdir above; tolerate an identical link too.
        spacy_lang.symlink_to(target, target_is_directory=True)

    class_name = language.capitalize()
    # Class bodies in the generated module must be indented to be valid Python.
    init_source = f"""
from spacy.lang.{language}.stop_words import STOP_WORDS
# These files are part of spaCy and do not need to be edited
from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS
from spacy.lang.norm_exceptions import BASE_NORMS
from spacy.language import Language
from spacy.attrs import LANG, NORM
from spacy.util import update_exc, add_lookups
# https://spacy.io/usage/adding-languages#language-subclass
class {class_name}Defaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "{language}"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS,
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS,)
    stop_words = STOP_WORDS
class {class_name}(Language):
    lang = "{language}"
    Defaults = {class_name}Defaults
__all__ = ["{class_name}"]
"""
    (path / '__init__.py').write_text(init_source, encoding="utf-8")
def _absolutize_imports(text, language):
    """Rewrite relative ``from .`` imports in ``text`` as absolute spaCy imports.

    The replacements run from the longest prefix to the shortest so that
    ``from ...`` is not consumed by the ``from ..`` or ``from .`` rules.
    """
    text = text.replace('from ...', 'from spacy.')
    text = text.replace('from ..', 'from spacy.lang.')
    return text.replace('from .', 'from spacy.lang.' + language + '.')


def clone_spacy_language(language, clone, model=None):
    """Create a custom spaCy language by copying an existing one.

    Steps, in order:
      1. Slugify ``language`` and create ``<custom dir>/lang/<language>``.
      2. Symlink ``<spacy package>/lang/<language>`` to that directory.
      3. Copy the source language's files (flattened by file name) into it.
      4. Rename the language class and code in ``__init__.py`` and turn all
         relative imports in the copied files into absolute ones.
      5. Copy the source language's ``spacy_lookups_data`` files, renamed to
         the new language prefix, into ``<custom dir>/lookups-data``.
      6. Create ``<custom dir>/models/<language>`` and either clone ``model``
         into it or build a blank model.

    Args:
        language: Display name of the new language; slugified before use.
        clone: Object describing the language to copy. Its ``is_core``,
            ``iso`` and ``language`` attributes are read here.
        model: Optional model name; when given it is loaded (and downloaded
            if loading fails with ``OSError``) and passed to ``clone_model``.
            When falsy, ``create_model`` builds a blank model instead.

    Raises:
        FileExistsError: If the language or model directory already exists.
        FileNotFoundError: If the source language directory or the
            lookups-data directory cannot be found.

    NOTE(review): ``slugify``, ``settings``, ``download`` and ``clone_model``
    are not defined or imported in the visible part of this file; they must
    be provided elsewhere -- confirm before relying on this function.
    """
    # Local import: the file-level imports shown do not provide ``copyfile``.
    import shutil

    language = slugify(language).replace('-','_')
    new_path = Path(settings.CUSTOM_LANGUAGES_DIRECTORY + '/lang/' + language)
    new_path.mkdir(parents=True, exist_ok=False)

    # Create symlink between spacy/lang and the new custom languages directory.
    # The target is made absolute because a relative symlink target would be
    # interpreted relative to the link's own directory.
    spacy_lang_root = Path(spacy.__file__.replace('__init__.py','')) / 'lang'
    (spacy_lang_root / language).symlink_to(
        new_path.resolve(), target_is_directory=True
    )

    if clone.is_core:
        core_path = spacy_lang_root / clone.iso
    else:
        core_path = Path(settings.CUSTOM_LANGUAGES_DIRECTORY + '/lang/' + clone.language)
    # Explicit check instead of ``assert`` so it also runs under ``python -O``.
    if not core_path.exists():
        raise FileNotFoundError(f"language to clone not found: {core_path}")

    # Copy files from core to custom.
    # NOTE(review): files from sub-directories are flattened into new_path by
    # name, so equally named files would overwrite each other -- confirm this
    # is intended.
    core_files = [
        x for x in core_path.glob('**/*')
        if x.is_file() and 'pyc' not in str(x)
    ]
    for src in core_files:
        shutil.copyfile(src, new_path / src.name)

    # Edit imports and variable names. The language sources are read and
    # written as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    language_name = spacy.util.get_lang_class(clone.iso).__name__
    init = new_path / '__init__.py'
    init_text = init.read_text(encoding="utf-8")
    # Quotes are added around the iso code to avoid false matches.
    init_text = init_text.replace(language_name, language.capitalize())
    init_text = init_text.replace('"'+clone.iso+'"', '"'+language+'"')
    init.write_text(_absolutize_imports(init_text, language), encoding="utf-8")

    new_files = [
        x for x in new_path.glob('**/*')
        if x.is_file() and x.name != '__init__.py'
    ]
    for file in new_files:
        file_text = file.read_text(encoding="utf-8")
        file.write_text(_absolutize_imports(file_text, language), encoding="utf-8")

    # spaCy lookups ~ installed with `pip install spacy[lookups]`
    import spacy_lookups_data
    spacy_lookups = Path(spacy_lookups_data.__file__.replace('__init__.py','')) / 'data'
    if not spacy_lookups.exists():
        raise FileNotFoundError(f"lookups data not found: {spacy_lookups}")

    # Use the iso code to identify lookups-data files. Only the file name is
    # tested, and only its leading "<iso>_" prefix is swapped: matching on the
    # whole path would pick up unrelated files whenever a parent directory
    # happens to contain "<iso>_", and replacing every occurrence would mangle
    # names that contain the iso code elsewhere.
    new_lookups = Path(settings.CUSTOM_LANGUAGES_DIRECTORY + '/lookups-data/')
    new_lookups.mkdir(parents=True, exist_ok=True)
    prefix = clone.iso + '_'
    lookups_files = [
        x for x in spacy_lookups.glob('**/*')
        if x.is_file() and x.name.startswith(prefix)
    ]
    for src in lookups_files:
        new_name = language + src.name[len(clone.iso):]
        shutil.copyfile(src, new_lookups / new_name)

    # Create or clone a spaCy model using the language object.
    model_path = Path(settings.CUSTOM_LANGUAGES_DIRECTORY + '/models/' + language)
    model_path.mkdir(parents=True, exist_ok=False)
    if model:
        try:
            spacy.load(model)
        except OSError:
            # spaCy reports a model that cannot be found with OSError; fetch
            # it. Other errors are no longer swallowed by a bare ``except``.
            download(model)
        clone_model(language, model_path, model)
    else:
        create_model(language, model_path)
if __name__ == "__main__":
    # Manual run for the sample language id "sr1": only the setup.py entry
    # point is generated at the moment.
    model_path = 'models/sr1'
    lang_path = Path('lang/sr1')
    add_entrypoint(lang_id='sr1', path=lang_path)

    # Remaining scaffolding steps, currently disabled:
    # model_path = Path(settings.CUSTOM_LANGUAGES_DIRECTORY + '/models/' + language)
    # model_path.mkdir(parents=True, exist_ok=False)
    # create_stop_words(lang_path)
    # create_examples(lang_path)
    # create_spacy_language('sr1')
    # create_model(language, model_path)