-
Notifications
You must be signed in to change notification settings - Fork 16
/
utils.py
124 lines (103 loc) · 3.99 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os, json, shutil, logging
import tensorflow as tf
from conlleval import return_report
# Module-level path constants, all relative to the current working directory.
# None of them is referenced by the functions visible in this file; presumably
# other modules import them, so verify callers before renaming anything
# (including the `eval_cript` spelling).
model_path = "./models"
eval_path = "./evaluation"
# "temp" entry inside the evaluation directory.
eval_temp = os.path.join(eval_path, "temp")
# "conlleval" entry inside the evaluation directory.
eval_cript = os.path.join(eval_path, "conlleval")
def test_ner(results, path):
    """Write predictions to ``predict.utf8`` under *path* and evaluate them.

    Args:
        results: Iterable of blocks, each an iterable of already formatted
            text lines.
        path: Directory in which ``predict.utf8`` is (re)written.

    Returns:
        Whatever ``return_report`` returns for the written file.
    """
    prediction_path = os.path.join(path, "predict.utf8")
    with open(prediction_path, "w", encoding="utf-8") as handle:
        # Build the complete list first so nothing is written if a line is
        # not a string.
        handle.writelines([row + "\n" for chunk in results for row in chunk])
    return return_report(prediction_path)
def make_path(params):
    """Ensure the result, checkpoint and ``log`` directories exist.

    Args:
        params: Object exposing ``result_path`` and ``ckpt_path`` attributes.
            The ``log`` directory is created relative to the current working
            directory.

    Raises:
        FileExistsError: If one of the paths exists but is not a directory.
    """
    for directory in (params.result_path, params.ckpt_path, "log"):
        # exist_ok avoids the check-then-create race of isdir() + makedirs()
        # when several processes start at the same time.
        os.makedirs(directory, exist_ok=True)
def load_config(config_file):
    """Read *config_file* as UTF-8 text and return the parsed JSON value."""
    with open(config_file, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def save_config(config, config_file):
    """Serialize *config* to *config_file* as UTF-8 JSON.

    The output uses a four-space indent and keeps non-ASCII characters
    unescaped. An existing file is overwritten.
    """
    with open(config_file, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(config, ensure_ascii=False, indent=4))
def get_logger(log_file):
    """Return a logger that writes to *log_file* and to the console.

    The logger is named after *log_file*. The file handler records DEBUG and
    above; the console (stream) handler records INFO and above. Both use the
    same ``time - name - level - message`` format.

    Calling this function again with the same *log_file* returns the already
    configured logger instead of attaching a second pair of handlers.

    Args:
        log_file: Path of the log file; also used as the logger name.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(log_file)
    logger.setLevel(logging.DEBUG)
    if logger.handlers:
        # logging.getLogger() hands back the same object for the same name;
        # adding handlers again would emit every record twice and open
        # another handle on the log file.
        return logger

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    for handler in (console_handler, file_handler):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger
def save_model(sess, model, path, logger):
    """Save the model through ``model.saver`` and log that it was saved.

    Args:
        sess: Session object forwarded to ``model.saver.save``.
        model: Object exposing a ``saver`` with a ``save(sess, path)`` method.
        path: Directory that receives the ``ner.skpt`` checkpoint prefix.
        logger: Logger used for the confirmation message.
    """
    target = os.path.join(path, "ner.skpt")
    saver = model.saver
    saver.save(sess, target)
    logger.info("model saved")
def create_model(session, Moeld_class, path, load_vec, config, id_to_char, logger):
    """Build a model and either restore it from a checkpoint or initialize it.

    Args:
        session: TensorFlow session used to restore or initialize variables.
        Moeld_class: Callable invoked as ``Moeld_class(config)`` to build the
            model; the result must expose ``saver`` and ``char_lookup``.
        path: Directory searched for checkpoint state.
        load_vec: Callable invoked as ``load_vec(emb_file, id_to_char,
            char_dim, current_weights)``; its return value is assigned to
            ``model.char_lookup``.
        config: Mapping read for ``"pre_emb"``, ``"emb_file"`` and
            ``"char_dim"``.
        id_to_char: Passed through to ``load_vec`` unchanged.
        logger: Logger used for progress messages.

    Returns:
        The model instance created from ``Moeld_class``.
    """
    model = Moeld_class(config)
    ckpt = tf.train.get_checkpoint_state(path)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        # A checkpoint is recorded and present: restore all saved variables.
        logger.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        # No usable checkpoint: start from freshly initialized variables.
        logger.info("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
        # NOTE(review): the nesting of this step was reconstructed; it is
        # placed on the fresh-parameter path so that restored embeddings are
        # not overwritten -- confirm against the upstream file.
        if config["pre_emb"]:
            # Read the current embedding values, let load_vec produce the
            # replacement, and write the result back into the variable.
            emb_weights = session.run(model.char_lookup.read_value())
            emb_weights = load_vec(config["emb_file"], id_to_char, config["char_dim"], emb_weights)
            session.run(model.char_lookup.assign(emb_weights))
            logger.info("Load pre-trained embedding.")
    return model
def result_to_json(string, tags):
    """Convert a character sequence and its tags into an entity dictionary.

    Tags are interpreted by their first character: ``S`` is a single-character
    entity, ``B`` begins an entity, ``I`` continues it and ``E`` ends it; any
    other tag is outside an entity. The entity type is the tag text from
    index 2 onwards (e.g. ``"B-PER"`` -> ``"PER"``).

    Every emitted entity satisfies ``string[start:end] == word``, also for
    malformed tag sequences: a ``B`` or ``S`` discards an unfinished entity,
    and an ``I``/``E`` without a preceding ``B`` starts the entity at its own
    position.

    Args:
        string: The tagged text, one tag per character.
        tags: Sequence of tag strings; pairs beyond the shorter of *string*
            and *tags* are ignored.

    Returns:
        ``{"string": string, "entities": [...]}`` where each entity is a dict
        with ``word``, ``start``, ``end`` (exclusive) and ``type``.
    """
    item = {"string": string, "entities": []}
    entity_name = ""
    entity_start = 0
    for idx, (char, tag) in enumerate(zip(string, tags)):
        prefix = tag[:1]  # slice so an empty tag counts as "outside"
        if prefix == "S":
            item["entities"].append({"word": char, "start": idx, "end": idx + 1, "type": tag[2:]})
            # A single-character entity interrupts any unfinished one.
            entity_name = ""
        elif prefix == "B":
            # Restart rather than append, so a dangling earlier "B" cannot
            # leak its characters into this entity.
            entity_name = char
            entity_start = idx
        elif prefix in ("I", "E"):
            if not entity_name:
                # No open entity: anchor the span at this character.
                entity_start = idx
            entity_name += char
            if prefix == "E":
                item["entities"].append({"word": entity_name, "start": entity_start, "end": idx + 1, "type": tag[2:]})
                entity_name = ""
        else:
            entity_name = ""
    return item
def clean(params):
    """Delete generated files and directories from previous runs.

    Removes the vocabulary, map and config files, the checkpoint, summary and
    result directories, and the ``log`` and ``__pycache__`` directories found
    in the current working directory. Paths that do not exist (or are of the
    wrong kind) are skipped silently.

    Args:
        params: Object exposing ``vocab_file``, ``map_file``, ``config_file``,
            ``ckpt_path``, ``summary_path`` and ``result_path`` attributes.
    """
    # Each file is listed once; the original checked vocab_file twice.
    file_paths = (params.vocab_file, params.map_file, params.config_file)
    dir_paths = (
        params.ckpt_path,
        params.summary_path,
        params.result_path,
        "log",
        "__pycache__",
    )
    for file_path in file_paths:
        if os.path.isfile(file_path):
            os.remove(file_path)
    for dir_path in dir_paths:
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)