"""Build a character-level vocabulary from the competition data and export
the matching pretrained embeddings in word2vec text format."""
from collections import Counter
import itertools
import json
import re

import numpy as np
from gensim.models import KeyedVectors
from tqdm import tqdm
def clean_data(data):
    """Strip newlines, whitespace, digits, literal '\\n' sequences and
    list-marker symbols from a raw text string."""
    data = re.sub(r'\n', '', data)
    data = re.sub(r'\s+', '', data)
    data = re.sub(r'\d+', '', data)
    data = re.sub(r'\\n', '', data)
    data = re.sub('①|②|③|④|●|◆', '', data)
    return data
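# A quick sanity check of clean_data on a hypothetical input: the space,
# the digit and the circled list marker are all stripped.
# >>> clean_data('① 第1条\n内容')
# '第条内容'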
def read_data(path):
    """Load a JSON annotation file and return parallel lists of
    character-tokenized texts, questions and answers."""
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    texts = []
    questions = []
    answers = []
    for line in data:
        for annotation in line['annotations']:
            texts.append(list(clean_data(line['text'])))
            questions.append(list(annotation['Q']))
            answers.append(list(clean_data(annotation['A'])))
    return texts, questions, answers
def extract_vocab(iterable, min_word_freq=3):
    """Turn an iterable of lists of tokens into a vocabulary.

    These tokens could be single answers or word tokens in questions.
    Tokens occurring fewer than `min_word_freq` times are dropped.
    """
    all_tokens = itertools.chain.from_iterable(iterable)
    counter = Counter(all_tokens)
    # Reserve the first six ids for special tokens.
    word2idx = {
        '<pad>': 0,
        '<start>': 1,
        '<end>': 2,
        '<unk>': 3,
        '[CLS]': 4,
        '[SEP]': 5,
    }
    idx = len(word2idx)
    for word, freq in counter.items():
        if freq >= min_word_freq:
            word2idx[word] = idx
            idx += 1
    print('vocab size : {}'.format(len(word2idx)))
    return word2idx
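# A quick sanity check of extract_vocab (toy input, hypothetical): the six
# special tokens are always present, and only tokens meeting the frequency
# threshold are appended after them.
# >>> extract_vocab([['a', 'b', 'a'], ['a']], min_word_freq=2)
# vocab size : 7
# {'<pad>': 0, '<start>': 1, '<end>': 2, '<unk>': 3, '[CLS]': 4, '[SEP]': 5, 'a': 6}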
print('read_data')
all_texts, all_questions, all_answers = read_data('../data/round1_train_0907.json')
test_1_texts, _, test_1_answers = read_data('../data/round1_test_0907.json')
test_2_texts, _, test_2_answers = read_data('../data/juesai_1011.json')

print('extract_vocab')
words = (all_texts + all_questions + all_answers
         + test_1_texts + test_1_answers + test_2_texts + test_2_answers)
vocab = extract_vocab(words, min_word_freq=5)

# Load the pretrained Tencent AILab embeddings (plain word2vec text format).
w2v_model = KeyedVectors.load_word2vec_format(
    '../user_data/word2vec/Tencent_AILab_ChineseEmbedding.txt', binary=False)
# `.vocab` exists on gensim < 4.0; on gensim >= 4.0 use
# `len(w2v_model.key_to_index)` instead.
print(len(w2v_model.vocab))
word2vec_dict = {}
embedding_max_value = 0
embedding_min_value = 1
count = 0
for word in tqdm(vocab.keys()):
    if word in w2v_model:
        word2vec_dict[word] = w2v_model[word]
        # Track the value range of the matched embeddings so that special
        # tokens could be initialized uniformly within it (see below).
        embedding_matrix = np.array(w2v_model[word])
        embedding_max_value = max(np.max(embedding_matrix), embedding_max_value)
        embedding_min_value = min(np.min(embedding_matrix), embedding_min_value)
        count += 1
    else:
        # Out-of-vocabulary tokens (including the special tokens) get a zero
        # vector; random initialization is kept commented out for reference.
        # word2vec_dict[word] = np.random.normal(0., 1., size=300)
        word2vec_dict[word] = np.zeros(300)
# Alternative: initialize special tokens uniformly within the observed range.
# word2vec_dict['<unk>'] = np.random.uniform(low=embedding_min_value, high=embedding_max_value, size=300)
# word2vec_dict['<end>'] = np.random.uniform(low=embedding_min_value, high=embedding_max_value, size=300)
# word2vec_dict['<start>'] = np.random.uniform(low=embedding_min_value, high=embedding_max_value, size=300)
# word2vec_dict['<pad>'] = np.random.uniform(low=embedding_min_value, high=embedding_max_value, size=300)
print(count)               # vocab entries covered by the pretrained model
print(len(word2vec_dict))  # total vocab size, covered or not
# Write the vectors out in word2vec text format so they can be reloaded
# with KeyedVectors.load_word2vec_format.
with open('../user_data/word2vec/char_embedding.bin', 'w', encoding='utf-8') as f:
    # Header is "<count> <dim>"; the one entry skipped below (the space
    # character, which would break the format) is excluded from the count.
    f.write(str(len(word2vec_dict) - 1) + ' 300\n')
    words = list(word2vec_dict.keys())
    for w in tqdm(words):
        if w != ' ':
            v = list(map(str, word2vec_dict[w]))
            f.write(w + ' ' + ' '.join(v) + '\n')
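# Optional round-trip check, a minimal sketch: since the file above is plain
# word2vec text format, gensim should be able to read it straight back.
# Left commented out to avoid reloading the file on every run.
# reloaded = KeyedVectors.load_word2vec_format(
#     '../user_data/word2vec/char_embedding.bin', binary=False)
# assert reloaded.vector_size == 300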