data.py
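"""
Data preprocessing utilities for joint intent detection and slot filling on
ATIS-style IOB data: tokenized sentences are indexed, padded to a fixed
length, and paired with their slot-tag and intent labels as LongTensors.
"""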
import torch
from torch.autograd import Variable
from collections import Counter
import pickle
import random
import os

USE_CUDA = torch.cuda.is_available()


def prepare_sequence(seq, to_ix):
    """Map a list of tokens to a LongTensor of indices, falling back to <UNK>."""
    idxs = list(map(lambda w: to_ix[w] if w in to_ix else to_ix["<UNK>"], seq))
    tensor = Variable(torch.LongTensor(idxs))
    return tensor.cuda() if USE_CUDA else tensor


# Flatten a list of lists into a single list.
flatten = lambda l: [item for sublist in l for item in sublist]
def preprocessing(file_path, length):
    """
    Read an ATIS-style IOB file (e.g. atis-2.train.w-intent.iob), build the
    word/slot-tag/intent vocabularies, pad or truncate every sequence to
    `length`, and cache the result as data/processed_train_data.pkl.
    """
    processed_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/")
    print("processed_data_path : %s" % processed_path)

    # Reuse the cached result if preprocessing has already been run.
    if os.path.exists(os.path.join(processed_path, "processed_train_data.pkl")):
        with open(os.path.join(processed_path, "processed_train_data.pkl"), "rb") as f:
            train_data, word2index, tag2index, intent2index = pickle.load(f)
        return train_data, word2index, tag2index, intent2index

    if not os.path.exists(processed_path):
        os.makedirs(processed_path)

    try:
        with open(file_path, "r") as f:
            train = f.readlines()
        print("Successfully loaded data. # of set : %d " % len(train))
    except IOError:
        print("No such file!")
        return None, None, None, None

    try:
        # Each line is "BOS <words> EOS\t<slot tags> <intent>"; split it into
        # the word sequence (without BOS/EOS), the slot-tag sequence, and the intent.
        train = [t[:-1] for t in train]
        train = [[t.split("\t")[0].split(" "),
                  t.split("\t")[1].split(" ")[:-1],
                  t.split("\t")[1].split(" ")[-1]] for t in train]
        train = [[t[0][1:-1], t[1][1:], t[2]] for t in train]

        seq_in, seq_out, intent = list(zip(*train))
        vocab = set(flatten(seq_in))
        slot_tag = set(flatten(seq_out))
        intent_tag = set(intent)
        print("# of vocab : {vocab}, # of slot_tag : {slot_tag}, # of intent_tag : {intent_tag}".format(
            vocab=len(vocab), slot_tag=len(slot_tag), intent_tag=len(intent_tag)))
    except Exception:
        print("Please check the data format! It should be 'raw sentence \t BIO tag sequence intent'. The following is a sample.")
        print("BOS i want to fly from baltimore to dallas round trip EOS\tO O O O O O B-fromloc.city_name O B-toloc.city_name B-round_trip I-round_trip atis_flight")
        return None, None, None, None
    # Pad every word/slot sequence to `length`, marking the end with <EOS>
    # and truncating sequences that are too long.
    sin = []
    sout = []
    for i in range(len(seq_in)):
        temp = seq_in[i]
        if len(temp) < length:
            temp.append('<EOS>')
            while len(temp) < length:
                temp.append('<PAD>')
        else:
            temp = temp[:length]
            temp[-1] = '<EOS>'
        sin.append(temp)

        temp = seq_out[i]
        if len(temp) < length:
            while len(temp) < length:
                temp.append('<PAD>')
        else:
            temp = temp[:length]
            temp[-1] = '<EOS>'
        sout.append(temp)

    # Build index dictionaries for words, slot tags, and intents.
    word2index = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
    for token in vocab:
        if token not in word2index:
            word2index[token] = len(word2index)

    tag2index = {'<PAD>': 0}
    for tag in slot_tag:
        if tag not in tag2index:
            tag2index[tag] = len(tag2index)

    intent2index = {}
    for ii in intent_tag:
        if ii not in intent2index:
            intent2index[ii] = len(intent2index)

    # Convert every example to (word indices, slot-tag indices, intent index) tensors.
    train = list(zip(sin, sout, intent))
    train_data = []
    for tr in train:
        temp = prepare_sequence(tr[0], word2index).view(1, -1)
        temp2 = prepare_sequence(tr[1], tag2index).view(1, -1)
        temp3 = Variable(torch.LongTensor([intent2index[tr[2]]]))
        if USE_CUDA:
            temp3 = temp3.cuda()
        train_data.append((temp, temp2, temp3))

    # Cache the processed data so later runs can skip preprocessing.
    with open(os.path.join(processed_path, "processed_train_data.pkl"), "wb") as f:
        pickle.dump((train_data, word2index, tag2index, intent2index), f)

    print("Preprocessing complete!")
    return train_data, word2index, tag2index, intent2index
def getBatch(batch_size, train_data):
    """
    Shuffle the training data and yield mini-batches of `batch_size` examples.
    Note: examples left over after the last yielded batch are dropped.
    """
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
def load_dictionary(dic_path):
    """
    Load word2index, tag2index and intent2index from the cached
    processed_train_data.pkl (note: dic_path is currently unused).
    """
    processed_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/")
    if os.path.exists(os.path.join(processed_path, "processed_train_data.pkl")):
        with open(os.path.join(processed_path, "processed_train_data.pkl"), "rb") as f:
            _, word2index, tag2index, intent2index = pickle.load(f)
        return word2index, tag2index, intent2index
    else:
        print("Please preprocess the data first")
        return None, None, None
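

# Minimal usage sketch of how these helpers fit together. The file path,
# maximum length (60), and batch size (64) below are illustrative assumptions,
# not values fixed by this module.
if __name__ == "__main__":
    train_data, word2index, tag2index, intent2index = preprocessing(
        "data/atis-2.train.w-intent.iob", 60)
    if train_data is not None:
        for batch in getBatch(batch_size=64, train_data=train_data):
            words, slots, intents = list(zip(*batch))
            # Each example is a (1 x length) word tensor, a (1 x length) slot-tag
            # tensor, and a single intent index.
            print(len(batch), words[0].size(), slots[0].size(), intents[0].size())
            break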