-
Notifications
You must be signed in to change notification settings - Fork 3
/
03cutdata.py
274 lines (218 loc) · 11.4 KB
/
03cutdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import numpy as np
from datetime import datetime
from tqdm import tqdm
import os
import time
import json
from tools import general
import numpy as np
def _scan_activity(datas, classtype):
    """Collect the lines and begin/end row pairs of one activity.

    A line is treated as a marker of ``classtype`` when ``classtype`` is one
    of its whitespace-separated tokens; it opens a segment when the line
    contains the substring ``'begin'`` and closes one when it contains
    ``'end'``.

    Args:
        datas: list of raw text lines of the source file.
        classtype: activity name to look for.

    Returns:
        A tuple ``(list_data, pair_index, length_list, unfinished)`` where
        ``list_data`` holds the collected lines, ``pair_index`` maps a running
        counter (as ``str``) to ``"begin,end,length"``, ``length_list`` holds
        the ``end - begin`` value of every recorded pair, and ``unfinished`` is
        True when the last segment was opened but never closed.
    """
    list_data = []
    pair_index = {}
    length_list = []
    flag = False  # True while a begin marker is waiting for its end marker
    begin_index = None
    end_index = None
    for i, line in enumerate(datas):
        if flag:
            list_data.append(line)
        if classtype not in line.split():
            continue
        if 'begin' in line:
            # Appended unconditionally: when a segment is still open the line
            # was already stored above, so it ends up in list_data twice.
            list_data.append(line)
            if flag:
                print('This indicates that the previous activity did not end normally...')
            flag = True
            begin_index = i
        if 'end' in line:
            if not flag:
                if begin_index is None:
                    # No begin marker has been seen yet, so there is no
                    # previous pair to borrow from; formatting/subtracting the
                    # missing index would raise TypeError.
                    print('End marker without any previous begin, skipped. '
                          'Data from:%s, row: %d \n\n' % (classtype, i))
                    continue
                print('Borrow previous start and end(%d, %d) ' % (begin_index, end_index))
                print('Data from:%s, row: %d \n\n' % (classtype, i))
            flag = False
            end_index = i
            length = end_index - begin_index
            pair_index[str(len(length_list))] = '%d,%d,%d' % (begin_index, end_index, length)
            length_list.append(length)
            if length < 4:
                print('What iss the problem? Why is it so short. The length is: %d' % (
                    length))
                print('Data from:%s, row: %d \n\n' % (classtype, i))
    return list_data, pair_index, length_list, flag


@general.clock
def cutdataclass(data_dir, data_name, save_file=False):
    """Locate every activity's begin/end rows in a dataset and optionally save them.

    Reads ``<data_dir>/repair-<data_name>`` and
    ``<data_dir>/../static/activities.json``; the activity names are the keys
    of ``activities.json[data_name]``.

    When ``save_file`` is truthy the function writes, relative to the
    ``static`` directory:
      * ``../temp/cutedata/<data_name>/<activity>``: the collected lines,
      * ``activities.json``: rewritten with ``{counter: "begin,end,length"}``
        per activity,
      * ``../dataindex/<data_name>-log``: a plain-text dump of the index,
      * ``../dataindex/<data_name>_activities_length.json``: median/std and
        the raw segment lengths.

    Args:
        data_dir: directory containing the ``repair-*`` source files.
        data_name: dataset name; also the key inside ``activities.json``.
        save_file: write the outputs listed above when truthy.

    NOTE(review): the function is wrapped by ``general.clock`` (defined in
    ``tools.general``, not visible here); its effect on the return value is
    not known from this file.
    """
    print('\n' + '*' * 20)
    op_data = os.path.join(data_dir, 'repair-' + data_name)
    print("data_name location: %s" % (op_data))
    with open(op_data, 'r', encoding="utf-8") as fr:
        datas = fr.readlines()

    # Load the activity list for this dataset.
    static_dir = os.path.join(data_dir, '..', 'static')
    data_static = os.path.join(static_dir, 'activities.json')
    with open(data_static, 'r', encoding="utf-8") as fr2:
        dictclass = json.load(fr2)

    class_data = {}
    dic_index = {}
    dict_length = {"static": [], "detail": []}
    for classtype in dictclass[data_name]:  # walk through every activity in turn
        list_data, pair_index, length_list, unfinished = _scan_activity(datas, classtype)
        class_data[classtype] = list_data
        dict_length["static"].append(
            {classtype: "{} +/- {}".format(np.median(length_list), np.std(length_list))})
        dict_length["detail"].append([length_list])
        dic_index[classtype] = pair_index
        if unfinished:  # a begin marker was never matched by an end marker
            print('You donot have a normal end, brother...')

    # Optionally persist the results.
    if save_file:
        file_dir = os.path.join(static_dir, '..', 'temp', 'cutedata')
        subdir = os.path.join(file_dir, data_name)
        general.create_folder(subdir)
        for activity_name, activity_lines in class_data.items():
            with open(os.path.join(subdir, activity_name), 'w', encoding="utf-8") as fw:
                fw.writelines(activity_lines)

        # Update the in-memory index first so the file is only opened for
        # writing once the content to dump is complete.
        dictclass[data_name].update(dic_index)
        with open(data_static, 'w', encoding="utf-8") as fw:
            json.dump(dictclass, fw)

        dataindex_dir = os.path.join(static_dir, '..', 'dataindex')
        general.create_folder(dataindex_dir)
        log_name = data_name + '-log'
        with open(os.path.join(dataindex_dir, log_name), 'w', encoding="utf-8") as fw:
            for activity_name, activity_index in dic_index.items():
                fw.write('\n%s:\n' % (activity_name))
                fw.write(str(activity_index))

        length_log_name = data_name + "_activities_length.json"
        with open(os.path.join(dataindex_dir, length_log_name), "w", encoding="utf-8") as fw:
            json.dump(dict_length, fw)
        print('The operation is completed. Now the file has been saved to: %s' % (subdir))
def _is_marker_line(line, activity_type, marker):
    """Return True when ``line`` has exactly six whitespace-separated fields
    whose fifth field is ``activity_type`` and whose sixth is ``marker``."""
    fields = line.split()
    # Check the field count first so that short lines are reported as a bad
    # marker instead of raising IndexError on fields[4] / fields[5].
    return len(fields) == 6 and fields[4] == activity_type and fields[5] == marker


@general.clock
def verifyindex(data_dir, data_name, save_file=False):
    """Check that every stored begin/end row really is a begin/end marker line.

    Reads ``<data_dir>/../static/activities.json`` and
    ``<data_dir>/repair-<data_name>``. For each activity except ``'other'``
    and each stored ``"begin,end,..."`` entry, the begin row must be a
    six-field line ending in ``<activity> begin`` and the end row a six-field
    line ending in ``<activity> end``; otherwise a warning is printed.
    Nothing is written to disk.

    Args:
        data_dir: directory containing the ``repair-*`` source files.
        data_name: dataset name; also the key inside ``activities.json``.
        save_file: not read by this function.
    """
    static_data_dir = os.path.join(data_dir, '..', 'static')
    data_static = os.path.join(static_data_dir, 'activities.json')
    with open(data_static, 'r', encoding="utf-8") as fr:
        dictclass = json.load(fr)
    with open(os.path.join(data_dir, 'repair-' + data_name), 'r', encoding="utf-8") as f:
        datasets = f.readlines()

    data_index = dictclass[data_name]
    for activity_type in data_index:
        if activity_type == 'other':
            continue
        print('dataset: %s _ %s ...' % (data_name, activity_type))
        for str_index in data_index[activity_type]:
            index_array = np.array(data_index[activity_type][str_index].split(','), dtype=int)
            try:
                # The begin row is checked against 'begin' and reported as a
                # begin problem (the two messages used to be swapped).
                if not _is_marker_line(datasets[index_array[0]], activity_type, 'begin'):
                    print('\nWarning at the begin of the sequence:%s: %s \n' % (str_index, index_array))
                if not _is_marker_line(datasets[index_array[1]], activity_type, 'end'):
                    print('\nWarning at the end of the sequence:%s: %s\n' % (str_index, index_array))
            except IndexError:
                # Raised when a stored row number lies outside the file or the
                # entry has fewer than two comma-separated values.
                print('There is a data problem. Please check the start and end index array values...')
                print(index_array)
def saveother_index(data_dir, data_name, save_file=False):
    """Store the row ranges not covered by any activity under the key ``'other'``.

    Reads ``<data_dir>/../static/activities.json`` and counts the lines of
    ``<data_dir>/repair-<data_name>``. Every row inside a stored
    ``"begin,end,..."`` range of any activity except ``'other'`` counts as
    covered. Each maximal run of uncovered rows is recorded as
    ``{counter: "begin,end"}`` (both inclusive, counter starting at ``"0"``),
    and ``activities.json`` is rewritten with that mapping stored at
    ``activities.json[data_name]['other']``.

    Runs of a single row (including one at the first or last row) are
    recorded as ``"i,i"``, and a fully covered file yields an empty mapping.

    Args:
        data_dir: directory containing the ``repair-*`` source files.
        data_name: dataset name; also the key inside ``activities.json``.
        save_file: not read by this function; the JSON file is always rewritten.
    """
    static_data_dir = os.path.join(data_dir, '..', 'static')
    data_static = os.path.join(static_data_dir, 'activities.json')
    with open(data_static, 'r', encoding="utf-8") as fr:
        dictclass = json.load(fr)
    with open(os.path.join(data_dir, 'repair-' + data_name), 'r', encoding="utf-8") as fr:
        line_count = sum(1 for _ in fr)

    # Rows that belong to some labelled activity.
    covered = set()
    data_index = dictclass[data_name]
    for activity_type in data_index:
        if activity_type == 'other':
            # Skip a previously generated 'other' entry so reruns start clean.
            continue
        print('dataset: %s _ %s ...' % (data_name, activity_type))
        for span in data_index[activity_type].values():
            fields = span.split(',')
            covered.update(range(int(fields[0]), int(fields[1]) + 1))

    # Single pass: open a gap on the first uncovered row, close it on the
    # next covered row (or at the end of the file).
    dict_other_index = {}
    gap_begin = None
    for i in range(line_count):
        if i not in covered:
            if gap_begin is None:
                gap_begin = i
        elif gap_begin is not None:
            dict_other_index[str(len(dict_other_index))] = '%d,%d' % (gap_begin, i - 1)
            gap_begin = None
    if gap_begin is not None:
        dict_other_index[str(len(dict_other_index))] = '%d,%d' % (gap_begin, line_count - 1)

    dictclass[data_name].update({'other': dict_other_index})
    with open(data_static, 'w', encoding="utf-8") as fw:
        json.dump(dictclass, fw)
@general.clock
def savecute_data(data_dir, data_name, save_file=False):
    """Write one file per activity containing its numbered source rows.

    Reads ``<data_dir>/../static/activities.json`` and
    ``<data_dir>/repair-<data_name>``. For every activity stored under
    ``activities.json[data_name]`` the rows of each ``"begin,end,..."`` range
    (both inclusive) are copied, each prefixed with its zero-padded 7-digit
    row number and a tab, and followed by an empty line per range. The result
    is written to ``<data_dir>/../cutdata/<data_name>/<activity>``.

    Args:
        data_dir: directory containing the ``repair-*`` source files.
        data_name: dataset name; also the key inside ``activities.json``.
        save_file: not read by this function; files are always written.
    """
    index_path = os.path.join(data_dir, '..', 'static', 'activities.json')
    with open(index_path, 'r', encoding="utf-8") as index_file:
        dictclass = json.load(index_file)

    source_path = os.path.join(data_dir, 'repair-' + data_name)
    with open(source_path, 'r', encoding="utf-8") as source_file:
        datasets = source_file.readlines()

    activity_dir = os.path.join(data_dir, "..", 'cutdata', data_name)
    for activity_type, segments in dictclass[data_name].items():
        print('datasets: %s _ %s...' % (data_name, activity_type))
        lines = []
        for segment in segments.values():
            bounds = np.array(segment.split(','), dtype=int)
            lines.extend(
                str(row).zfill(7) + '\t' + datasets[row]
                for row in range(bounds[0], bounds[1] + 1)
            )
            # Empty line separates consecutive segments in the output file.
            lines.append('\n')

        general.create_folder(activity_dir)
        target = os.path.join(activity_dir, activity_type)
        with open(target, 'w', encoding="utf-8") as fw:
            fw.writelines(lines)
if __name__ == '__main__':
    # Pipeline overview:
    #   Step 1 logs the begin/end rows of every activity and cuts the data
    #          into the corresponding folders.
    #   Step 2 only verifies; no warnings means the stored index values are
    #          consistent with the source file.
    #   Step 3 derives the 'other' index from the activity index and stores it
    #          alongside the step-1 results.
    #   Step 4 segments the data into folders according to the stored index.
    opts = general.load_config()
    data_dir = os.path.join(opts["datasets"]["base_dir"], 'repairdata')
    # e.g. data_names = ['cairo', 'milan', 'kyoto7', 'kyoto8', 'kyoto11']
    data_names = opts["datasets"]["names"]

    # Step 1
    for data_name in data_names:
        cutdataclass(data_dir, data_name, save_file=True)
    print(
        'Now the first step is to: \n\tthe index value of the source data is obtained. '
        'For the next step, it needs to be stored in the corresponding file of config. '
        'PS: there is no other type of data yet')

    # Step 2
    for data_name in data_names:
        verifyindex(data_dir, data_name, save_file=True)
    print(
        '\n\n第二步:\n\tIf there is no problem in the program, it means there is no problem, '
        'and it is guaranteed that each line has only the beginning or end of an action')
    print('This second step is only for verification and has no substantive effect.')

    # Step 3
    for data_name in data_names:
        saveother_index(data_dir, data_name, save_file=True)
    print(
        '\n\nStep 3:\n\tOther index data is generated to the folder and manually copied to '
        'config / index + data_ In the name file, add the index of other')

    # Step 4
    for data_name in data_names:
        savecute_data(data_dir, data_name, save_file=True)
    print(
        'Now that the data has been segmented, we will move to the next step '
        'to segment the data according to the distance!')