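"""Data_Preprocessing.py

Preprocessing utilities for an energy-disaggregation (NILM-style) dataset. The helpers
below appear to assume a UK-DALE-style layout: one space-separated channel_*.dat file per
appliance (timestamp and power reading), with channel_1 as the aggregate mains, sampled
every 6 seconds. Configuration constants used here (APPLIANCE_CONFIG, SAMPLE_WIDTH,
TRAIN_TEST_RATIO, PREPOCESSED_DATA_DIR) are expected to come from config.py via the
star import.
"""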
import glob
import pandas as pd
import time
import numpy as np
import math
from config import *
from sklearn.model_selection import train_test_split


def join_multiple_channels():
    """Join every channel_*.dat file in the dataset folder into one frame indexed by timestamp."""
    path = r'../dataset/'  # Path to dataset
    allFiles = glob.glob(path + "/*.dat")  # Data file names
    i = 0  # Counter for appliances array
    # Initialize the frame with timestamp (TS) as the index column
    frame1 = pd.DataFrame()
    # Loop to load the dataset into a single frame
    for file_ in allFiles:
        start = time.time()
        # file_[34:43] slices the channel name out of the file path; the indices depend
        # on the absolute path length on the original machine
        df = pd.read_csv(file_, delimiter=' ', names=['TS', file_[34:43]], header=None)
        print("Reading", i, "passed")
        frame1 = frame1.join(df.set_index('TS'), how='outer')
        print("Joining", i, "passed")
        end = time.time()
        print("Time used:", end - start)
        i += 1
    print(frame1)


# Unimplemented / exploratory helper
def load_appliance(appliance_name):
    """Load a single appliance channel into a frame and print a preview."""
    path = 'dataset/' + appliance_name + ".dat"  # Path to dataset
    start = time.time()
    # Each .dat file holds two space-separated columns: timestamp and power reading
    df = pd.read_csv(path, delimiter=' ', names=['TS', appliance_name], header=None)
    print("Reading passed")
    end = time.time()
    print("Time used:", end - start)
    print(df[1:10])


def combine_dataset(appliance_name, type='default'):
    """Align the aggregate mains reading (channel_1) with one appliance channel on a 6-second grid."""
    if appliance_name == 'kettle':
        appliance_name = 'channel_5'
    path = 'dataset/' + appliance_name + ".dat"  # Path to the appliance channel
    frame1 = pd.DataFrame()
    title = ['timestamp', 'appliance_power']
    df = pd.read_csv("dataset/channel_1.dat", sep=' ', header=None,
                     float_precision='round_trip', names=['timestamp', 'aggregate_power'])
    df2 = pd.read_csv(path, delimiter=' ', header=None, float_precision='round_trip', names=title)
    frame1 = frame1.join(df.set_index('timestamp'), how='outer')
    frame1 = frame1.join(df2.set_index('timestamp'), how='outer')
    # Rebuild the index on a fixed 6-second frequency
    start = int(frame1.index[0])
    end = int(frame1.index[-1])
    print("start", start)
    print("end", end)
    full_index = [np.int64(i) for i in np.arange(start, end + 6, 6)]
    frame1 = frame1.reindex(full_index)
    # Forward filling, then backward filling, to close the gaps created by reindexing
    frame1 = frame1.ffill()
    frame1 = frame1.bfill()
    frame1 = frame1.reset_index()
    columnsTitles = ["aggregate_power", "appliance_power", "timestamp"]
    frame1 = frame1.reindex(columns=columnsTitles)
    return frame1
    # Splitting the data into 128-sample pieces was left unused:
    # X_train, X_test, y_train, y_test = train_test_split(df_input, df_target,
    #     test_size=1 / (1 + TRAIN_TEST_RATIO), random_state=RANDOM_SEED)
    # return X_train, X_test, y_train, y_test
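

# Minimal usage sketch (hypothetical paths): assuming dataset/channel_1.dat and
# dataset/channel_5.dat exist in the two-column format described above, the aligned
# kettle frame could be built and previewed like this:
#
#     kettle_frame = combine_dataset('kettle')
#     print(kettle_frame.head())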


def save_activation(appliance_name):
    """Write the combined frame out as an activation table with explicit start/end times."""
    # load_data is not defined in this module; it is expected to come from the
    # `from config import *` import and to return the same column layout as combine_dataset()
    df = load_data(appliance_name)
    df['timestamp'] = df['timestamp'].astype(np.int64)
    df.insert(3, "end_time", df['timestamp'], True)
    df.insert(4, "index_house", 1, True)
    df.insert(5, "name_appliance", appliance_name, True)
    df.rename(columns={'timestamp': 'start_time'}, inplace=True)
    columnsTitles = ["start_time", "end_time", "aggregate_power", "appliance_power",
                     "index_house", "name_appliance"]
    df = df.reindex(columns=columnsTitles)
    start = time.time()
    # Each sample covers one 6-second interval, so end_time is start_time + 6
    df['end_time'] = df['end_time'] + 6
    end = time.time()
    print("Time used:", end - start)
    print(df)
    path2 = r'dataset/'  # Path to dataset
    df.to_csv(path2 + appliance_name + '_activation.csv', sep=",", index=False)


def generate_real_sample(appliance_name):
    """Reshape the flat per-sample table into fixed-length windows of 128 consecutive readings."""
    path = r'dataset/tobe_std_'
    df = pd.read_csv(path + appliance_name + '.csv', delimiter=',')
    seq_length = 128
    data_set = {}
    for i in range(seq_length):
        data_set['aggregate_power_' + str(i)] = []
        data_set['appliance_power_' + str(i)] = []
        data_set['timestamp_' + str(i)] = []
    data_set['house_name'] = []
    offset = 0
    total_set = int(len(df.index) / seq_length)
    for j in range(total_set):
        for i in range(seq_length):
            data_set['aggregate_power_' + str(i)].append(df['aggregate_power'][i + offset])
            data_set['appliance_power_' + str(i)].append(df['appliance_power'][i + offset])
            data_set['timestamp_' + str(i)].append(df['timestamp'][i + offset])
        # house_name is constant within a window, so record it once per window
        data_set['house_name'].append(df['house_name'][offset])
        offset = offset + seq_length
    df2 = pd.DataFrame(data_set)
    df2.to_csv('dataset/dataset_' + appliance_name + '.csv', sep=' ', index=False)


def standardize_dataset(appliance_name):
    """Standardize the aggregate windows and scale the appliance windows by their maximum power."""
    df = pd.read_csv(r'dataset/dataset_' + appliance_name + '.csv', sep=r"\s+")
    seq_length = math.ceil(APPLIANCE_CONFIG[appliance_name]['window_width'] / SAMPLE_WIDTH)
    # Standardisation
    print('standardize', appliance_name)
    # Estimate the std from a single, randomly sampled, mean-centred window
    sample = df.sample()
    aggregate_seq_sample = sample[['aggregate_power_' + str(i) for i in range(seq_length)]]
    aggregate_seq_sample = np.array(
        [aggregate_seq_sample['aggregate_power_' + str(i)].tolist()[0] for i in range(seq_length)])
    aggregate_seq_sample = aggregate_seq_sample - aggregate_seq_sample.mean()
    sample_std = np.std(aggregate_seq_sample)
    # Centre each aggregate column and divide by the sampled std
    for i in range(seq_length):
        print(round(100 * i / seq_length / 2, 1), '%')
        i = str(i)
        new_column = pd.Series((df['aggregate_power_' + i] - df['aggregate_power_' + i].mean()) / sample_std,
                               name='aggregate_power_' + i)
        df.update(new_column)
    # Scale each appliance column by the appliance's configured maximum power
    max_power = APPLIANCE_CONFIG[appliance_name]['max_power']
    for i in range(seq_length):
        print(round(50 + 100 * i / seq_length / 2, 1), '%')
        i = str(i)
        new_column = pd.Series(df['appliance_power_' + i] / max_power, name='appliance_power_' + i)
        df.update(new_column)
    print(df)
    df.to_csv(r'dataset/standardized_dataset_' + appliance_name + '.csv', sep=' ', index=False)


def load_test_train_data(appliance_name, type='default'):
    """Load the standardized windows and split them into train and test sets."""
    df = pd.read_csv(PREPOCESSED_DATA_DIR + '/standardized_dataset_' + appliance_name + '.csv', sep=r"\s+")
    seq_length = math.ceil(APPLIANCE_CONFIG[appliance_name]['window_width'] / SAMPLE_WIDTH)
    df_input = df[['aggregate_power_' + str(i) for i in range(seq_length)]]
    df_target = df[['appliance_power_' + str(i) for i in range(seq_length)]]
    print(df_target)
    X_train, X_test, y_train, y_test = train_test_split(
        df_input, df_target, test_size=1 / (1 + TRAIN_TEST_RATIO), random_state=42)
    return X_train, X_test, y_train, y_test
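

# A sketch of how these helpers appear to chain together; this block is an assumption,
# not part of the original module. generate_real_sample() expects a
# dataset/tobe_std_<name>.csv file with aggregate_power, appliance_power, timestamp and
# house_name columns, which is produced outside this file.
if __name__ == '__main__':
    appliance = 'kettle'             # hypothetical target appliance
    generate_real_sample(appliance)  # dataset/tobe_std_kettle.csv -> dataset/dataset_kettle.csv
    standardize_dataset(appliance)   # -> dataset/standardized_dataset_kettle.csv
    X_train, X_test, y_train, y_test = load_test_train_data(appliance)
    print(X_train.shape, X_test.shape)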