Commit

latest baseline

goruck committed Jul 10, 2023
1 parent d99dffb commit 7ea7e22
Showing 2 changed files with 93 additions and 52 deletions.
38 changes: 27 additions & 11 deletions ml/dataset_management/refit/dataset_plot.py
@@ -1,17 +1,32 @@
from nilm.Arguments import *
#from nilm.Arguments import *
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import argparse

appliance_name = 'fridge'
appliance_name = 'kettle'

#dataset = 'training'
dataset = 'test'
#dataset = 'validation'
#dataset = 'train'

for filename in os.listdir(args.datadir + appliance_name):
DATA_DIRECTORY = '/home/lindo/Develop/nilm/ml/dataset_management/refit/'

def get_arguments():
parser = argparse.ArgumentParser(description='sequence to point learning \
example for NILM')
parser.add_argument('--data_dir', type=str, default=DATA_DIRECTORY,
help='The directory containing the CLEAN REFIT data')
parser.add_argument('--appliance_name', type=str, default='kettle',
help='which appliance you want to train: kettle,\
microwave,fridge,dishwasher,washingmachine')
return parser.parse_args()

args = get_arguments()

for filename in os.listdir(args.data_dir + appliance_name):
if dataset == 'train' and dataset.upper() in filename.upper() and 'TEST' in filename.upper():
test_filename = filename
elif dataset == 'training' and dataset.upper() in filename.upper():
@@ -23,9 +38,9 @@

chunksize = 10 ** 6

for idx, chunk in enumerate(pd.read_csv(args.datadir + appliance_name + '/' + test_filename,
for idx, chunk in enumerate(pd.read_csv(args.data_dir + appliance_name + '/' + 'kettle_training_.csv',
# index_col=False,
names=['aggregate', appliance_name],
names=['aggregate', appliance_name, 'status'],
# usecols=[1, 2],
# iterator=True,
#skiprows=15 * 10 ** 6,
@@ -34,25 +49,26 @@
)):

# de-normalization
chunk['aggregate'] = chunk['aggregate'] * 822 + 522
chunk[appliance] = chunk[appliance] * params_appliance[args.appliance_name]['std'] \
+ params_appliance[args.appliance_name]['mean']
#chunk['aggregate'] = chunk['aggregate'] * 822 + 522
#chunk[appliance] = chunk[appliance] * params_appliance[args.appliance_name]['std'] \
#+ params_appliance[args.appliance_name]['mean']


fig = plt.figure(num='Figure {:}'.format(idx))
ax1 = fig.add_subplot(111)

ax1.plot(chunk['aggregate'])
ax1.plot(chunk[appliance_name])
ax1.plot(chunk['status'])

ax1.grid()
ax1.set_title('{:}'.format(test_filename), fontsize=14, fontweight='bold')
ax1.set_ylabel('Power normalized')
ax1.set_xlabel('samples')
ax1.legend(['aggregate', appliance_name])
ax1.legend(['aggregate', appliance_name, 'status'])

mng = plt.get_current_fig_manager()
mng.resize(*mng.window.maxsize())
plt.show(fig)
plt.show()

del chunk
del chunk
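
Note: the de-normalization lines above are commented out because the scaled datasets are now plotted directly. If plots in watts are wanted again, a helper along these lines could be used. This is a minimal sketch; the default statistics and the helper name denormalize_chunk are illustrative assumptions, not values taken from params_appliance.

import pandas as pd

def denormalize_chunk(chunk: pd.DataFrame, appliance_name: str,
                      agg_mean: float = 522.0, agg_std: float = 822.0,
                      app_mean: float = 700.0, app_std: float = 1000.0) -> pd.DataFrame:
    """Convert standardized readings back to watts for plotting.

    The aggregate defaults mirror the commented-out constants above
    (mean 522, std 822); the appliance defaults are placeholders for the
    per-appliance statistics the repository keeps in params_appliance.
    """
    chunk = chunk.copy()
    chunk['aggregate'] = chunk['aggregate'] * agg_std + agg_mean
    chunk[appliance_name] = chunk[appliance_name] * app_std + app_mean
    return chunk
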
107 changes: 66 additions & 41 deletions ml/dataset_management/refit/normalize_dataset.py
@@ -1,4 +1,4 @@
"""Scale datasets created by create_new_dataset.py.
"""Scale datasets created by create_new_dataset.py and add on-off status.
Copyright (c) 2023 Lindo St. Angel
"""
@@ -59,68 +59,93 @@ def get_zscore(value, values):

args = parser.parse_args()

print(f'Target appliance: {args.appliance}')
appliance = args.appliance

path = os.path.join(args.datadir, args.appliance)
print(f'Target appliance: {appliance}')

path = os.path.join(args.datadir, appliance)

# Get statistics from training dataset.
train_file_name = os.path.join(path, f'{args.appliance}_training_.csv')
train_file_name = os.path.join(path, f'{appliance}_training_.csv')
try:
df = load(train_file_name)
aggregate_power = df.loc[:, 'aggregate']
appliance_power = df.loc[:, appliance]

# Remove outliers.
#df = df[df < 10 * df.iloc[:,0].std()]

train_agg_mean = df.iloc[:,0].mean()
train_agg_std = df.iloc[:,0].std()
train_agg_mean = aggregate_power.mean()
train_agg_std = aggregate_power.std()
print(f'Training aggregate mean = {train_agg_mean}, std = {train_agg_std}')

train_app_mean = df.iloc[:,1].mean()
train_app_std = df.iloc[:,1].std()
train_app_mean = appliance_power.mean()
train_app_std = appliance_power.std()
print(f'Training appliance mean = {train_app_mean}, std = {train_app_std}')

train_app_min = df.iloc[:,1].min()
train_app_max = df.iloc[:,1].max()
train_app_min = appliance_power.min()
train_app_max = appliance_power.max()
print(f'Training appliance min = {train_app_min}, max = {train_app_max}')

del df
except Exception as e:
sys.exit(e)

# Standardize (or normalize) each dataset.
max_on_power = common.params_appliance[appliance]['max_on_power']

# Standardize (or normalize) each dataset and add status.
for _, file_name in enumerate(os.listdir(path)):
file_path = os.path.join(path, file_name)

df = load(file_path)

print(f'\nStatistics for {file_name}:')
print(df.iloc[:,0].describe())
print(df.iloc[:,1].describe())

if common.USE_ALT_STANDARDIZATION:
print('Using alt standardization')
print(f'\n*** Working on {file_name} ***')
print('Raw dataset statistics:')
print(df.loc[:, 'aggregate'].describe())
print(df.loc[:, appliance].describe())

# Limit appliance power to [0, max_on_power].
print(f'Limiting appliance power to [0, {max_on_power}]')
df.loc[:, appliance] = df.loc[:, appliance].clip(0, max_on_power)

# Get appliance status and add to end of dataframe.
print('Computing on-off status.')
status = common.compute_status(df.loc[:, appliance].to_numpy(), appliance)
df.insert(2, 'status', status)
num_on = len(df[df["status"]==1])
num_off = len(df[df["status"]==0])
print(f'Number of samples with on status: {num_on}')
print(f'Number of samples with off status: {num_off}')
assert num_on + num_off == df.iloc[:, 2].size

# Standardize aggregate dataset.
agg_mean = common.ALT_AGGREGATE_MEAN if common.USE_ALT_STANDARDIZATION else train_agg_mean
agg_std = common.ALT_AGGREGATE_STD if common.USE_ALT_STANDARDIZATION else train_agg_std
print(f'\nStandardizing aggregate dataset with mean = {agg_mean} and std = {agg_std}.')
df.iloc[:,0] = (df.iloc[:,0] - agg_mean) / agg_std

# Standardize appliance dataset.
alt_app_mean = common.params_appliance[args.appliance]['alt_app_mean']
alt_app_std = common.params_appliance[args.appliance]['alt_app_std']
app_mean = alt_app_mean if common.USE_ALT_STANDARDIZATION else train_app_mean
app_std = alt_app_std if common.USE_ALT_STANDARDIZATION else train_app_std
print(f'\nStandardizing appliance dataset with mean = {app_mean} and std = {app_std}.')
df.iloc[:,1] = (df.iloc[:,1] - app_mean) / app_std
print(f'Standardizing aggregate dataset with mean = {agg_mean} and std = {agg_std}.')
df.loc[:, 'aggregate'] = (df.loc[:, 'aggregate'] - agg_mean) / agg_std

# Scale appliance dataset.
if common.USE_APPLIANCE_NORMALIZATION:
# Normalize appliance dataset to [0, max_on_power].
min = 0
max = max_on_power
print(f'Normalizing appliance dataset with min = {min} and max = {max}.')
df.loc[:, appliance] = (df.loc[:, appliance] - min) / (max - min)
else:
# Standardize appliance dataset.
alt_app_mean = common.params_appliance[appliance]['alt_app_mean']
alt_app_std = common.params_appliance[appliance]['alt_app_std']
app_mean = alt_app_mean if common.USE_ALT_STANDARDIZATION else train_app_mean
app_std = alt_app_std if common.USE_ALT_STANDARDIZATION else train_app_std
print('Using alt standardization.' if common.USE_ALT_STANDARDIZATION
else 'Using default standardization.')
print(f'Standardizing appliance dataset with mean = {app_mean} and std = {app_std}.')
df.loc[:, appliance] = (df.loc[:, appliance] - app_mean) / app_std

### Other ways of scaling the datasets are commented out below ###
### The current method seems to give the best results ###

# Remove outliers.
# compute z-scores for all values
# THIS TAKES FOREVER - DO NOT USE
#df['z-score'] = df[args.appliance].apply(lambda x: get_zscore(x, df[args.appliance]))
#df['z-score'] = df[appliance].apply(lambda x: get_zscore(x, df[appliance]))
#outliers = df[df['z-score'] > 6]
#print(outliers)
#exit()
@@ -160,26 +185,26 @@ def get_zscore(value, values):
# Normalize appliance dataset to [0, 1].
#min = df.iloc[:,1].min()
#max = df.iloc[:,1].max()
#print(f'\nNormalizing appliance dataset with min = {min} and max = {max}')
#df.iloc[:,1] = (df.iloc[:,1] - min) / (max - min)
#print(f'Normalizing appliance dataset with min = {min} and max = {max}')
#df.iloc[:, 1] = (df.iloc[:, 1] - min) / (max - min)

print(f'\nStatistics for {file_name} after scaling:')
print(df.iloc[:,0].describe())
print(df.iloc[:,1].describe())
print(f'Statistics for {file_name} after scaling:')
print(df.loc[:, 'aggregate'].describe())
print(df.loc[:, appliance].describe())

# Show dataset histograms.
df.iloc[:,0].hist()
df.loc[:, 'aggregate'].hist()
plt.title(f'Histogram for {file_name} aggregate')
plt.show()
df.iloc[:,1].hist()
plt.title(f'Histogram for {file_name} {args.appliance}')
df.loc[:, appliance].hist()
plt.title(f'Histogram for {file_name} {appliance}')
plt.show()

# Check for NaNs.
print(f'\nNaNs present: {df.isnull().values.any()}')
print(f'NaNs present: {df.isnull().values.any()}')

# Save scaled dataset and overwrite existing csv.
print(f'\nSaving dataset to {file_path}.')
print(f'*** Saving dataset to {file_path}. ***')
df.to_csv(file_path, index=False)

del df

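For reference, the scaling pipeline applied above can be summarized in a small, self-contained sketch. The threshold-based on/off rule and the names simple_compute_status and scale_dataset are assumptions for illustration; the repository's common.compute_status and its per-appliance parameters (e.g. max_on_power) may use different values and logic.

import numpy as np
import pandas as pd

ON_POWER_THRESHOLD = 2000.0  # watts; assumed on-power threshold for a kettle
MAX_ON_POWER = 3998.0        # assumed max_on_power for a kettle

def simple_compute_status(appliance_power: np.ndarray, threshold: float) -> np.ndarray:
    """Return 1 where the appliance draws more than `threshold` watts, else 0."""
    return (appliance_power > threshold).astype(int)

def scale_dataset(df: pd.DataFrame, appliance: str,
                  agg_mean: float, agg_std: float) -> pd.DataFrame:
    """Clip appliance power, add status, standardize aggregate, normalize appliance."""
    df = df.copy()
    # Limit appliance power to [0, MAX_ON_POWER].
    df[appliance] = df[appliance].clip(0, MAX_ON_POWER)
    # Add the on-off status as an extra column.
    df['status'] = simple_compute_status(df[appliance].to_numpy(), ON_POWER_THRESHOLD)
    # Standardize the aggregate signal with training-set statistics.
    df['aggregate'] = (df['aggregate'] - agg_mean) / agg_std
    # Normalize the appliance signal to [0, 1] using the max on-power.
    df[appliance] = df[appliance] / MAX_ON_POWER
    return df

# Example with random data and the aggregate statistics used elsewhere in this commit.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        'aggregate': rng.uniform(100, 4000, size=8),
        'kettle': rng.uniform(0, 3000, size=8),
    })
    print(scale_dataset(demo, 'kettle', agg_mean=522.0, agg_std=822.0))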