diff --git a/ml/dataset_management/refit/dataset_plot.py b/ml/dataset_management/refit/dataset_plot.py
index 5a3f19a..cbd3959 100644
--- a/ml/dataset_management/refit/dataset_plot.py
+++ b/ml/dataset_management/refit/dataset_plot.py
@@ -1,17 +1,32 @@
-from nilm.Arguments import *
+#from nilm.Arguments import *
 import numpy as np
 import os
 import matplotlib.pyplot as plt
 import pandas as pd
+import argparse
 
-appliance_name = 'fridge'
+appliance_name = 'kettle'
 
 #dataset = 'training'
 dataset = 'test'
 #dataset = 'validation'
 #dataset = 'train'
 
-for filename in os.listdir(args.datadir + appliance_name):
+DATA_DIRECTORY = '/home/lindo/Develop/nilm/ml/dataset_management/refit/'
+
+def get_arguments():
+    parser = argparse.ArgumentParser(description='sequence to point learning \
+        example for NILM')
+    parser.add_argument('--data_dir', type=str, default=DATA_DIRECTORY,
+                        help='The directory containing the CLEAN REFIT data')
+    parser.add_argument('--appliance_name', type=str, default='kettle',
+                        help='which appliance you want to train: kettle,\
+                        microwave,fridge,dishwasher,washingmachine')
+    return parser.parse_args()
+
+args = get_arguments()
+
+for filename in os.listdir(args.data_dir + appliance_name):
     if dataset == 'train' and dataset.upper() in filename.upper() and 'TEST' in filename.upper():
         test_filename = filename
     elif dataset == 'training' and dataset.upper() in filename.upper():
@@ -23,9 +38,9 @@
 
 chunksize = 10 ** 6
 
-for idx, chunk in enumerate(pd.read_csv(args.datadir + appliance_name + '/' + test_filename,
+for idx, chunk in enumerate(pd.read_csv(args.data_dir + appliance_name + '/' + 'kettle_training_.csv',
                                          # index_col=False,
-                                         names=['aggregate', appliance_name],
+                                         names=['aggregate', appliance_name, 'status'],
                                          # usecols=[1, 2],
                                          # iterator=True,
                                          #skiprows=15 * 10 ** 6,
@@ -34,9 +49,9 @@
                                          )):
 
     # de-normalization
-    chunk['aggregate'] = chunk['aggregate'] * 822 + 522
-    chunk[appliance] = chunk[appliance] * params_appliance[args.appliance_name]['std'] \
-    + params_appliance[args.appliance_name]['mean']
+    #chunk['aggregate'] = chunk['aggregate'] * 822 + 522
+    #chunk[appliance] = chunk[appliance] * params_appliance[args.appliance_name]['std'] \
+    #+ params_appliance[args.appliance_name]['mean']
 
     fig = plt.figure(num='Figure {:}'.format(idx))
 
@@ -44,15 +59,16 @@
 
     ax1.plot(chunk['aggregate'])
     ax1.plot(chunk[appliance_name])
+    ax1.plot(chunk['status'])
     ax1.grid()
     ax1.set_title('{:}'.format(test_filename), fontsize=14, fontweight='bold')
     ax1.set_ylabel('Power normalized')
     ax1.set_xlabel('samples')
-    ax1.legend(['aggregate', appliance_name])
+    ax1.legend(['aggregate', appliance_name, 'status'])
 
     mng = plt.get_current_fig_manager()
     mng.resize(*mng.window.maxsize())
 
-    plt.show(fig)
+    plt.show()
 
-    del chunk
+    del chunk
\ No newline at end of file
diff --git a/ml/dataset_management/refit/normalize_dataset.py b/ml/dataset_management/refit/normalize_dataset.py
index 60f3633..cf250bd 100644
--- a/ml/dataset_management/refit/normalize_dataset.py
+++ b/ml/dataset_management/refit/normalize_dataset.py
@@ -1,4 +1,4 @@
-"""Scale datasets created by create_new_dataset.py.
+"""Scale datasets created by create_new_dataset.py and add on-off status.
 
 Copyright (c) 2023 Lindo St. Angel
 """
@@ -59,60 +59,85 @@ def get_zscore(value, values):
     args = parser.parse_args()
 
-    print(f'Target appliance: {args.appliance}')
+    appliance = args.appliance
 
-    path = os.path.join(args.datadir, args.appliance)
+    print(f'Target appliance: {appliance}')
+
+    path = os.path.join(args.datadir, appliance)
 
     # Get statistics from training dataset.
-    train_file_name = os.path.join(path, f'{args.appliance}_training_.csv')
+    train_file_name = os.path.join(path, f'{appliance}_training_.csv')
     try:
         df = load(train_file_name)
+        aggregate_power = df.loc[:, 'aggregate']
+        appliance_power = df.loc[:, appliance]
 
-        # Remove outliers.
-        #df = df[df < 10 * df.iloc[:,0].std()]
-
-        train_agg_mean = df.iloc[:,0].mean()
-        train_agg_std = df.iloc[:,0].std()
+        train_agg_mean = aggregate_power.mean()
+        train_agg_std = aggregate_power.std()
         print(f'Training aggregate mean = {train_agg_mean}, std = {train_agg_std}')
 
-        train_app_mean = df.iloc[:,1].mean()
-        train_app_std = df.iloc[:,1].std()
+        train_app_mean = appliance_power.mean()
+        train_app_std = appliance_power.std()
         print(f'Training appliance mean = {train_app_mean}, std = {train_app_std}')
 
-        train_app_min = df.iloc[:,1].min()
-        train_app_max = df.iloc[:,1].max()
+        train_app_min = appliance_power.min()
+        train_app_max = appliance_power.max()
         print(f'Training appliance min = {train_app_min}, max = {train_app_max}')
 
         del df
     except Exception as e:
         sys.exit(e)
 
-    # Standardize (or normalize) each dataset.
+    max_on_power = common.params_appliance[appliance]['max_on_power']
+
+    # Standardize (or normalize) each dataset and add status.
     for _, file_name in enumerate(os.listdir(path)):
         file_path = os.path.join(path, file_name)
 
         df = load(file_path)
 
-        print(f'\nStatistics for {file_name}:')
-        print(df.iloc[:,0].describe())
-        print(df.iloc[:,1].describe())
-
-        if common.USE_ALT_STANDARDIZATION:
-            print('Using alt standardization')
+        print(f'\n*** Working on {file_name} ***')
+        print('Raw dataset statistics:')
+        print(df.loc[:, 'aggregate'].describe())
+        print(df.loc[:, appliance].describe())
+
+        # Limit appliance power to [0, max_on_power].
+        print(f'Limiting appliance power to [0, {max_on_power}]')
+        df.loc[:, appliance] = df.loc[:, appliance].clip(0, max_on_power)
+
+        # Get appliance status and add to end of dataframe.
+        print('Computing on-off status.')
+        status = common.compute_status(df.loc[:, appliance].to_numpy(), appliance)
+        df.insert(2, 'status', status)
+        num_on = len(df[df["status"]==1])
+        num_off = len(df[df["status"]==0])
+        print(f'Number of samples with on status: {num_on}')
+        print(f'Number of samples with off status: {num_off}')
+        assert num_on + num_off == df.iloc[:, 2].size
 
         # Standardize aggregate dataset.
         agg_mean = common.ALT_AGGREGATE_MEAN if common.USE_ALT_STANDARDIZATION else train_agg_mean
         agg_std = common.ALT_AGGREGATE_STD if common.USE_ALT_STANDARDIZATION else train_agg_std
-        print(f'\nStandardizing aggregate dataset with mean = {agg_mean} and std = {agg_std}.')
-        df.iloc[:,0] = (df.iloc[:,0] - agg_mean) / agg_std
-
-        # Standardize appliance dataset.
-        alt_app_mean = common.params_appliance[args.appliance]['alt_app_mean']
-        alt_app_std = common.params_appliance[args.appliance]['alt_app_std']
-        app_mean = alt_app_mean if common.USE_ALT_STANDARDIZATION else train_app_mean
-        app_std = alt_app_std if common.USE_ALT_STANDARDIZATION else train_app_std
-        print(f'\nStandardizing appliance dataset with mean = {app_mean} and std = {app_std}.')
-        df.iloc[:,1] = (df.iloc[:,1] - app_mean) / app_std
+        print(f'Standardizing aggregate dataset with mean = {agg_mean} and std = {agg_std}.')
+        df.loc[:, 'aggregate'] = (df.loc[:, 'aggregate'] - agg_mean) / agg_std
+
+        # Scale appliance dataset.
+        if common.USE_APPLIANCE_NORMALIZATION:
+            # Normalize appliance dataset to [0, max_on_power].
+            min = 0
+            max = max_on_power
+            print(f'Normalizing appliance dataset with min = {min} and max = {max}.')
+            df.loc[:, appliance] = (df.loc[:, appliance] - min) / (max - min)
+        else:
+            # Standardize appliance dataset.
+            alt_app_mean = common.params_appliance[appliance]['alt_app_mean']
+            alt_app_std = common.params_appliance[appliance]['alt_app_std']
+            app_mean = alt_app_mean if common.USE_ALT_STANDARDIZATION else train_app_mean
+            app_std = alt_app_std if common.USE_ALT_STANDARDIZATION else train_app_std
+            print('Using alt standardization.' if common.USE_ALT_STANDARDIZATION
+                  else 'Using default standardization.')
+            print(f'Standardizing appliance dataset with mean = {app_mean} and std = {app_std}.')
+            df.loc[:, appliance] = (df.loc[:, appliance] - app_mean) / app_std
 
         ### Other ways of scaling the datasets are commented out below ###
         ### The current method seems to give the best results ###
 
@@ -120,7 +145,7 @@ def get_zscore(value, values):
         # Remove outliers.
         # compute z-scores for all values
         # THIS TAKES FOREVER - DO NOT USE
-        #df['z-score'] = df[args.appliance].apply(lambda x: get_zscore(x, df[args.appliance]))
+        #df['z-score'] = df[appliance].apply(lambda x: get_zscore(x, df[appliance]))
         #outliers = df[df['z-score'] > 6]
         #print(outliers)
         #exit()
@@ -160,26 +185,26 @@ def get_zscore(value, values):
         # Normalize appliance dataset to [0, 1].
         #min = df.iloc[:,1].min()
         #max = df.iloc[:,1].max()
-        #print(f'\nNormalizing appliance dataset with min = {min} and max = {max}')
-        #df.iloc[:,1] = (df.iloc[:,1] - min) / (max - min)
+        #print(f'Normalizing appliance dataset with min = {min} and max = {max}')
+        #df.iloc[:, 1] = (df.iloc[:, 1] - min) / (max - min)
 
-        print(f'\nStatistics for {file_name} after scaling:')
-        print(df.iloc[:,0].describe())
-        print(df.iloc[:,1].describe())
+        print(f'Statistics for {file_name} after scaling:')
+        print(df.loc[:, 'aggregate'].describe())
+        print(df.loc[:, appliance].describe())
 
         # Show dataset histograms.
-        df.iloc[:,0].hist()
+        df.loc[:, 'aggregate'].hist()
         plt.title(f'Histogram for {file_name} aggregate')
         plt.show()
-        df.iloc[:,1].hist()
-        plt.title(f'Histogram for {file_name} {args.appliance}')
+        df.loc[:, appliance].hist()
+        plt.title(f'Histogram for {file_name} {appliance}')
         plt.show()
 
         # Check for NaNs.
-        print(f'\nNaNs present: {df.isnull().values.any()}')
+        print(f'NaNs present: {df.isnull().values.any()}')
 
         # Save scaled dataset and overwrite existing csv.
-        print(f'\nSaving dataset to {file_path}.')
+        print(f'*** Saving dataset to {file_path}. ***')
         df.to_csv(file_path, index=False)
 
         del df
\ No newline at end of file
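
Note on the new 'status' column: the patch calls common.compute_status() to derive per-sample on/off labels, but that helper is not part of this diff. As a rough sketch of the idea only (an assumption about how such a function might work, not the actual code in common.py), the status can be obtained by thresholding the clipped appliance power; the function and threshold below are hypothetical and purely illustrative.

# Hypothetical sketch -- NOT the real common.compute_status(), which may also
# enforce minimum on/off durations or use per-appliance parameters.
import numpy as np

def compute_status_sketch(appliance_power: np.ndarray, on_power_threshold: float) -> np.ndarray:
    """Return 1 where appliance power exceeds the on-power threshold, else 0."""
    return (appliance_power > on_power_threshold).astype(int)

# Example: a kettle drawing ~2.5 kW is clearly on; standby readings are off.
status = compute_status_sketch(np.array([0.0, 10.0, 2500.0, 2600.0, 5.0]), on_power_threshold=2000.0)
print(status)  # -> [0 0 1 1 0]

Because the appliance power has already been clipped to [0, max_on_power], the USE_APPLIANCE_NORMALIZATION branch then maps it linearly onto [0, 1], while the status column remains a plain 0/1 label alongside the scaled aggregate and appliance readings.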