# usgs_shift_or_no_shift.py

# -*- coding: utf-8 -*-
"""usgs-shift-or-no-shift.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1zNdwJsl-frHoxcEZj8HwwNRdxh1D35Ls
"""

#from google.colab import drive
#drive.mount('/content/drive',force_remount=True)
#!pip3 install pysindy
#import pysindy as ps
import scipy.stats as stats
import os
import pandas as pd
import numpy as np
pd.set_option("display.precision", 3)
import matplotlib.pyplot as plt

# Root folder holding one sub-directory per site. On Colab the Google Drive
# mount point is used instead, e.g.
# '/content/drive/MyDrive/PhD Admin and Notes/paper1/revisions-code/usgs_modpods_results'.
folder_path = "G:/My Drive/PhD Admin and Notes/paper1/revisions-code/usgs_modpods_results"


def site_id_from_subdir(subdir, root, id_length=8):
    """Return the site id encoded in the first directory level below *root*.

    The id is the first ``id_length`` characters of the first path component
    of *subdir* relative to *root*. Deriving it from the relative path (rather
    than from a fixed character offset into the absolute path) keeps the
    result correct when the results folder is moved or renamed.

    Args:
        subdir: Directory path as yielded by ``os.walk(root)``.
        root: The folder the walk started from.
        id_length: Number of leading characters that make up the site id.

    Returns:
        The site id string, or ``""`` when *subdir* is *root* itself.
    """
    relative = os.path.relpath(subdir, root)
    if relative == os.curdir:
        return ""
    return relative.split(os.sep, 1)[0][:id_length]


def classify_metrics_file(path):
    """Classify an error-metrics file path by dataset split and model variant.

    Only files whose path contains both ``"error_metrics"`` and ``"po_1"``
    (the linear models, per the original author's note) are classified.

    Args:
        path: Full path of the candidate file.

    Returns:
        A ``(split, variant)`` tuple with ``split`` in ``{"train", "eval"}``
        and ``variant`` in ``{"shifted", "noshift"}``, or ``None`` when the
        file should be ignored.
    """
    if "error_metrics" not in path or "po_1" not in path:
        return None
    if "training" in path:
        split = "train"
    elif "eval" in path:
        split = "eval"
    else:
        return None
    variant = "noshift" if "no_shift" in path else "shifted"
    return split, variant


# Per-site error-metric tables, keyed by site id.
shifted_train = dict()
shifted_eval = dict()
noshift_train = dict()
noshift_eval = dict()
performance_summaries = dict()

# Maps a (split, variant) classification to the dictionary that stores it.
metric_tables = {
    ("train", "shifted"): shifted_train,
    ("eval", "shifted"): shifted_eval,
    ("train", "noshift"): noshift_train,
    ("eval", "noshift"): noshift_eval,
}

for subdir, dirs, files in os.walk(folder_path):
    print(subdir)
    site_id = site_id_from_subdir(subdir, folder_path)
    for file in files:
        file_path = os.path.join(subdir, file)
        if "error_metrics" not in file_path:
            continue
        print(site_id)
        table_key = classify_metrics_file(file_path)
        if table_key is not None:
            # A later file with the same classification overwrites an earlier
            # one for the same site, as in the original nested-if version.
            metric_tables[table_key][site_id] = pd.read_csv(file_path)

for site_id in shifted_train:
    print(shifted_train[site_id])

# For every site, keep the maximum value of the NSE column of its metrics
# table; one list per (shifted / not shifted) x (train / eval) combination.
shift_train_NSE = [metrics.NSE.max() for metrics in shifted_train.values()]
shift_eval_NSE = [metrics.NSE.max() for metrics in shifted_eval.values()]
noshift_train_NSE = [metrics.NSE.max() for metrics in noshift_train.values()]
noshift_eval_NSE = [metrics.NSE.max() for metrics in noshift_eval.values()]

# Dump the raw lists for a quick visual check.
for nse_values in (shift_train_NSE, shift_eval_NSE, noshift_train_NSE, noshift_eval_NSE):
    print(nse_values)

def _failure_rate(nse_values, threshold):
    """Return the fraction of *nse_values* strictly below *threshold*.

    Args:
        nse_values: Sequence of per-site NSE scores.
        threshold: Scores below this value count as failures.

    Returns:
        The failure fraction as a float, or NaN when *nse_values* is empty
        (e.g. no result files were found), instead of raising
        ``ZeroDivisionError``.
    """
    if not nse_values:
        return float("nan")
    return sum(1 for score in nse_values if score < threshold) / len(nse_values)


# Report order matches the original script: both splits at NSE < 0 first,
# then both splits at NSE < -100.
for threshold in (0, -100):
    for split_label, shifted_scores, unshifted_scores in (
        ("Train", shift_train_NSE, noshift_train_NSE),
        ("Eval", shift_eval_NSE, noshift_eval_NSE),
    ):
        print(f"{split_label} Failure Rate (NSE < {threshold})")
        print("Shift: ", _failure_rate(shifted_scores, threshold))
        print("No Shift:", _failure_rate(unshifted_scores, threshold))

# Number of sites contributing to each evaluation list.
print("length of noshift_eval_nse")
print(len(noshift_eval_NSE))
print("length of shift_eval_nse")
print(len(shift_eval_NSE))

# Evenly spaced values in [0, 1], one per entry of the matching NSE list;
# plotted against the sorted NSE values they form the cumulative curves below.
shift_train_bins = np.linspace(0, 1, len(shift_train_NSE))
shift_eval_bins = np.linspace(0, 1, len(shift_eval_NSE))
noshift_train_bins = np.linspace(0, 1, len(noshift_train_NSE))
noshift_eval_bins = np.linspace(0, 1, len(noshift_eval_NSE))

# (NSE values, cumulative fractions, matplotlib format string, legend label)
cdf_curves = (
    (shift_train_NSE, shift_train_bins, 'b--', 'Train [Shifted]'),
    (shift_eval_NSE, shift_eval_bins, 'r--', 'Evaluation [Shifted]'),
    (noshift_train_NSE, noshift_train_bins, 'g-', 'Train [Not Shifted]'),
    (noshift_eval_NSE, noshift_eval_bins, 'y-', 'Evaluation [Not Shifted]'),
)

plt.figure(figsize=(8, 8))
for nse_values, cumulative, line_format, curve_label in cdf_curves:
    plt.plot(np.sort(nse_values), cumulative, line_format, label=curve_label)

plt.xlabel("NSE", fontsize='x-large')
# Restrict the view to NSE in [-1, 1]; values outside this range are not shown.
plt.xlim([-1, 1])
plt.ylabel("Cumulative Density", fontsize='x-large')
plt.legend(fontsize='x-large', loc='best')
plt.title("USGS Gaging Stations", fontsize='x-large')

# Save the figure next to the results before showing it.
figure_path = f"{folder_path}/usgs_shift_or_no_shift.png"
plt.savefig(figure_path, format='png', dpi=600)
plt.show()