# looks very similar to the camels script
# except the catchment attributes are different
import pickle
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
pd.set_option("display.precision", 2)
# grab site attributes from the usgs rest api (only needs to be done once)
'''
# load sites from text file
with open("usgs_modpods_sites.txt",'r') as fp:
    sites = fp.read().splitlines()
attributes = pd.DataFrame()
for site_id in sites:
    #print("\n" + str(site_id) + "\n")
    request_string = str("https://waterservices.usgs.gov/nwis/site/?format=rdb&sites=" + str(site_id) + "&siteOutput=expanded&siteStatus=all")
    request_string = request_string.replace(" ","")
    print(request_string)
    response = pd.read_csv(request_string, sep='\t', comment = '#', header=[0])
    response = response[1:] # get rid of first row (data type)
    #print(response.columns)
    response.set_index('site_no',inplace=True)
    # subset of hydrologically relevant attributes
    response = response[['site_tp_cd','dec_lat_va','dec_long_va',
                         'state_cd','alt_va','huc_cd','basin_cd','topo_cd',
                         'drain_area_va','contrib_drain_area_va']]
    #print(response)
    if attributes.empty:
        # initialize attributes dataframe
        attributes = response
    else:
        # append to attributes dataframe
        attributes = pd.concat((attributes,response),axis='index')
print(attributes)
# save attributes to csv
attributes.to_csv("usgs_modpods_attributes.csv")
'''
# load attributes from csv
attributes = pd.read_csv("usgs_modpods_attributes.csv",index_col=0)
print(attributes)
# for grabbing model performance and parameters
folder_path = str('G:/My Drive/PhD Admin and Notes/paper1/revisions-code/usgs_modpods_results')
shifted_eval = dict()
# below is (nearly) verbatim from camels_parameter_correlation.py
# separate results by polynomial order and number of transformations
p1t1 = pd.DataFrame(np.empty((0, 9)))
p1t2 = pd.DataFrame(np.empty((0, 9)))
p2t1 = pd.DataFrame(np.empty((0, 9)))
p2t2 = pd.DataFrame(np.empty((0, 9)))
p3t1 = pd.DataFrame(np.empty((0, 9)))
p3t2 = pd.DataFrame(np.empty((0, 9)))
# walk over results grabbing error metrics and model parameters
print("grabbing error metrics")
for subdir, dirs, files in os.walk(folder_path):
    for file in files:
        # grabbing error metrics
        if ("error_metrics" in str(os.path.join(subdir, file))):
            # only look at the shifted models for now
            if ("eval" in str(os.path.join(subdir, file))):
                if ("shift" in str(os.path.join(subdir, file))):
                    # there's a better way than using literal string indices here
                    #print(str(subdir)[75:75+8])
                    site_id = int(str(subdir)[75:75+8])
                    print(site_id)
                    shifted_eval[site_id] = pd.read_csv(str(os.path.join(subdir, file)),index_col=0)
                    if ("po_1" in str(os.path.join(subdir, file))):
                        if p1t1.empty:
                            p1t1.columns = shifted_eval[site_id].columns
                        p1t1.loc[site_id,:] = shifted_eval[site_id].loc[1,:]
                        try:
                            if p1t2.empty:
                                p1t2.columns = shifted_eval[site_id].columns
                            p1t2.loc[site_id,:] = shifted_eval[site_id].loc[2,:]
                        except Exception:
                            pass
                    if ("po_2" in str(os.path.join(subdir, file))):
                        shifted_eval[site_id] = pd.read_csv(str(os.path.join(subdir, file)))
                        #print(shifted_eval[site_id])
                        try:
                            if p2t1.empty:
                                p2t1.columns = shifted_eval[site_id].columns
                            p2t1.loc[site_id,:] = shifted_eval[site_id].loc[1,:]
                        except Exception:
                            pass # that model config wasn't trained, not an error
                        try:
                            if p2t2.empty:
                                p2t2.columns = shifted_eval[site_id].columns
                            p2t2.loc[site_id,:] = shifted_eval[site_id].loc[2,:]
                        except Exception:
                            pass
                    if ("po_3" in str(os.path.join(subdir, file))):
                        shifted_eval[site_id] = pd.read_csv(str(os.path.join(subdir, file)))
                        #print(shifted_eval[site_id])
                        try:
                            if p3t1.empty:
                                p3t1.columns = shifted_eval[site_id].columns
                            p3t1.loc[site_id,:] = shifted_eval[site_id].loc[1,:]
                        except Exception:
                            pass # that model config wasn't trained, not an error
                        try:
                            if p3t2.empty:
                                p3t2.columns = shifted_eval[site_id].columns
                            p3t2.loc[site_id,:] = shifted_eval[site_id].loc[2,:]
                        except Exception:
                            pass
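# note: site_id above is recovered by slicing the path at hard-coded character positions
# (str(subdir)[75:75+8]), which silently breaks if folder_path changes length. one possible
# (untested) alternative, assuming the 8-digit gauge id appears somewhere in the directory
# path, would be a regex search instead of literal indices:
'''
import re
id_match = re.search(r"\d{8}", subdir)  # assumes the gauge id appears in the path
if id_match:
    site_id = int(id_match.group(0))
'''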
print(p1t1)
p1t1[['auto1','instant1','tr1_1','tr1_shape','tr1_scale','tr1_loc','tr1_tP','tr1_t50']] = np.nan
p1t2[['auto1','instant1','tr1_1','tr2_1','tr1_shape',
'tr1_scale','tr1_loc','tr1_tP','tr1_t50',
'tr2_shape','tr2_scale','tr2_loc','tr2_tP','tr2_t50']] = np.nan
# might not interpret model parameters for any higher order models than these
print("grabbing model parameters")
for subdir, dirs, files in os.walk(folder_path):
    for file in files:
        # grabbing model parameters
        if ("rainfall_runoff_model" in str(os.path.join(subdir, file)) and "shift" in str(os.path.join(subdir, file))):
            # open the file using pickle
            with open(str(os.path.join(subdir, file)), 'rb') as f:
                model = pickle.load(f)
            #print(model)
            if ("po_1" in str(os.path.join(subdir, file))):
                #print(str(os.path.join(subdir, file)))
                site_id = int(str(subdir)[75:75+8])
                #print(site_id)
                #print(str(os.path.join(subdir, file)))
                # coefficient terms for p1t1
                #print(model[1]['final_model']['model'].coefficients())
                auto1 = model[1]['final_model']['model'].coefficients()[0][0]
                instant1 = model[1]['final_model']['model'].coefficients()[0][1]
                tr1_1 = model[1]['final_model']['model'].coefficients()[0][2]
                # transformation parameters for p1t1
                tr1_shape = model[1]['shape_factors'].iloc[0,0]
                tr1_scale = model[1]['scale_factors'].iloc[0,0]
                tr1_loc = model[1]['loc_factors'].iloc[0,0]
                tr1_tP = (tr1_shape - 1)/(1/tr1_scale) + tr1_loc
                # the scale parameter in scipy.stats.gamma is the inverse of beta
                tr1_t50 = (tr1_shape)/(1/tr1_scale) + tr1_loc
                p1t1.loc[site_id,'auto1'] = auto1
                p1t1.loc[site_id,'instant1'] = instant1
                p1t1.loc[site_id,'tr1_1'] = tr1_1
                p1t1.loc[site_id,'tr1_shape'] = tr1_shape
                p1t1.loc[site_id,'tr1_scale'] = tr1_scale
                p1t1.loc[site_id,'tr1_loc'] = tr1_loc
                p1t1.loc[site_id,'tr1_tP'] = tr1_tP
                p1t1.loc[site_id,'tr1_t50'] = tr1_t50
                #print(p1t1)
                # coefficient terms for p1t2
                #print(model[2]['final_model']['model'].coefficients())
                auto1 = model[2]['final_model']['model'].coefficients()[0][0]
                instant1 = model[2]['final_model']['model'].coefficients()[0][1]
                tr1_1 = model[2]['final_model']['model'].coefficients()[0][2]
                tr2_1 = model[2]['final_model']['model'].coefficients()[0][3]
                # transformation parameters for p1t2
                tr1_shape = model[2]['shape_factors'].iloc[0,0]
                tr1_scale = model[2]['scale_factors'].iloc[0,0]
                tr1_loc = model[2]['loc_factors'].iloc[0,0]
                tr1_tP = (tr1_shape - 1)/(1/tr1_scale) + tr1_loc
                tr1_t50 = (tr1_shape)/(1/tr1_scale) + tr1_loc
                tr2_shape = model[2]['shape_factors'].iloc[1,0]
                tr2_scale = model[2]['scale_factors'].iloc[1,0]
                tr2_loc = model[2]['loc_factors'].iloc[1,0]
                tr2_tP = (tr2_shape - 1)/(1/tr2_scale) + tr2_loc
                tr2_t50 = (tr2_shape)/(1/tr2_scale) + tr2_loc
                p1t2.loc[site_id,'auto1'] = auto1
                p1t2.loc[site_id,'instant1'] = instant1
                p1t2.loc[site_id,'tr1_1'] = tr1_1
                p1t2.loc[site_id,'tr2_1'] = tr2_1
                p1t2.loc[site_id,'tr1_shape'] = tr1_shape
                p1t2.loc[site_id,'tr1_scale'] = tr1_scale
                p1t2.loc[site_id,'tr1_loc'] = tr1_loc
                p1t2.loc[site_id,'tr1_tP'] = tr1_tP
                p1t2.loc[site_id,'tr1_t50'] = tr1_t50
                p1t2.loc[site_id,'tr2_shape'] = tr2_shape
                p1t2.loc[site_id,'tr2_scale'] = tr2_scale
                p1t2.loc[site_id,'tr2_loc'] = tr2_loc
                p1t2.loc[site_id,'tr2_tP'] = tr2_tP
                p1t2.loc[site_id,'tr2_t50'] = tr2_t50
                #print(p1t2)
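# aside: tr*_tP above works out to (shape - 1)*scale + loc, the mode of the gamma
# transformation, and tr*_t50 to shape*scale + loc, the mean of the shifted gamma.
# a quick, optional sanity check with scipy (the parameter values below are made up
# purely for illustration) would look like:
'''
from scipy import stats
shape, scale, loc = 3.0, 2.0, 1.0  # hypothetical transformation parameters
t = np.linspace(loc, loc + 10*shape*scale, 10000)
pdf = stats.gamma.pdf(t, a=shape, loc=loc, scale=scale)
print(t[np.argmax(pdf)])           # numerical peak of the pdf
print((shape - 1)*scale + loc)     # tP as computed above; should agree closely
print(shape*scale + loc)           # gamma mean + loc (the t50 value above)
'''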
print("p1t1")
print(p1t1)
print("p1t2")
print(p1t2)
print("p2t1")
print(p2t1)
# add catchment characteristics to the dataframes
# and then drop rows that have no evaluation results (nan error metrics)
p1t1 = pd.concat((p1t1,attributes),axis=1)
#p1t1 = p1t1.dropna(subset=p1t1.columns[0:17])
p1t1 = p1t1.dropna(subset=['NSE'])
p1t2 = pd.concat((p1t2, attributes), axis=1)
#p1t2 = p1t2.dropna(subset=p1t2.columns[0:23])
p1t2 = p1t2.dropna(subset=['NSE'])
p2t1 = pd.concat((p2t1, attributes), axis=1)
p2t1 = p2t1.dropna(subset=p2t1.columns[0:9])
p2t2 = pd.concat((p2t2, attributes), axis=1)
p2t2 = p2t2.dropna(subset=p2t2.columns[0:9])
p3t1 = pd.concat((p3t1, attributes), axis=1)
p3t1 = p3t1.dropna(subset=p3t1.columns[0:9])
p3t2 = pd.concat((p3t2, attributes), axis=1)
p3t2 = p3t2.dropna(subset=p3t2.columns[0:9])
print(p1t1)
print(p1t2)
# for viewing the correlations want a different display format
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# these indices need to be edited because training record length and training time are tracked now,
# but grabbing those columns for a model that doesn't have them will raise an error,
# so the old results would need to be excluded to examine those correlations.
# the correlation between eval nse and training record length could be examined in a separate script,
# probably with a scatter plot (see the commented-out sketch at the end of this script).
p1t1corr = p1t1.corr().iloc[17:,:17]
#print(p1t1corr.to_string())
# show the strongest 20 p1t1corr
#print("strongest correlations in p1t1")
#print(p1t1corr.abs().unstack().sort_values(ascending=False).drop_duplicates()[0:20])
# show the strongest positive and negative correlations in p1t1, preserving their signs
print("strongest positive and negative correlations in p1t1")
print(p1t1corr.unstack().sort_values(ascending=False).drop_duplicates()[:30])
print(p1t1corr.unstack().sort_values(ascending=False).drop_duplicates()[-30:])
print("full p1t1 correlation matrix")
print(p1t1corr.to_string())
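# optionally, the full p1t1 correlation matrix is easier to scan as a heatmap. the sketch
# below only relies on the matplotlib import at the top of the script plus an available
# plotting backend; it is wrapped in a triple-quoted string so it doesn't run by default.
'''
fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(p1t1corr.values, cmap='coolwarm', vmin=-1, vmax=1)
ax.set_xticks(range(len(p1t1corr.columns)))
ax.set_xticklabels(p1t1corr.columns, rotation=90)
ax.set_yticks(range(len(p1t1corr.index)))
ax.set_yticklabels(p1t1corr.index)
fig.colorbar(im, ax=ax, label='pearson r')
plt.tight_layout()
plt.show()
'''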
p1t2corr = p1t2.corr().iloc[23:,:23]
#print("full p1t2 correlation matrix")
#print(p1t2corr.to_string())
# show the strongest 20 p1t2corr
#print("strongest correlations in p1t2")
#print(p1t2corr.abs().unstack().sort_values(ascending=False).drop_duplicates()[0:20])
#print("strongest 20 correlations in p1t2")
#print(p1t2corr.unstack().sort_values(ascending=False).drop_duplicates()[:30])
#print(p1t2corr.unstack().sort_values(ascending=False).drop_duplicates()[-30:])
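# the scatter plot suggested above (eval nse vs. training record length) could look roughly
# like the sketch below. the column name 'train_length' is hypothetical; the actual name of
# the training-record-length column in the newer results may differ.
'''
newer = p1t1.dropna(subset=['train_length'])  # 'train_length' is a placeholder column name
plt.scatter(newer['train_length'], newer['NSE'])
plt.xlabel('training record length')
plt.ylabel('eval NSE')
plt.title('p1t1: eval NSE vs training record length')
plt.show()
'''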