# soilDataForClassification.py
# Trying to classify soil based on world soil data
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import argmax
# Load in the HWSD Excel dataset
df = pd.read_excel('HWSD_DATA.xlsx')
df = df[df['ISSOIL'].isin([1])] # Drop all the non-soils
df = df[df.SU_CODE90.notnull()] # Drop all non-classified soils.
# Get rid of all of the useless columns
df = df.iloc[:, 7:]
# Fill NaN values in the numeric columns with the column medians.
df.fillna(df.median(numeric_only=True), inplace=True)
# Add a new feature for the absolute soil class (first two letters of the FAO-90 symbol).
df['SOIL_CLASS'] = df['SU_SYM90'].str[:2]
# Counts of each soil type are examined in the plots below.
# TODO: add a new feature for the most common grain size (a sketch follows the column
# pruning step below).
# Get rid of more columns I don't need
df = df.iloc[:, 4:]
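# One reading of the grain-size TODO above: take the most common grain size per sample
# as whichever topsoil fraction (clay, silt, or sand) is largest. Kept as a standalone
# series (not added to df) so the positional column slices later on are unaffected.
most_common_grain = df[['T_CLAY', 'T_SILT', 'T_SAND']].idxmax(axis=1)
print(most_common_grain.value_counts())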
'''
Below shows that most of our soil types have sample sizes of <20 and are largely
spread from 0 to 200; one soil type ('I'), however, contains almost 1000 samples.
'''
df['SU_SYM90'].unique()
x = df['SU_SYM90'].value_counts()
plt.hist(x, bins=20)
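# Quick numeric check of the claim above: how many soil types have fewer than 20
# samples, and how large the biggest one is.
print('{} of {} soil types have fewer than 20 samples; the largest has {}.'.format(
    (x < 20).sum(), len(x), x.max()))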
'''
Soil class value counts: similar to above, but for the absolute (two-letter) classes
rather than the full SU_SYM90 symbols.
'''
df['SOIL_CLASS'].unique()
x = df['SOIL_CLASS'].value_counts()
plt.hist(x)
y = df['SOIL_CLASS']
'''
Below shows the distribution of drainage classes, with 1 being very poor drainage and
6 being somewhat excessive. The most common class is 4, which is a moderately well
drained soil.
'''
plt.hist(df['DRAINAGE'], bins=6)
'''
Below shows the distribution of the texture classes. Class 2 (medium texture) is the
most common by a significant margin.
'''
plt.hist(df['T_TEXTURE'], bins=4)
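# Quantify that margin: the share of samples in each texture class (a quick check of
# the claim above, not part of the original flow).
print(df['T_TEXTURE'].value_counts(normalize=True).round(3))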
'''
Below shows the distributions of each type of soil particulate matter. Gravel is rarely
over 10% in soils, while sand commonly makes up over 30% of a soil.
'''
sns.pairplot(data=df, vars=['T_CLAY', 'T_SILT', 'T_SAND', 'T_GRAVEL'])
plt.hist(df['T_CLAY'].dropna(), bins=20)
plt.hist(df['T_SILT'].dropna(), bins=20)
plt.hist(df['T_SAND'].dropna(), bins=20)
plt.hist(df['T_GRAVEL'].dropna(), bins=20)
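# The four plt.hist calls above all draw onto the current axes; a 2x2 grid keeps the
# particulate distributions separate. This is just an alternative layout sketch.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
for ax, col in zip(axes.ravel(), ['T_CLAY', 'T_SILT', 'T_SAND', 'T_GRAVEL']):
    ax.hist(df[col].dropna(), bins=20)
    ax.set_title(col)
plt.tight_layout()
plt.show()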
# Encoding categorical variable for soil class
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
y = le.fit_transform(y)
y = y.reshape(-1, 1)  # OneHotEncoder expects a 2-D array.
# One-hot encode the target ('sparse' was renamed to 'sparse_output' in scikit-learn 1.2).
onehot = OneHotEncoder(sparse_output=False)
y = onehot.fit_transform(y)
x = df.iloc[:, 2:-1]
columns = list(x.columns.values)
# Some feature scaling
#from sklearn.preprocessing import StandardScaler
#sc_x = StandardScaler()
#x = sc_x.fit_transform(x) #need to fit and then xform
#Rename columns with original names
#x.columns = columns
# Create x and y train and test sets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
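# If the commented-out StandardScaler block above is revisited, fitting the scaler on
# the training split only avoids leaking test-set statistics. The scaled copies below
# use new names (X_train_scaled / X_test_scaled) and are not used by the forests that
# follow, since tree models do not need feature scaling; this is just a sketch.
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(sc_x.transform(X_train), columns=columns, index=X_train.index)
X_test_scaled = pd.DataFrame(sc_x.transform(X_test), columns=columns, index=X_test.index)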
# Random forest.
from sklearn.ensemble import RandomForestClassifier
rand_for = RandomForestClassifier()
rand_for.fit(X_train, y_train)
# Evaluate with 3-fold cross-validation on the training set.
from sklearn.model_selection import cross_val_predict, cross_val_score
cv_scores = cross_val_score(rand_for, X_train, y_train, cv=3, scoring='accuracy')
print('Cross-validation accuracy per fold:', cv_scores)
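# cross_val_predict is imported above but never used; out-of-fold predictions on the
# training set are one way to inspect errors without touching the test set. A sketch
# (y_train is the one-hot matrix, so accuracy here is strict subset accuracy):
from sklearn.metrics import accuracy_score
y_train_oof = cross_val_predict(rand_for, X_train, y_train, cv=3)
print('Out-of-fold subset accuracy:', accuracy_score(y_train, y_train_oof))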
# Performing a grid search to see if we can achieve a better model.
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [5, 6, 7], 'bootstrap': [False],
              'max_features': [None]}
rand_for = RandomForestClassifier()
grid_search = GridSearchCV(rand_for, param_grid, cv=5,
                           scoring='accuracy')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
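# GridSearchCV refits the best configuration on the whole training split by default
# (refit=True), so the tuned model and its score are available directly; the manual
# refit below is kept to match the original flow.
print('Best CV accuracy:', grid_search.best_score_)
best_rand_for = grid_search.best_estimator_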
rand_for = RandomForestClassifier(max_features=None, n_estimators=5, bootstrap=False)
rand_for.fit(X_train, y_train)
# Check the confusion matrix to see if the model is just selecting the most likely
# classes (a sketch follows after the metrics below).
y_pred = rand_for.predict(X_test)
from sklearn.metrics import accuracy_score, precision_score, recall_score
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)
print('Recall: {}, Precision: {}, Accuracy: {}'.format(recall, precision, accuracy))
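# A quick look at the confusion matrix mentioned above: argmax (imported from numpy
# earlier) recovers integer class labels from the one-hot rows, and the row/column
# order follows le.classes_. This is a diagnostic sketch, not part of the original flow.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(argmax(y_test, axis=1), argmax(y_pred, axis=1))
sns.heatmap(cm)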
# What about if we do some feature selection
#from sklearn.feature_selection import SelectFromModel
#print(rand_for.feature_importances_)
#model = SelectFromModel(rand_for, prefit=True)
#X_train_new = model.transform(X_train)
#X_test_new = model.transform(X_test)
#new_rand_for = RandomForestClassifier(max_features=None, n_estimators=5, bootstrap=False)
#new_rand_for.fit(X_train_new, y_train)
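# A runnable version of the commented-out SelectFromModel idea above, kept under new
# names (X_train_sfm, sfm_forest) so it does not interfere with the SelectKBest path
# below; treat it as a sketch using the already-fitted forest's feature importances.
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(rand_for, prefit=True)
X_train_sfm = sfm.transform(X_train)
X_test_sfm = sfm.transform(X_test)
sfm_forest = RandomForestClassifier(max_features=None, n_estimators=5, bootstrap=False)
sfm_forest.fit(X_train_sfm, y_train)
print('SelectFromModel kept {} of {} features'.format(X_train_sfm.shape[1], X_train.shape[1]))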
# Feature selection: keep the k best features by chi-squared score.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
selector = SelectKBest(chi2, k=12).fit(X_train, y_train)
X_train_new = selector.transform(X_train)
X_test_new = selector.transform(X_test)  # reuse the selector fitted on the training set
new_rand_for = RandomForestClassifier(max_features=None, n_estimators=5, bootstrap=False)
new_rand_for.fit(X_train_new, y_train)
y_pred = new_rand_for.predict(X_test_new)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)
print('Recall: {}, Precision: {}, Accuracy: {}'.format(recall, precision, accuracy))
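# Which columns did the chi-squared filter keep? get_support() maps the selector back
# to the original feature names; a quick sanity check rather than part of the modelling.
kept_features = [col for col, keep in zip(x.columns, selector.get_support()) if keep]
print('SelectKBest kept:', kept_features)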