-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
367 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Minimal script that prints a greeting.
print("hello world")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"acc": 0.8621, "f1_score": 0.6489, "precision": 0.7358, "recall": 0.5803} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import json | ||
|
||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
import pandas as pd | ||
from sklearn.metrics import ConfusionMatrixDisplay, roc_curve | ||
|
||
|
||
def plot_confusion_matrix(model, X_test, y_test):
    """Draw the confusion matrix of ``model`` on the test data and save it.

    The plot is built with ``ConfusionMatrixDisplay.from_estimator`` using the
    ``Blues`` colormap and written to ``confusion_matrix.png`` in the current
    working directory.
    """
    ConfusionMatrixDisplay.from_estimator(
        model,
        X_test,
        y_test,
        cmap=plt.cm.Blues,
    )
    plt.savefig("confusion_matrix.png")
|
||
|
||
def save_metrics(metrics):
    """Write ``metrics`` as JSON to ``metrics.json``, replacing any old file."""
    with open("metrics.json", "w") as metrics_file:
        json.dump(metrics, metrics_file)
|
||
|
||
def save_roc_curve(y_test, y_pred_proba):
    """Compute the ROC curve and store its points in ``roc_curve.csv``.

    Parameters:
    y_test: true labels, passed to ``roc_curve``
    y_pred_proba: predicted scores, passed to ``roc_curve``

    The CSV has two float columns, ``fpr`` and ``tpr``, and no index column.
    """
    # The third value returned by roc_curve (the thresholds) is not persisted.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_pred_proba)
    points = np.column_stack([false_pos_rate, true_pos_rate])
    roc_frame = pd.DataFrame(points, columns=["fpr", "tpr"]).astype(float)
    roc_frame.to_csv("roc_curve.csv", index=None)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
import json | ||
import pickle | ||
import xgboost as xgb | ||
from sklearn.preprocessing import StandardScaler | ||
from sklearn.feature_extraction import DictVectorizer | ||
from sklearn.metrics import (accuracy_score, f1_score, | ||
precision_score, recall_score) | ||
from utils_and_constants import PARAMETERS | ||
|
||
def prep_data(X_train):
    """Standardize and vectorize the training features.

    Only ``object`` columns (treated as categorical) and ``float64`` columns
    (treated as numerical) are used; columns of any other dtype are left out
    of the vectorized output.

    Parameters:
    X_train (pd.DataFrame): training features; it is not modified

    Returns:
    tuple: (dense feature matrix from the vectorizer, fitted StandardScaler,
    fitted DictVectorizer)
    """
    categorical_col = X_train.dtypes[X_train.dtypes == 'object'].index.tolist()
    numerical_col = X_train.dtypes[X_train.dtypes == 'float64'].index.tolist()

    # Work on a copy: the caller's frame (often a train_test_split slice)
    # must not be standardized in place as a side effect.
    X_train = X_train.copy()

    scaler = StandardScaler()
    X_train[numerical_col] = scaler.fit_transform(X_train[numerical_col])

    # Build the record dicts once and fit + transform in a single pass.
    records = X_train[categorical_col + numerical_col].to_dict(orient='records')
    vectorizer = DictVectorizer(sparse=False)
    X_train = vectorizer.fit_transform(records)
    return X_train, scaler, vectorizer
|
||
def eval_metrics(y_true, prediction):
    """Score predictions against the true labels.

    Returns:
    dict: keys ``acc``, ``f1_score``, ``precision`` and ``recall`` holding the
    results of the corresponding scikit-learn metric functions
    """
    f1_value = f1_score(y_true, prediction)
    scores = {}
    scores["acc"] = accuracy_score(y_true, prediction)
    scores["f1_score"] = f1_value
    scores["precision"] = precision_score(y_true, prediction)
    scores["recall"] = recall_score(y_true, prediction)
    return scores
|
||
def train_model(X_train, y_train):
    """Fit an XGBoost booster and score it on the data it was trained on.

    Hyper-parameters are read from the JSON file at ``PARAMETERS``;
    ``max_depth`` is cast to ``int`` before training. Features are prepared
    with ``prep_data``.

    Returns:
    tuple: (booster, fitted scaler, fitted vectorizer, training-set metrics
    dict computed with a 0.5 threshold)
    """
    with open(PARAMETERS, 'r') as params_file:
        params = json.load(params_file)
    params['max_depth'] = int(params['max_depth'])

    features, scaler, vectorizer = prep_data(X_train)
    dtrain = xgb.DMatrix(features, label=y_train)

    # NOTE(review): the watchlist entry labelled 'validation' is the training
    # matrix itself, so early stopping watches the training metric - confirm
    # this is intended.
    booster = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=1000,
        evals=[(dtrain, 'validation')],
        early_stopping_rounds=200,
    )

    train_scores = booster.predict(dtrain)
    train_labels = (train_scores >= 0.5).astype('int')
    train_metrics = eval_metrics(y_train, train_labels)

    return booster, scaler, vectorizer, train_metrics
|
||
|
||
def evaluate_model(model, scaler, vectorizer, X_test,
                   y_test, float_precision=4):
    """Score a trained booster on held-out data.

    The test features go through the already fitted ``scaler`` and
    ``vectorizer`` (``object`` and ``float64`` columns only) before prediction.

    Parameters:
    model: trained booster exposing ``predict`` on an ``xgb.DMatrix``
    scaler: fitted scaler applied to the float64 columns
    vectorizer: fitted vectorizer applied to the record dicts
    X_test (pd.DataFrame): test features; it is not modified
    y_test: true labels
    float_precision (int): number of decimals kept in the returned metrics

    Returns:
    tuple: (metrics dict rounded to ``float_precision``, raw booster predictions)
    """
    categorical_col = X_test.dtypes[X_test.dtypes == 'object'].index.tolist()
    numerical_col = X_test.dtypes[X_test.dtypes == 'float64'].index.tolist()

    # Scale a copy so the caller's test frame keeps its raw values.
    X_test = X_test.copy()
    X_test[numerical_col] = scaler.transform(X_test[numerical_col])
    records = X_test[categorical_col + numerical_col].to_dict(orient='records')
    dtest = xgb.DMatrix(vectorizer.transform(records))

    y_proba = model.predict(dtest)
    # Predictions at or above 0.5 count as the positive class.
    prediction = (y_proba >= 0.5).astype('int')
    metrics = eval_metrics(y_test, prediction)

    # Round directly instead of round-tripping through JSON text.
    metrics = {
        name: round(float(value), float_precision)
        for name, value in metrics.items()
    }
    return metrics, y_proba
|
||
def save_model(model, scaler, vectorizer):
    """Pickle ``[model, scaler, vectorizer]`` to ``../models/model.pkl``.

    The path is relative to the current working directory. A confirmation
    message is printed once the file has been written.
    """
    bundle = [model, scaler, vectorizer]
    with open('../models/model.pkl', 'wb') as model_file:
        pickle.dump(bundle, model_file)
    print("Model saved successfully!")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
from typing import List | ||
|
||
import pandas as pd | ||
from sklearn.impute import SimpleImputer | ||
|
||
from utils_and_constants import ( | ||
DROP_COLNAMES, | ||
PROCESSED_DATASET, | ||
RAW_DATASET, | ||
TARGET_COLUMN, | ||
) | ||
|
||
|
||
def read_dataset(
    filename: str, drop_columns: List[str], target_column: str
) -> pd.DataFrame:
    """
    Reads the raw data file and returns a normalized pandas dataframe

    Column names and all text values are lower-cased with spaces replaced by
    underscores. The target column is then mapped from yes/no to 1/0; any
    other target value (including missing ones) becomes NaN.

    Parameters:
    filename (str): raw data filename
    drop_columns (List[str]): column names, spelled as in the raw file, that will be dropped
    target_column (str): name of target column after normalization (lower-case, underscores)
    Returns:
    pd.DataFrame: normalized dataframe with a binary-encoded target column
    """
    df = pd.read_csv(filename).drop(columns=drop_columns)
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    # Detect text columns through the dtype helpers instead of comparing with
    # 'object', so columns read with a dedicated string dtype are normalized too.
    text_cols = [
        col
        for col, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    ]

    for col in text_cols:
        df[col] = df[col].str.lower().str.replace(' ', '_')

    # Runs after the loop: the target values must already be lower-cased.
    df[target_column] = df[target_column].map({"yes": 1, "no": 0})

    return df
|
||
|
||
# def target_encode_categorical_features( | ||
# df: pd.DataFrame, categorical_columns: List[str], target_column: str | ||
# ) -> pd.DataFrame: | ||
# """ | ||
# Target encodes the categorical features of the dataframe | ||
|
||
# Parameters: | ||
# df (pd.Dataframe): Pandas dataframe containing features and targets | ||
# categorical_columns (List[str]): categorical column names that will be target encoded | ||
# target_column (str): name of target column | ||
|
||
# Returns: | ||
# pd.Dataframe: Target encoded dataframe | ||
# """ | ||
# encoded_data = df.copy() | ||
|
||
# # Iterate through categorical columns | ||
# for col in categorical_columns: | ||
# # Calculate mean target value for each category | ||
# encoding_map = df.groupby(col)[target_column].mean().to_dict() | ||
|
||
# # Apply target encoding | ||
# encoded_data[col] = encoded_data[col].map(encoding_map) | ||
|
||
# return encoded_data | ||
|
||
|
||
|
||
def impute_data(df_features: pd.DataFrame) -> pd.DataFrame:
    """
    Fills missing values in the given dataframe and returns it
    Non-object columns other than TARGET_COLUMN are imputed with the column
    mean; object columns together with TARGET_COLUMN are imputed with the
    most frequent value. The dataframe is modified in place.
    Parameters:
    df_features (pd.DataFrame): dataframe holding the features and TARGET_COLUMN
    Returns:
    pd.DataFrame: the same dataframe with imputed values
    """
    mean_cols = df_features.select_dtypes(exclude=['object']).columns.tolist()
    # remove() raises ValueError when TARGET_COLUMN is not a non-object column
    mean_cols.remove(TARGET_COLUMN)
    mode_cols = (
        df_features.select_dtypes(include=['object']).columns.tolist()
        + [TARGET_COLUMN]
    )

    mean_imputer = SimpleImputer(strategy="mean")
    mode_imputer = SimpleImputer(strategy="most_frequent")

    df_features[mean_cols] = mean_imputer.fit_transform(
        df_features[mean_cols].values
    )
    df_features[mode_cols] = mode_imputer.fit_transform(
        df_features[mode_cols].values
    )

    return df_features
|
||
|
||
def main():
    """Read the raw dataset, impute missing values and write the processed CSV."""
    raw_frame = read_dataset(
        filename=RAW_DATASET,
        drop_columns=DROP_COLNAMES,
        target_column=TARGET_COLUMN,
    )

    # Written without an index column
    processed_frame = impute_data(raw_frame)
    processed_frame.to_csv(PROCESSED_DATASET, index=None)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
import pandas as pd | ||
import numpy as np | ||
import pickle | ||
import xgboost as xgb | ||
import streamlit as st | ||
|
||
# Loading model | ||
# Pickled artifact expected to unpack into (model, scaler, vectorizer)
path = '../models/model.pkl'

# NOTE(review): unpickling can execute arbitrary code - only load model
# files that come from a trusted source.
with open(path, 'rb') as f:
    model, scaler, vectorizer = pickle.load(f)

# Inputs that are passed through the scaler before vectorizing
numerical_col = [
    'mintemp',
    'maxtemp',
    'rainfall',
    'evaporation',
    'sunshine',
    'windgustspeed',
    'windspeed9am',
    'windspeed3pm',
    'humidity9am',
    'humidity3pm',
    'pressure9am',
    'pressure3pm',
    'cloud9am',
    'cloud3pm',
    'temp9am',
    'temp3pm',
]
|
||
def prediction(data):
    """Run the loaded model on a single observation.

    The values of the ``numerical_col`` keys are scaled with the module-level
    ``scaler``, the resulting dict is vectorized with ``vectorizer`` and the
    matrix is passed to ``model.predict``.

    Parameters:
    data (dict): feature name -> raw value; it is not modified

    Returns:
    The array returned by ``model.predict`` for this one observation.

    Raises:
    KeyError: if ``data`` lacks one of the ``numerical_col`` keys
    """
    # Copy first: scaling the caller's dict in place would double-scale the
    # values if the same dict were ever passed in again.
    features = dict(data)

    raw_values = [features[col] for col in numerical_col]
    scaled_values = scaler.transform([raw_values])[0]
    features.update(zip(numerical_col, scaled_values))

    matrix = xgb.DMatrix(vectorizer.transform(features))
    return model.predict(matrix)
def main():
    """Render the Streamlit input form and show the forecast on request.

    Every widget value is collected into one dict keyed by feature name; when
    the 'Predict' button is pressed the dict is passed to ``prediction`` and
    the result is compared against 0.5 to pick the message shown.
    """
    st.title('Rainfall Prediction API')

    location_options = (
        'albury', 'badgeryscreek', 'cobar', 'coffsharbour', 'moree',
        'newcastle', 'norahhead', 'norfolkisland', 'penrith', 'richmond',
        'sydney', 'sydneyairport', 'waggawagga', 'williamtown',
        'wollongong', 'canberra', 'tuggeranong', 'mountginini', 'ballarat',
        'bendigo', 'sale', 'melbourneairport', 'melbourne', 'mildura',
        'nhil', 'portland', 'watsonia', 'dartmoor', 'brisbane', 'cairns',
        'goldcoast', 'townsville', 'adelaide', 'mountgambier', 'nuriootpa',
        'woomera', 'albany', 'witchcliffe', 'pearceraaf', 'perthairport',
        'perth', 'salmongums', 'walpole', 'hobart', 'launceston',
        'alicesprings', 'darwin', 'katherine', 'uluru',
    )
    gust_dir_options = (
        'w', 'wnw', 'wsw', 'ne', 'nnw', 'n', 'nne', 'sw', 'ene',
        'sse', 's', 'nw', 'se', 'ese', 'e', 'ssw',
    )
    dir_9am_options = (
        'w', 'nnw', 'se', 'ene', 'sw', 'sse', 's', 'ne', 'n',
        'ssw', 'wsw', 'ese', 'e', 'nw', 'wnw', 'nne',
    )
    dir_3pm_options = (
        'wnw', 'wsw', 'e', 'nw', 'w', 'sse', 'ese', 'ene', 'nnw',
        'ssw', 'sw', 'se', 'n', 's', 'nne', 'ne',
    )

    # Widgets are created in the order they should appear on the page.
    location = st.selectbox('Location ', location_options)

    mintemp = st.number_input('Minimum Temperature')
    maxtemp = st.number_input('Maximum Temperature')
    rainfall = st.number_input('Rainfall Intensity')
    evaporation = st.number_input('Evaporation Level')
    sunshine = st.number_input('Sunshine Intensity')

    windgustdir = st.selectbox('Wind Direction', gust_dir_options)

    windgustspeed = st.number_input('Wind Speed')
    winddir9am = st.selectbox('9am Wind Direction', dir_9am_options)
    winddir3pm = st.selectbox('3pm Wind Direction', dir_3pm_options)

    windspeed9am = st.number_input('9am Wind Speed')
    windspeed3pm = st.number_input('3pm Wind Speed')
    humidity9am = st.number_input('9am Humidity Level')
    humidity3pm = st.number_input('3pm Humidity Level')
    pressure9am = st.number_input('9am Pressure Level')
    pressure3pm = st.number_input('3pm Pressure Level')
    cloud9am = st.number_input('9am Cloud Level')
    cloud3pm = st.number_input('3pm Cloud Level')
    temp9am = st.number_input('9am Temperatue Level')
    temp3pm = st.number_input('3pm Temperatue Level')
    raintoday = st.selectbox('Did it rain today', ('no', 'yes'))

    # Feature name -> widget value, in the same key order as before.
    observation = {
        'location': location,
        'mintemp': mintemp,
        'maxtemp': maxtemp,
        'rainfall': rainfall,
        'evaporation': evaporation,
        'sunshine': sunshine,
        'windgustdir': windgustdir,
        'windgustspeed': windgustspeed,
        'winddir9am': winddir9am,
        'winddir3pm': winddir3pm,
        'windspeed9am': windspeed9am,
        'windspeed3pm': windspeed3pm,
        'humidity9am': humidity9am,
        'humidity3pm': humidity3pm,
        'pressure9am': pressure9am,
        'pressure3pm': pressure3pm,
        'cloud9am': cloud9am,
        'cloud3pm': cloud3pm,
        'temp9am': temp9am,
        'temp3pm': temp3pm,
        'raintoday': raintoday,
    }

    if st.button('Predict'):
        pred = prediction(observation)
        output = (
            "There won't be rain tomorrow"
            if pred < 0.5
            else "It will rain tomorrow"
        )
        st.success(output)


if __name__ == "__main__":
    main()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# Importing Libraries | ||
print("Importing Libraries") | ||
import json | ||
import pandas as pd | ||
from sklearn.model_selection import train_test_split | ||
|
||
from model import evaluate_model, train_model, save_model | ||
from metrics import save_metrics, save_roc_curve | ||
from utils_and_constants import PROCESSED_DATASET, TARGET_COLUMN | ||
|
||
|
||
def load_data(file_path):
    """Load the CSV at ``file_path`` and split it into features and target.

    Returns:
    tuple: (dataframe without ``TARGET_COLUMN``, the ``TARGET_COLUMN`` series)
    """
    frame = pd.read_csv(file_path)
    features = frame.drop(TARGET_COLUMN, axis=1)
    target = frame[TARGET_COLUMN]
    return features, target
|
||
def main():
    """Train on the processed dataset, report test metrics and save artifacts.

    The data is split with ``train_test_split`` (``random_state=1993``). The
    test-set metrics are printed and written with ``save_metrics``, the ROC
    curve with ``save_roc_curve`` and the model bundle with ``save_model``.
    """
    features, target = load_data(PROCESSED_DATASET)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, random_state=1993
    )

    # The training-set metrics returned by train_model are not reported;
    # only the test-set metrics below are kept.
    model, scaler, vectorizer, _ = train_model(X_train, y_train)
    metrics, y_proba = evaluate_model(
        model, scaler, vectorizer, X_test, y_test
    )

    print("====================Test Set Metrics==================")
    print(json.dumps(metrics, indent=2))
    print("======================================================")

    save_metrics(metrics)
    save_roc_curve(y_test, y_proba)
    save_model(model, scaler, vectorizer)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import shutil | ||
from pathlib import Path | ||
|
||
DATASET_TYPES = ["test", "train"] | ||
DROP_COLNAMES = ["Date"] | ||
TARGET_COLUMN = "raintomorrow" | ||
RAW_DATASET = "../raw_data/weatherAUS.csv" | ||
PROCESSED_DATASET = "../processed_data/weatherAUS.csv" | ||
PARAMETERS = "../parameters.json" | ||
|
||
|
||
def delete_and_recreate_dir(path):
    """Delete the directory tree at ``path`` (best effort) and create it again.

    Removal errors reported by ``shutil.rmtree`` - for example when ``path``
    does not exist yet - are ignored, and the directory is then created
    together with any missing parents. Unlike a bare ``except``, this does
    not swallow ``KeyboardInterrupt`` or ``SystemExit``.

    Parameters:
    path: directory to reset (str or path-like)
    """
    shutil.rmtree(path, ignore_errors=True)
    Path(path).mkdir(parents=True, exist_ok=True)
|