script revamping
Godwin-T committed Dec 19, 2023
1 parent fa71fec commit 0aefc4e
Showing 8 changed files with 367 additions and 0 deletions.
1 change: 1 addition & 0 deletions scripts/hello.py
@@ -0,0 +1 @@
print("hellow world")
1 change: 1 addition & 0 deletions scripts/metrics.json
@@ -0,0 +1 @@
{"acc": 0.8621, "f1_score": 0.6489, "precision": 0.7358, "recall": 0.5803}
26 changes: 26 additions & 0 deletions scripts/metrics.py
@@ -0,0 +1,26 @@
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import ConfusionMatrixDisplay, roc_curve


def plot_confusion_matrix(model, X_test, y_test):
    _ = ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, cmap=plt.cm.Blues)
    plt.savefig("confusion_matrix.png")


def save_metrics(metrics):
    with open("metrics.json", "w") as fp:
        json.dump(metrics, fp)


def save_roc_curve(y_test, y_pred_proba):
    # Calculate ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    # Store ROC curve data as a two-column CSV
    cdf = pd.DataFrame(np.column_stack([fpr, tpr]), columns=["fpr", "tpr"]).astype(
        float
    )
    cdf.to_csv("roc_curve.csv", index=False)
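
A minimal usage sketch for these helpers (not part of the commit; a LogisticRegression on synthetic data stands in for the project's real XGBoost model):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from metrics import plot_confusion_matrix, save_metrics, save_roc_curve

# Synthetic stand-in data and classifier (assumptions, not the project's pipeline)
X, y = make_classification(n_samples=200, random_state=0)
clf = LogisticRegression().fit(X, y)

plot_confusion_matrix(clf, X, y)               # writes confusion_matrix.png
save_roc_curve(y, clf.predict_proba(X)[:, 1])  # writes roc_curve.csv
save_metrics({"acc": clf.score(X, y)})         # writes metrics.json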
78 changes: 78 additions & 0 deletions scripts/model.py
@@ -0,0 +1,78 @@
import json
import pickle

import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import (accuracy_score, f1_score,
                             precision_score, recall_score)
from sklearn.preprocessing import StandardScaler

from utils_and_constants import PARAMETERS


def prep_data(X_train):
    # Object columns are treated as categorical, float64 columns as numerical
    categorical_col = X_train.dtypes[X_train.dtypes == 'object'].index.tolist()
    numerical_col = X_train.dtypes[X_train.dtypes == 'float64'].index.tolist()

    # Standardize numerical features
    scaler = StandardScaler()
    X_train[numerical_col] = scaler.fit_transform(X_train[numerical_col])

    # One-hot encode categoricals (numericals pass through) via DictVectorizer
    vectorizer = DictVectorizer(sparse=False)
    records = X_train[categorical_col + numerical_col].to_dict(orient='records')
    vectorizer.fit(records)
    X_train = vectorizer.transform(records)
    return X_train, scaler, vectorizer


def eval_metrics(y_true, prediction):
    metrics = {"acc": accuracy_score(y_true, prediction),
               "f1_score": f1_score(y_true, prediction),
               "precision": precision_score(y_true, prediction),
               "recall": recall_score(y_true, prediction)}
    return metrics


def train_model(X_train, y_train):
    with open(PARAMETERS, 'r') as json_file:
        parameters = json.load(json_file)
    parameters['max_depth'] = int(parameters['max_depth'])

    X_train, scaler, vectorizer = prep_data(X_train)
    train_data = xgb.DMatrix(X_train, label=y_train)

    # Note: early stopping is evaluated on the training data itself here
    booster = xgb.train(params=parameters,
                        dtrain=train_data,
                        num_boost_round=1000,
                        evals=[(train_data, 'validation')],
                        early_stopping_rounds=200)

    # Threshold predicted probabilities at 0.5 to get class labels
    prediction = (booster.predict(train_data) >= 0.5).astype('int')
    metrics = eval_metrics(y_train, prediction)

    return booster, scaler, vectorizer, metrics


def evaluate_model(model, scaler, vectorizer, X_test,
                   y_test, float_precision=4):
    categorical_col = X_test.dtypes[X_test.dtypes == 'object'].index.tolist()
    numerical_col = X_test.dtypes[X_test.dtypes == 'float64'].index.tolist()

    # Apply the transformers fitted on the training set
    X_test[numerical_col] = scaler.transform(X_test[numerical_col])
    X_test = vectorizer.transform(X_test[categorical_col + numerical_col].to_dict(orient='records'))
    X_test = xgb.DMatrix(X_test)

    y_proba = model.predict(X_test)
    prediction = (y_proba >= 0.5).astype('int')
    metrics = eval_metrics(y_test, prediction)

    # Round every float in the metrics dict via a JSON round-trip
    metrics = json.loads(json.dumps(metrics),
                         parse_float=lambda x: round(float(x), float_precision))
    return metrics, y_proba


def save_model(model, scaler, vectorizer):
    model_name = '../models/model.pkl'
    with open(model_name, 'wb') as f:
        pickle.dump([model, scaler, vectorizer], f)
    print("Model saved successfully!")
108 changes: 108 additions & 0 deletions scripts/preprocess_data.py
@@ -0,0 +1,108 @@
from typing import List

import pandas as pd
from sklearn.impute import SimpleImputer

from utils_and_constants import (
    DROP_COLNAMES,
    PROCESSED_DATASET,
    RAW_DATASET,
    TARGET_COLUMN,
)


def read_dataset(
    filename: str, drop_columns: List[str], target_column: str
) -> pd.DataFrame:
    """
    Reads the raw data file and returns a pandas dataframe.
    Target column values are expected in binary format with Yes/No values.

    Parameters:
        filename (str): raw data filename
        drop_columns (List[str]): column names that will be dropped
        target_column (str): name of target column

    Returns:
        pd.DataFrame: Target-encoded dataframe
    """
    df = pd.read_csv(filename).drop(columns=drop_columns)
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    categorical_cols = df.dtypes[df.dtypes == 'object'].index.tolist()

    for col in categorical_cols:
        df[col] = df[col].str.lower().str.replace(' ', '_')
    df[target_column] = df[target_column].map({"yes": 1, "no": 0})

    return df


# def target_encode_categorical_features(
#     df: pd.DataFrame, categorical_columns: List[str], target_column: str
# ) -> pd.DataFrame:
#     """
#     Target encodes the categorical features of the dataframe
#
#     Parameters:
#         df (pd.DataFrame): Pandas dataframe containing features and targets
#         categorical_columns (List[str]): categorical column names that will be target encoded
#         target_column (str): name of target column
#
#     Returns:
#         pd.DataFrame: Target-encoded dataframe
#     """
#     encoded_data = df.copy()
#
#     # Iterate through categorical columns
#     for col in categorical_columns:
#         # Calculate mean target value for each category
#         encoding_map = df.groupby(col)[target_column].mean().to_dict()
#
#         # Apply target encoding
#         encoded_data[col] = encoded_data[col].map(encoding_map)
#
#     return encoded_data


def impute_data(df_features: pd.DataFrame) -> pd.DataFrame:
    """
    Imputes numerical columns with their mean and categorical columns
    with their most frequent value.

    Parameters:
        df_features (pd.DataFrame): dataframe containing features and target

    Returns:
        pd.DataFrame: Imputed dataframe
    """
    numerical_imputer = SimpleImputer(strategy="mean")
    categorical_imputer = SimpleImputer(strategy="most_frequent")

    # The target is excluded from mean imputation and imputed with the mode instead
    numerical_col = df_features.select_dtypes(exclude=['object']).columns.tolist()
    numerical_col.remove(TARGET_COLUMN)
    categorical_col = df_features.select_dtypes(include=['object']).columns.tolist()
    categorical_col.append(TARGET_COLUMN)

    df_features[numerical_col] = numerical_imputer.fit_transform(df_features[numerical_col].values)
    df_features[categorical_col] = categorical_imputer.fit_transform(df_features[categorical_col].values)

    return df_features


def main():
    # Read data
    weather = read_dataset(
        filename=RAW_DATASET, drop_columns=DROP_COLNAMES, target_column=TARGET_COLUMN
    )

    # Impute & write processed dataset
    weather_features_processed = impute_data(weather)
    weather_features_processed.to_csv(PROCESSED_DATASET, index=False)


if __name__ == "__main__":
    main()
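
A tiny illustration of the normalization read_dataset applies (a sketch; the sample column name and values are made up):

import pandas as pd

df = pd.DataFrame({"Rain Tomorrow": ["Yes", "No"]})
df.columns = df.columns.str.lower().str.replace(' ', '_')
print(df.columns.tolist())  # ['rain_tomorrow']
# String values are lowercased the same way, then the target maps yes/no -> 1/0
print(df['rain_tomorrow'].str.lower().map({"yes": 1, "no": 0}).tolist())  # [1, 0]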
98 changes: 98 additions & 0 deletions scripts/streamapp.py
@@ -0,0 +1,98 @@
import pickle

import streamlit as st
import xgboost as xgb

# Loading model (path is relative to the scripts/ directory)
path = '../models/model.pkl'

with open(path, 'rb') as f:
    model, scaler, vectorizer = pickle.load(f)

numerical_col = ['mintemp', 'maxtemp', 'rainfall', 'evaporation', 'sunshine',
                 'windgustspeed', 'windspeed9am', 'windspeed3pm', 'humidity9am',
                 'humidity3pm', 'pressure9am', 'pressure3pm', 'cloud9am',
                 'cloud3pm', 'temp9am', 'temp3pm']


def prediction(data):
    # Scale the numerical inputs with the training-time scaler
    numerical_values = [data[col] for col in numerical_col]
    numerical_values = scaler.transform([numerical_values])[0]
    for col, value in zip(numerical_col, numerical_values):
        data[col] = value
    # Vectorize the full feature dict and predict
    data = vectorizer.transform(data)
    data = xgb.DMatrix(data)
    return model.predict(data)


def main():
    st.title('Rainfall Prediction App')

    # Input variables
    dicts = {}
    location = st.selectbox('Location', ('albury', 'badgeryscreek', 'cobar', 'coffsharbour', 'moree',
                                         'newcastle', 'norahhead', 'norfolkisland', 'penrith', 'richmond',
                                         'sydney', 'sydneyairport', 'waggawagga', 'williamtown',
                                         'wollongong', 'canberra', 'tuggeranong', 'mountginini', 'ballarat',
                                         'bendigo', 'sale', 'melbourneairport', 'melbourne', 'mildura',
                                         'nhil', 'portland', 'watsonia', 'dartmoor', 'brisbane', 'cairns',
                                         'goldcoast', 'townsville', 'adelaide', 'mountgambier', 'nuriootpa',
                                         'woomera', 'albany', 'witchcliffe', 'pearceraaf', 'perthairport',
                                         'perth', 'salmongums', 'walpole', 'hobart', 'launceston',
                                         'alicesprings', 'darwin', 'katherine', 'uluru'))

    mintemp = st.number_input('Minimum Temperature')
    maxtemp = st.number_input('Maximum Temperature')
    rainfall = st.number_input('Rainfall Intensity')
    evaporation = st.number_input('Evaporation Level')
    sunshine = st.number_input('Sunshine Intensity')

    windgustdir = st.selectbox('Wind Gust Direction', ('w', 'wnw', 'wsw', 'ne', 'nnw', 'n', 'nne', 'sw', 'ene',
                                                       'sse', 's', 'nw', 'se', 'ese', 'e', 'ssw'))
    windgustspeed = st.number_input('Wind Gust Speed')
    winddir9am = st.selectbox('9am Wind Direction', ('w', 'nnw', 'se', 'ene', 'sw', 'sse', 's', 'ne', 'n',
                                                     'ssw', 'wsw', 'ese', 'e', 'nw', 'wnw', 'nne'))
    winddir3pm = st.selectbox('3pm Wind Direction', ('wnw', 'wsw', 'e', 'nw', 'w', 'sse', 'ese', 'ene', 'nnw',
                                                     'ssw', 'sw', 'se', 'n', 's', 'nne', 'ne'))

    windspeed9am = st.number_input('9am Wind Speed')
    windspeed3pm = st.number_input('3pm Wind Speed')
    humidity9am = st.number_input('9am Humidity Level')
    humidity3pm = st.number_input('3pm Humidity Level')
    pressure9am = st.number_input('9am Pressure Level')
    pressure3pm = st.number_input('3pm Pressure Level')
    cloud9am = st.number_input('9am Cloud Level')
    cloud3pm = st.number_input('3pm Cloud Level')
    temp9am = st.number_input('9am Temperature Level')
    temp3pm = st.number_input('3pm Temperature Level')
    raintoday = st.selectbox('Did it rain today?', ('no', 'yes'))

    keys = ['location', 'mintemp', 'maxtemp', 'rainfall', 'evaporation', 'sunshine',
            'windgustdir', 'windgustspeed', 'winddir9am', 'winddir3pm',
            'windspeed9am', 'windspeed3pm', 'humidity9am', 'humidity3pm',
            'pressure9am', 'pressure3pm', 'cloud9am', 'cloud3pm', 'temp9am',
            'temp3pm', 'raintoday']

    values = [location, mintemp, maxtemp, rainfall, evaporation, sunshine,
              windgustdir, windgustspeed, winddir9am, winddir3pm, windspeed9am,
              windspeed3pm, humidity9am, humidity3pm, pressure9am, pressure3pm,
              cloud9am, cloud3pm, temp9am, temp3pm, raintoday]

    for key, value in zip(keys, values):
        dicts[key] = value

    if st.button('Predict'):
        # Take the single predicted rain probability
        pred = prediction(dicts)[0]

        if pred < 0.5:
            output = "There won't be rain tomorrow"
        else:
            output = "It will rain tomorrow"

        st.success(output)


if __name__ == "__main__":
    main()
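
To try the app locally, the usual entry point would be `streamlit run streamapp.py`, run from the scripts/ directory so the relative model path ('../models/model.pkl') resolves; the model must have been saved by train.py first.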

36 changes: 36 additions & 0 deletions scripts/train.py
@@ -0,0 +1,36 @@
# Importing libraries
print("Importing Libraries")
import json

import pandas as pd
from sklearn.model_selection import train_test_split

from metrics import save_metrics, save_roc_curve
from model import evaluate_model, save_model, train_model
from utils_and_constants import PROCESSED_DATASET, TARGET_COLUMN


def load_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(TARGET_COLUMN, axis=1)
    y = data[TARGET_COLUMN]
    return X, y


def main():
    X, y = load_data(PROCESSED_DATASET)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1993)

    model, scaler, vectorizer, metrics = train_model(X_train, y_train)
    metrics, y_proba = evaluate_model(model, scaler,
                                      vectorizer, X_test, y_test)

    print("====================Test Set Metrics==================")
    print(json.dumps(metrics, indent=2))
    print("======================================================")

    save_metrics(metrics)
    save_roc_curve(y_test, y_proba)
    save_model(model, scaler, vectorizer)


if __name__ == "__main__":
    main()
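
Assuming the relative paths in utils_and_constants.py, the intended order appears to be `python preprocess_data.py` (writes ../processed_data/weatherAUS.csv) followed by `python train.py`, both run from scripts/; ../parameters.json must exist for training to load its hyperparameters.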
19 changes: 19 additions & 0 deletions scripts/utils_and_constants.py
@@ -0,0 +1,19 @@
import shutil
from pathlib import Path

DATASET_TYPES = ["test", "train"]
DROP_COLNAMES = ["Date"]
TARGET_COLUMN = "raintomorrow"
RAW_DATASET = "../raw_data/weatherAUS.csv"
PROCESSED_DATASET = "../processed_data/weatherAUS.csv"
PARAMETERS = "../parameters.json"


def delete_and_recreate_dir(path):
    # Remove the directory if it exists, then recreate it empty
    try:
        shutil.rmtree(path)
    except FileNotFoundError:
        pass
    finally:
        Path(path).mkdir(parents=True, exist_ok=True)
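
A hypothetical use of this helper — resetting an output directory before regenerating artifacts (the directory name here is only an example):

from utils_and_constants import delete_and_recreate_dir

delete_and_recreate_dir("../processed_data")  # directory now exists and is empty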
