script revamping
Godwin-T committed Dec 19, 2023
1 parent fa71fec commit 0aefc4e
Showing 8 changed files with 367 additions and 0 deletions.
1 change: 1 addition & 0 deletions scripts/hello.py
@@ -0,0 +1 @@
print("hellow world")
1 change: 1 addition & 0 deletions scripts/metrics.json
@@ -0,0 +1 @@
{"acc": 0.8621, "f1_score": 0.6489, "precision": 0.7358, "recall": 0.5803}
26 changes: 26 additions & 0 deletions scripts/metrics.py
@@ -0,0 +1,26 @@
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import ConfusionMatrixDisplay, roc_curve


def plot_confusion_matrix(model, X_test, y_test):
    _ = ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, cmap=plt.cm.Blues)
    plt.savefig("confusion_matrix.png")


def save_metrics(metrics):
    with open("metrics.json", "w") as fp:
        json.dump(metrics, fp)


def save_roc_curve(y_test, y_pred_proba):
    # Calculate ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    # Store ROC curve data as a two-column CSV
    cdf = pd.DataFrame(np.column_stack([fpr, tpr]), columns=["fpr", "tpr"]).astype(
        float
    )
    cdf.to_csv("roc_curve.csv", index=False)
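
A minimal usage sketch for these helpers (not part of the commit; a LogisticRegression on synthetic data stands in for the project's real XGBoost model):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from metrics import plot_confusion_matrix, save_metrics, save_roc_curve

# Synthetic stand-in data and classifier (assumptions, not the project's pipeline)
X, y = make_classification(n_samples=200, random_state=0)
clf = LogisticRegression().fit(X, y)

plot_confusion_matrix(clf, X, y)               # writes confusion_matrix.png
save_roc_curve(y, clf.predict_proba(X)[:, 1])  # writes roc_curve.csv
save_metrics({"acc": clf.score(X, y)})         # writes metrics.json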
78 changes: 78 additions & 0 deletions scripts/model.py
@@ -0,0 +1,78 @@
import json
import pickle

import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import (accuracy_score, f1_score,
                             precision_score, recall_score)
from sklearn.preprocessing import StandardScaler

from utils_and_constants import PARAMETERS


def prep_data(X_train):
    # Object columns are treated as categorical, float64 columns as numerical
    categorical_col = X_train.dtypes[X_train.dtypes == 'object'].index.tolist()
    numerical_col = X_train.dtypes[X_train.dtypes == 'float64'].index.tolist()

    # Standardize numerical features
    scaler = StandardScaler()
    X_train[numerical_col] = scaler.fit_transform(X_train[numerical_col])

    # One-hot encode categoricals (numericals pass through) via DictVectorizer
    vectorizer = DictVectorizer(sparse=False)
    records = X_train[categorical_col + numerical_col].to_dict(orient='records')
    vectorizer.fit(records)
    X_train = vectorizer.transform(records)
    return X_train, scaler, vectorizer


def eval_metrics(y_true, prediction):
    metrics = {"acc": accuracy_score(y_true, prediction),
               "f1_score": f1_score(y_true, prediction),
               "precision": precision_score(y_true, prediction),
               "recall": recall_score(y_true, prediction)}
    return metrics


def train_model(X_train, y_train):
    with open(PARAMETERS, 'r') as json_file:
        parameters = json.load(json_file)
    parameters['max_depth'] = int(parameters['max_depth'])

    X_train, scaler, vectorizer = prep_data(X_train)
    train_data = xgb.DMatrix(X_train, label=y_train)

    # Note: early stopping is evaluated on the training data itself here
    booster = xgb.train(params=parameters,
                        dtrain=train_data,
                        num_boost_round=1000,
                        evals=[(train_data, 'validation')],
                        early_stopping_rounds=200)

    # Threshold predicted probabilities at 0.5 to get class labels
    prediction = (booster.predict(train_data) >= 0.5).astype('int')
    metrics = eval_metrics(y_train, prediction)

    return booster, scaler, vectorizer, metrics


def evaluate_model(model, scaler, vectorizer, X_test,
                   y_test, float_precision=4):
    categorical_col = X_test.dtypes[X_test.dtypes == 'object'].index.tolist()
    numerical_col = X_test.dtypes[X_test.dtypes == 'float64'].index.tolist()

    # Apply the transformers fitted on the training set
    X_test[numerical_col] = scaler.transform(X_test[numerical_col])
    X_test = vectorizer.transform(X_test[categorical_col + numerical_col].to_dict(orient='records'))
    X_test = xgb.DMatrix(X_test)

    y_proba = model.predict(X_test)
    prediction = (y_proba >= 0.5).astype('int')
    metrics = eval_metrics(y_test, prediction)

    # Round every float in the metrics dict via a JSON round-trip
    metrics = json.loads(json.dumps(metrics),
                         parse_float=lambda x: round(float(x), float_precision))
    return metrics, y_proba


def save_model(model, scaler, vectorizer):
    model_name = '../models/model.pkl'
    with open(model_name, 'wb') as f:
        pickle.dump([model, scaler, vectorizer], f)
    print("Model saved successfully!")
108 changes: 108 additions & 0 deletions scripts/preprocess_data.py
@@ -0,0 +1,108 @@
from typing import List

import pandas as pd
from sklearn.impute import SimpleImputer

from utils_and_constants import (
    DROP_COLNAMES,
    PROCESSED_DATASET,
    RAW_DATASET,
    TARGET_COLUMN,
)


def read_dataset(
    filename: str, drop_columns: List[str], target_column: str
) -> pd.DataFrame:
    """
    Reads the raw data file and returns a pandas dataframe.
    Target column values are expected in binary format with Yes/No values.

    Parameters:
        filename (str): raw data filename
        drop_columns (List[str]): column names that will be dropped
        target_column (str): name of target column

    Returns:
        pd.DataFrame: Target-encoded dataframe
    """
    df = pd.read_csv(filename).drop(columns=drop_columns)
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    categorical_cols = df.dtypes[df.dtypes == 'object'].index.tolist()

    for col in categorical_cols:
        df[col] = df[col].str.lower().str.replace(' ', '_')
    df[target_column] = df[target_column].map({"yes": 1, "no": 0})

    return df


# def target_encode_categorical_features(
#     df: pd.DataFrame, categorical_columns: List[str], target_column: str
# ) -> pd.DataFrame:
#     """
#     Target encodes the categorical features of the dataframe
#
#     Parameters:
#         df (pd.DataFrame): Pandas dataframe containing features and targets
#         categorical_columns (List[str]): categorical column names that will be target encoded
#         target_column (str): name of target column
#
#     Returns:
#         pd.DataFrame: Target-encoded dataframe
#     """
#     encoded_data = df.copy()
#
#     # Iterate through categorical columns
#     for col in categorical_columns:
#         # Calculate mean target value for each category
#         encoding_map = df.groupby(col)[target_column].mean().to_dict()
#
#         # Apply target encoding
#         encoded_data[col] = encoded_data[col].map(encoding_map)
#
#     return encoded_data


def impute_data(df_features: pd.DataFrame) -> pd.DataFrame:
    """
    Imputes numerical columns with their mean and categorical columns
    with their most frequent value.

    Parameters:
        df_features (pd.DataFrame): dataframe containing features and target

    Returns:
        pd.DataFrame: Imputed dataframe
    """
    numerical_imputer = SimpleImputer(strategy="mean")
    categorical_imputer = SimpleImputer(strategy="most_frequent")

    # The target is excluded from mean imputation and imputed with the mode instead
    numerical_col = df_features.select_dtypes(exclude=['object']).columns.tolist()
    numerical_col.remove(TARGET_COLUMN)
    categorical_col = df_features.select_dtypes(include=['object']).columns.tolist()
    categorical_col.append(TARGET_COLUMN)

    df_features[numerical_col] = numerical_imputer.fit_transform(df_features[numerical_col].values)
    df_features[categorical_col] = categorical_imputer.fit_transform(df_features[categorical_col].values)

    return df_features


def main():
    # Read data
    weather = read_dataset(
        filename=RAW_DATASET, drop_columns=DROP_COLNAMES, target_column=TARGET_COLUMN
    )

    # Impute & write processed dataset
    weather_features_processed = impute_data(weather)
    weather_features_processed.to_csv(PROCESSED_DATASET, index=False)


if __name__ == "__main__":
    main()
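
A tiny illustration of the normalization read_dataset applies (a sketch; the sample column name and values are made up):

import pandas as pd

df = pd.DataFrame({"Rain Tomorrow": ["Yes", "No"]})
df.columns = df.columns.str.lower().str.replace(' ', '_')
print(df.columns.tolist())  # ['rain_tomorrow']
# String values are lowercased the same way, then the target maps yes/no -> 1/0
print(df['rain_tomorrow'].str.lower().map({"yes": 1, "no": 0}).tolist())  # [1, 0]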
98 changes: 98 additions & 0 deletions scripts/streamapp.py
@@ -0,0 +1,98 @@
import pickle

import streamlit as st
import xgboost as xgb

# Loading model (path is relative to the scripts/ directory)
path = '../models/model.pkl'

with open(path, 'rb') as f:
    model, scaler, vectorizer = pickle.load(f)

numerical_col = ['mintemp', 'maxtemp', 'rainfall', 'evaporation', 'sunshine',
                 'windgustspeed', 'windspeed9am', 'windspeed3pm', 'humidity9am',
                 'humidity3pm', 'pressure9am', 'pressure3pm', 'cloud9am',
                 'cloud3pm', 'temp9am', 'temp3pm']


def prediction(data):
    # Scale the numerical inputs with the training-time scaler
    numerical_values = [data[col] for col in numerical_col]
    numerical_values = scaler.transform([numerical_values])[0]
    for col, value in zip(numerical_col, numerical_values):
        data[col] = value
    # Vectorize the full feature dict and predict
    data = vectorizer.transform(data)
    data = xgb.DMatrix(data)
    return model.predict(data)


def main():
    st.title('Rainfall Prediction App')

    # Input variables
    dicts = {}
    location = st.selectbox('Location', ('albury', 'badgeryscreek', 'cobar', 'coffsharbour', 'moree',
                                         'newcastle', 'norahhead', 'norfolkisland', 'penrith', 'richmond',
                                         'sydney', 'sydneyairport', 'waggawagga', 'williamtown',
                                         'wollongong', 'canberra', 'tuggeranong', 'mountginini', 'ballarat',
                                         'bendigo', 'sale', 'melbourneairport', 'melbourne', 'mildura',
                                         'nhil', 'portland', 'watsonia', 'dartmoor', 'brisbane', 'cairns',
                                         'goldcoast', 'townsville', 'adelaide', 'mountgambier', 'nuriootpa',
                                         'woomera', 'albany', 'witchcliffe', 'pearceraaf', 'perthairport',
                                         'perth', 'salmongums', 'walpole', 'hobart', 'launceston',
                                         'alicesprings', 'darwin', 'katherine', 'uluru'))

    mintemp = st.number_input('Minimum Temperature')
    maxtemp = st.number_input('Maximum Temperature')
    rainfall = st.number_input('Rainfall Intensity')
    evaporation = st.number_input('Evaporation Level')
    sunshine = st.number_input('Sunshine Intensity')

    windgustdir = st.selectbox('Wind Gust Direction', ('w', 'wnw', 'wsw', 'ne', 'nnw', 'n', 'nne', 'sw', 'ene',
                                                       'sse', 's', 'nw', 'se', 'ese', 'e', 'ssw'))
    windgustspeed = st.number_input('Wind Gust Speed')
    winddir9am = st.selectbox('9am Wind Direction', ('w', 'nnw', 'se', 'ene', 'sw', 'sse', 's', 'ne', 'n',
                                                     'ssw', 'wsw', 'ese', 'e', 'nw', 'wnw', 'nne'))
    winddir3pm = st.selectbox('3pm Wind Direction', ('wnw', 'wsw', 'e', 'nw', 'w', 'sse', 'ese', 'ene', 'nnw',
                                                     'ssw', 'sw', 'se', 'n', 's', 'nne', 'ne'))

    windspeed9am = st.number_input('9am Wind Speed')
    windspeed3pm = st.number_input('3pm Wind Speed')
    humidity9am = st.number_input('9am Humidity Level')
    humidity3pm = st.number_input('3pm Humidity Level')
    pressure9am = st.number_input('9am Pressure Level')
    pressure3pm = st.number_input('3pm Pressure Level')
    cloud9am = st.number_input('9am Cloud Level')
    cloud3pm = st.number_input('3pm Cloud Level')
    temp9am = st.number_input('9am Temperature Level')
    temp3pm = st.number_input('3pm Temperature Level')
    raintoday = st.selectbox('Did it rain today?', ('no', 'yes'))

    keys = ['location', 'mintemp', 'maxtemp', 'rainfall', 'evaporation', 'sunshine',
            'windgustdir', 'windgustspeed', 'winddir9am', 'winddir3pm',
            'windspeed9am', 'windspeed3pm', 'humidity9am', 'humidity3pm',
            'pressure9am', 'pressure3pm', 'cloud9am', 'cloud3pm', 'temp9am',
            'temp3pm', 'raintoday']

    values = [location, mintemp, maxtemp, rainfall, evaporation, sunshine,
              windgustdir, windgustspeed, winddir9am, winddir3pm, windspeed9am,
              windspeed3pm, humidity9am, humidity3pm, pressure9am, pressure3pm,
              cloud9am, cloud3pm, temp9am, temp3pm, raintoday]

    for key, value in zip(keys, values):
        dicts[key] = value

    if st.button('Predict'):
        # Take the single predicted rain probability
        pred = prediction(dicts)[0]

        if pred < 0.5:
            output = "There won't be rain tomorrow"
        else:
            output = "It will rain tomorrow"

        st.success(output)


if __name__ == "__main__":
    main()
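
To try the app locally, the usual entry point would be `streamlit run streamapp.py`, run from the scripts/ directory so the relative model path ('../models/model.pkl') resolves; the model must have been saved by train.py first.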

36 changes: 36 additions & 0 deletions scripts/train.py
@@ -0,0 +1,36 @@
# Importing libraries
print("Importing Libraries")
import json

import pandas as pd
from sklearn.model_selection import train_test_split

from metrics import save_metrics, save_roc_curve
from model import evaluate_model, save_model, train_model
from utils_and_constants import PROCESSED_DATASET, TARGET_COLUMN


def load_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(TARGET_COLUMN, axis=1)
    y = data[TARGET_COLUMN]
    return X, y


def main():
    X, y = load_data(PROCESSED_DATASET)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1993)

    model, scaler, vectorizer, metrics = train_model(X_train, y_train)
    metrics, y_proba = evaluate_model(model, scaler,
                                      vectorizer, X_test, y_test)

    print("====================Test Set Metrics==================")
    print(json.dumps(metrics, indent=2))
    print("======================================================")

    save_metrics(metrics)
    save_roc_curve(y_test, y_proba)
    save_model(model, scaler, vectorizer)


if __name__ == "__main__":
    main()
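
Assuming the relative paths in utils_and_constants.py, the intended order appears to be `python preprocess_data.py` (writes ../processed_data/weatherAUS.csv) followed by `python train.py`, both run from scripts/; ../parameters.json must exist for training to load its hyperparameters.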
19 changes: 19 additions & 0 deletions scripts/utils_and_constants.py
@@ -0,0 +1,19 @@
import shutil
from pathlib import Path

DATASET_TYPES = ["test", "train"]
DROP_COLNAMES = ["Date"]
TARGET_COLUMN = "raintomorrow"
RAW_DATASET = "../raw_data/weatherAUS.csv"
PROCESSED_DATASET = "../processed_data/weatherAUS.csv"
PARAMETERS = "../parameters.json"


def delete_and_recreate_dir(path):
    # Remove the directory if it exists, then recreate it empty
    try:
        shutil.rmtree(path)
    except FileNotFoundError:
        pass
    finally:
        Path(path).mkdir(parents=True, exist_ok=True)
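
A hypothetical use of this helper — resetting an output directory before regenerating artifacts (the directory name here is only an example):

from utils_and_constants import delete_and_recreate_dir

delete_and_recreate_dir("../processed_data")  # directory now exists and is empty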
