
Commit

Merge branch 'master' into Insurance_Cost_Predictor
yashasvini121 authored Oct 26, 2024
2 parents 47a4c78 + a6cf6f8 commit 3a9e7b4
Showing 8 changed files with 408 additions and 141 deletions.
342 changes: 206 additions & 136 deletions App.py

Large diffs are not rendered by default.

Binary file added assets/images/machine_learning.png
196 changes: 196 additions & 0 deletions models/house_price/ImprovedModel.py
@@ -0,0 +1,196 @@
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import warnings
import pickle
from .ModelEvaluation import ModelEvaluation
import os
import logging
import streamlit as st
import numpy as np
warnings.filterwarnings("ignore")

# Define the directory for logs
log_directory = 'models/house_price/logs'
os.makedirs(log_directory, exist_ok=True) # Create the directory if it doesn't exist

# Set up logging
log_file = os.path.join(log_directory, 'model_training.log')
logging.basicConfig(
filename=log_file,
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)

df = pd.read_csv("models/house_price/data/housing.csv")
original_df = df.copy(deep=True)

# Target and Feature Identification
target = "price"
features = [col for col in df.columns if col != target]

# Separates numerical and categorical features based on unique values
nu = df[features].nunique()
numerical_features = [col for col in features if nu[col] > 16]
categorical_features = [col for col in features if nu[col] <= 16]

# Removing outliers using IQR
def remove_outliers(df, numerical_features):
for feature in numerical_features:
Q1 = df[feature].quantile(0.25)
Q3 = df[feature].quantile(0.75)
IQR = Q3 - Q1
df = df[(df[feature] >= (Q1 - 1.5 * IQR)) & (df[feature] <= (Q3 + 1.5 * IQR))]
return df.reset_index(drop=True)


# Handling missing values
def handle_missing_values(df):
null_summary = df.isnull().sum()
null_percentage = (null_summary / df.shape[0]) * 100
return pd.DataFrame(
{"Total Null Values": null_summary, "Percentage": null_percentage}
).sort_values(by="Percentage", ascending=False)


# Removes outliers from numerical features
df = remove_outliers(df, numerical_features)
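# Caveat: the IQR bounds above are computed on the full dataset before the train/test split,
# so a small amount of information from the eventual test rows leaks into this step.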

# Filters categorical features without missing values
null_value_summary = handle_missing_values(df)
valid_categorical_features = [
col
for col in categorical_features
if col not in null_value_summary[null_value_summary["Percentage"] != 0].index
]

# Encoding categorical features
def encode_categorical_features(df, categorical_features):
for feature in categorical_features:
# Binary encoding for features with 2 unique values
if df[feature].nunique() == 2:
df[feature] = pd.get_dummies(df[feature], drop_first=True, prefix=feature)
# Dummy encoding for features with more than 2 unique values
elif 2 < df[feature].nunique() <= 16:
df = pd.concat(
[
df.drop([feature], axis=1),
pd.get_dummies(df[feature], drop_first=True, prefix=feature),
],
axis=1,
)
return df

df = encode_categorical_features(df, valid_categorical_features)

# Renames columns to avoid invalid characters
df.columns = [col.replace("-", "_").replace(" ", "_") for col in df.columns]

# Splitting the data into training & testing sets
X = df.drop([target], axis=1)
Y = df[target]
Train_X, Test_X, Train_Y, Test_Y = train_test_split(
X, Y, train_size=0.8, test_size=0.2, random_state=100
)

# Feature Scaling (Standardization)
std = StandardScaler()
Train_X_std = pd.DataFrame(std.fit_transform(Train_X), columns=X.columns)
Test_X_std = pd.DataFrame(std.transform(Test_X), columns=X.columns)
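# Note: this fitted scaler is reused by get_predicted() below and persisted via save_scaler(),
# so inference inputs are assumed to carry the same columns, in the same order, as Train_X.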

# Random Forest Algorithm
rf_model = RandomForestRegressor(random_state=42, n_estimators=200, max_depth=8, min_samples_split=12)
rf_model.fit(Train_X_std, Train_Y)


pred_train = rf_model.predict(Train_X_std)
pred_test = rf_model.predict(Test_X_std)

# Calculate RMSE for train and test sets
# (if re-enabled, this also needs: from sklearn.metrics import mean_squared_error)
# train_rmse = np.sqrt(mean_squared_error(Train_Y, pred_train))
# test_rmse = np.sqrt(mean_squared_error(Test_Y, pred_test))


def prepare_input_data(
area,
mainroad,
guestroom,
basement,
hotwaterheating,
airconditioning,
prefarea,
additional_bedrooms,
bathrooms,
stories,
parking,
furnishingstatus,
):
    # Creates a dictionary for the input features; yes/no flags are expected as the strings "Yes"/"No"
input_data = {
"area": [area],
"mainroad": True if mainroad == "Yes" else False,
"guestroom": True if guestroom == "Yes" else False,
"basement": True if basement == "Yes" else False,
"hotwaterheating": True if hotwaterheating == "Yes" else False,
"airconditioning": True if airconditioning == "Yes" else False,
"prefarea": True if prefarea == "Yes" else False,
"bedrooms_2": additional_bedrooms == 2,
"bedrooms_3": additional_bedrooms == 3,
"bedrooms_4": additional_bedrooms == 4,
"bedrooms_5": additional_bedrooms == 5,
"bedrooms_6": additional_bedrooms == 6,
"bathrooms_2": bathrooms == 2,
"bathrooms_3": bathrooms == 3,
"bathrooms_4": bathrooms == 4,
"stories_2": stories == 2,
"stories_3": stories == 3,
"stories_4": stories == 4,
"parking_1": parking == 1,
"parking_2": parking == 2,
"parking_3": parking == 3,
"furnishingstatus_semi_furnished": furnishingstatus == "semi_furnished",
"furnishingstatus_unfurnished": furnishingstatus == "unfurnished",
}

return pd.DataFrame(input_data)
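# The keys built above are assumed to mirror the dummy-column names created during training
# (after the "-"/" " to "_" rename); a mismatch would make std.transform / rf_model.predict fail.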

# Note: Keeping this function because of the warning in the predict.py file


### Final Endpoint ###
def get_predicted(area=0, mainroad=False, guestroom=False, basement=False, hotwaterheating=False,
                  airconditioning=False, prefarea=False, bedrooms=0, bathrooms=2, stories=1, parking=1,
                  furnishingstatus="semi_furnished"):

    input_df = prepare_input_data(area, mainroad, guestroom, basement, hotwaterheating, airconditioning, prefarea,
                                  bedrooms, bathrooms, stories, parking, furnishingstatus)

input_std = pd.DataFrame(std.transform(input_df), columns=input_df.columns)
predicted_price = rf_model.predict(input_std)
return round(predicted_price[0],2)
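# Illustrative call with hypothetical values (yes/no flags passed as the strings that
# prepare_input_data compares against):
#   get_predicted(area=7420, mainroad="Yes", airconditioning="Yes", bedrooms=4,
#                 bathrooms=2, stories=3, parking=2, furnishingstatus="furnished")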

def save_model():
# todo: Ask the user for the model name, and warn that the model will be overwritten

with open("./saved_models/model_02.pkl", "wb") as file:
pickle.dump(rf_model, file)


def save_scaler():
with open("./saved_models/scaler_02.pkl", "wb") as file:
pickle.dump(std, file)


def get_evaluator():
evaluator = ModelEvaluation(rf_model, Train_X_std, Train_Y, Test_X_std, Test_Y)
return evaluator
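# Note: because of the relative import of ModelEvaluation above, this file is expected to be
# run as a module (e.g. python -m models.house_price.ImprovedModel) rather than as a standalone script.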

if __name__ == "__main__":
save_model()
save_scaler()
# model_evaluation()
7 changes: 4 additions & 3 deletions models/house_price/predict.py
@@ -1,6 +1,7 @@
import pickle
import pandas as pd
from models.house_price.model import get_evaluator
# from models.house_price.model import get_evaluator
from models.house_price.ImprovedModel import get_evaluator

"""
Predict.py file:
@@ -102,8 +103,8 @@ def get_prediction(
)

# Load the model and the scaler
model = load_model("models/house_price/saved_models/model_01.pkl")
scaler = load_model("models/house_price/saved_models/scaler_01.pkl")
model = load_model("models/house_price/saved_models/model_02.pkl")
scaler = load_model("models/house_price/saved_models/scaler_02.pkl")

# Scale the input data
input_scaled = scaler.transform(input_df)
Binary file added models/house_price/saved_models/model_02.pkl
Binary file not shown.
Binary file added models/house_price/saved_models/scaler_02.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion page_handler.py
@@ -78,7 +78,7 @@ def render_model_details(self, model_module,tab)
if model_details_function:
metrics, prediction_plot, error_plot, performance_plot = model_details_function().evaluate()

st.subheader(f"Model Accuracy: {metrics['Test_R2']:.2%}")
st.subheader(f"Model Accuracy: {metrics['Test_R2']:.2f}")

#mentioning the title of the scores
st.subheader(f"Scores: Training: {metrics['Train_R2']:.2f}, Testing: {metrics['Test_R2']:.2f}")
2 changes: 1 addition & 1 deletion readme.md
@@ -13,7 +13,7 @@ The project has been successfully tested in local environments, and current effo
2. Or create new issues to discuss new ideas, suggest features, or report bugs.
3. Fork the repository and create a new branch for your contribution.
4. Implement your changes and submit a pull request with a clear description.
5. Futher details can be found in the [contributing.md](contributing.md) file.
5. Further details can be found in the [contributing.md](contributing.md) file.

## Setup Instructions
1. Fork or clone the repository.

0 comments on commit 3a9e7b4
