Skip to content

Commit

Permalink
Change House Price Model to Random Forest Regressor with Regularization
Browse files Browse the repository at this point in the history
  • Loading branch information
yashasvini121 authored Oct 26, 2024
2 parents 33f86d8 + b6234e0 commit a6cf6f8
Show file tree
Hide file tree
Showing 5 changed files with 201 additions and 4 deletions.
196 changes: 196 additions & 0 deletions models/house_price/ImprovedModel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import warnings
import pickle
from .ModelEvaluation import ModelEvaluation
import os
import logging
import streamlit as st
import numpy as np
warnings.filterwarnings("ignore")

# Define the directory for logs
# NOTE: this is a relative path, so it is resolved against the current working directory.
log_directory = 'models/house_price/logs'
os.makedirs(log_directory, exist_ok=True) # Create the directory if it doesn't exist

# Set up logging
# Records at INFO level and above are written to model_training.log inside log_directory.
log_file = os.path.join(log_directory, 'model_training.log')
logging.basicConfig(
filename=log_file,
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)

# Load the housing dataset at import time (path is relative to the working directory).
df = pd.read_csv("models/house_price/data/housing.csv")
# Deep copy of the raw data, taken before df is reassigned by the cleaning steps further down.
original_df = df.copy(deep=True)

# Target and Feature Identification
# Every column other than the target is treated as an input feature.
target = "price"
features = [col for col in df.columns if col != target]

# Separates numerical and categorical features based on unique values
# Heuristic on cardinality only (dtype is not inspected): more than 16 distinct
# values -> numerical, 16 or fewer -> categorical.
nu = df[features].nunique()
numerical_features = [col for col in features if nu[col] > 16]
categorical_features = [col for col in features if nu[col] <= 16]

# Removing outliers using IQR
def remove_outliers(df, numerical_features):
    """Drop rows that fall outside the 1.5 * IQR fences of each listed feature.

    Features are handled one after another, and the quartiles of each feature
    are computed on the rows that survived the features before it. Rows whose
    value is missing for a processed feature are dropped as well, because a
    missing value never satisfies the range check.

    Args:
        df: Input frame; it is not modified.
        numerical_features: Column names to filter on, in processing order.

    Returns:
        A new frame containing the surviving rows with a fresh 0..n-1 index.
    """
    trimmed = df
    for column in numerical_features:
        lower_q, upper_q = (trimmed[column].quantile(q) for q in (0.25, 0.75))
        spread = upper_q - lower_q
        lower_fence = lower_q - 1.5 * spread
        upper_fence = upper_q + 1.5 * spread
        # between() is inclusive on both ends, matching >= / <= on the fences.
        inside = trimmed[column].between(lower_fence, upper_fence)
        trimmed = trimmed[inside]
    return trimmed.reset_index(drop=True)


# Handling missing values
def handle_missing_values(df):
    """Summarise the missing values of every column.

    Despite the name, nothing is filled in or dropped; the function only
    reports on the frame it is given.

    Args:
        df: Frame to inspect.

    Returns:
        A frame indexed by column name with "Total Null Values" (count of
        nulls) and "Percentage" (nulls as a percentage of the row count),
        sorted by "Percentage" in descending order.
    """
    missing_counts = df.isnull().sum()
    row_count = df.shape[0]
    summary = pd.DataFrame(
        {
            "Total Null Values": missing_counts,
            "Percentage": (missing_counts / row_count) * 100,
        }
    )
    return summary.sort_values(by="Percentage", ascending=False)


# Removes outliers from numerical features
# df is rebound to the filtered, re-indexed frame returned by remove_outliers.
df = remove_outliers(df, numerical_features)

# Filters categorical features without missing values
# null_value_summary is indexed by column name; a categorical feature is kept
# only if its null percentage (measured after outlier removal) is exactly 0.
null_value_summary = handle_missing_values(df)
valid_categorical_features = [
col
for col in categorical_features
if col not in null_value_summary[null_value_summary["Percentage"] != 0].index
]

# Encoding categorical features
def encode_categorical_features(df, categorical_features):
    """One-hot encode the given categorical columns and return a new frame.

    The caller's frame is left untouched: all work happens on a copy, so the
    binary and multi-level branches behave the same way with respect to the
    input.

    Encoding rules, decided per feature by its number of distinct non-null
    values:
      * exactly 2  -> the column is replaced in place (same name, same
        position) by the single indicator left after dropping the first level;
      * 3 to 16    -> the column is removed and its indicator columns
        (first level dropped, named "<feature>_<level>") are appended at the
        end of the frame;
      * otherwise  -> the column is left as it is.

    Args:
        df: Frame holding the columns to encode.
        categorical_features: Names of the columns to encode, in order.

    Returns:
        A new DataFrame with the encoded columns.
    """
    encoded = df.copy()
    for feature in categorical_features:
        distinct = encoded[feature].nunique()
        if distinct == 2:
            # Two levels with the first one dropped leaves exactly one
            # indicator column; assign it as a Series under the original name.
            indicator = pd.get_dummies(
                encoded[feature], drop_first=True, prefix=feature
            )
            encoded[feature] = indicator.iloc[:, 0]
        elif 2 < distinct <= 16:
            dummies = pd.get_dummies(
                encoded[feature], drop_first=True, prefix=feature
            )
            encoded = pd.concat(
                [encoded.drop([feature], axis=1), dummies],
                axis=1,
            )
    return encoded

# Encode only the categorical features that passed the missing-value filter.
df = encode_categorical_features(df, valid_categorical_features)

# Renames columns to avoid invalid characters
# Hyphens and spaces in column names (including generated dummy names) become underscores.
df.columns = [col.replace("-", "_").replace(" ", "_") for col in df.columns]

# Splitting the data into training & testing sets
# 80/20 split with a fixed seed so the split is reproducible.
X = df.drop([target], axis=1)
Y = df[target]
Train_X, Test_X, Train_Y, Test_Y = train_test_split(
X, Y, train_size=0.8, test_size=0.2, random_state=100
)

# Feature Scaling (Standardization)
# The scaler is fitted on the training split only; the test split is transformed
# with the statistics learned from the training split.
std = StandardScaler()
Train_X_std = pd.DataFrame(std.fit_transform(Train_X), columns=X.columns)
Test_X_std = pd.DataFrame(std.transform(Test_X), columns=X.columns)

# Random Forest Algorithm
# max_depth and min_samples_split cap how far each tree can grow; random_state fixes the seed.
rf_model = RandomForestRegressor(random_state=42, n_estimators=200, max_depth=8, min_samples_split=12)
rf_model.fit(Train_X_std, Train_Y)


# Predictions on both splits. NOTE(review): within the visible code they are only
# referenced by the commented-out RMSE lines below.
pred_train = rf_model.predict(Train_X_std)
pred_test = rf_model.predict(Test_X_std)

# Calculate RMSE for train and test sets
# NOTE(review): disabled; mean_squared_error is not among the imports visible in this
# file, so it would have to be imported before these lines are re-enabled.
# train_rmse = np.sqrt(mean_squared_error(Train_Y, pred_train))
# test_rmse = np.sqrt(mean_squared_error(Test_Y, pred_test))


def prepare_input_data(
    area,
    mainroad,
    guestroom,
    basement,
    hotwaterheating,
    airconditioning,
    prefarea,
    additional_bedrooms,
    bathrooms,
    stories,
    parking,
    furnishingstatus,
):
    """Build the single-row feature frame for one house.

    The yes/no inputs are accepted either as the string "Yes" or as the
    boolean ``True``; anything else counts as "no". Count-like inputs are
    expanded into one indicator column per level, with the lowest level
    represented by all of its indicators being False.

    NOTE(review): the column names and their order presumably mirror the
    columns produced by the training-time encoding; verify against the
    dataset, since the scaler and model are applied to this frame as is.

    Args:
        area: Plot area, stored unchanged in the "area" column.
        mainroad, guestroom, basement, hotwaterheating, airconditioning,
            prefarea: Yes/no flags ("Yes" or True means yes).
        additional_bedrooms: Bedroom count; fills the "bedrooms_2" ..
            "bedrooms_6" indicators.
        bathrooms: Bathroom count; fills "bathrooms_2" .. "bathrooms_4".
        stories: Storey count; fills "stories_2" .. "stories_4".
        parking: Parking spots; fills "parking_1" .. "parking_3".
        furnishingstatus: "semi_furnished" or "unfurnished" set the matching
            indicator; any other value leaves both False.

    Returns:
        A one-row DataFrame with 23 columns.
    """

    def _is_yes(answer):
        # A real boolean True must count as "yes": comparing it with the
        # string "Yes" would silently turn it into False.
        if isinstance(answer, str):
            return answer == "Yes"
        return answer is True

    # Creates a dictionary for the input features; the one list-valued entry
    # ("area") gives the frame its single row, the scalars broadcast onto it.
    input_data = {
        "area": [area],
        "mainroad": _is_yes(mainroad),
        "guestroom": _is_yes(guestroom),
        "basement": _is_yes(basement),
        "hotwaterheating": _is_yes(hotwaterheating),
        "airconditioning": _is_yes(airconditioning),
        "prefarea": _is_yes(prefarea),
        "bedrooms_2": additional_bedrooms == 2,
        "bedrooms_3": additional_bedrooms == 3,
        "bedrooms_4": additional_bedrooms == 4,
        "bedrooms_5": additional_bedrooms == 5,
        "bedrooms_6": additional_bedrooms == 6,
        "bathrooms_2": bathrooms == 2,
        "bathrooms_3": bathrooms == 3,
        "bathrooms_4": bathrooms == 4,
        "stories_2": stories == 2,
        "stories_3": stories == 3,
        "stories_4": stories == 4,
        "parking_1": parking == 1,
        "parking_2": parking == 2,
        "parking_3": parking == 3,
        "furnishingstatus_semi_furnished": furnishingstatus == "semi_furnished",
        "furnishingstatus_unfurnished": furnishingstatus == "unfurnished",
    }

    return pd.DataFrame(input_data)

# Note: Not removing this function because of the warning in the predict.py file.


### Final Endpoint ###
def get_predicted(area=0, mainroad=False, guestroom=False, basement=False, hotwaterheating=False,
                  airconditioning=False, prefarea=False, bedrooms=0, bathrooms=2, stories=1, parking=1,
                  furnishingstatus="semi_furnished",):
    """Predict the price of one house with the module-level scaler and forest.

    All arguments are forwarded unchanged, in order, to prepare_input_data
    (``bedrooms`` is passed as its bedroom-count argument). The resulting row
    is standardised with ``std`` and scored with ``rf_model``.

    Returns:
        The predicted price rounded to two decimal places.
    """
    raw_features = prepare_input_data(
        area, mainroad, guestroom, basement, hotwaterheating, airconditioning, prefarea,
        bedrooms, bathrooms, stories, parking, furnishingstatus,
    )

    # Re-wrap the scaled array so the model sees the same column names.
    scaled_values = std.transform(raw_features)
    scaled_features = pd.DataFrame(scaled_values, columns=raw_features.columns)

    return round(rf_model.predict(scaled_features)[0], 2)

def save_model():
    """Pickle the trained random forest to saved_models/model_02.pkl.

    The target directory is resolved next to this module rather than against
    the current working directory, so the file lands in the same place no
    matter where the process was started. The directory is created when it
    does not exist yet. An existing file is overwritten without warning.
    """
    # todo: Ask the user for the model name, and warn that the model will be overwritten

    target_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "saved_models")
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, "model_02.pkl"), "wb") as file:
        pickle.dump(rf_model, file)


def save_scaler():
    """Pickle the fitted StandardScaler to saved_models/scaler_02.pkl.

    The target directory is resolved next to this module rather than against
    the current working directory, and is created when it does not exist yet.
    An existing file is overwritten without warning.
    """
    target_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "saved_models")
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, "scaler_02.pkl"), "wb") as file:
        pickle.dump(std, file)


def get_evaluator():
    """Return a ModelEvaluation built from the trained forest and the scaled train/test splits."""
    return ModelEvaluation(rf_model, Train_X_std, Train_Y, Test_X_std, Test_Y)

# Script entry point: when the module is executed directly, persist the
# trained model and the fitted scaler to disk.
if __name__ == "__main__":
    save_model()
    save_scaler()
    # NOTE(review): no model_evaluation function is defined in the visible code.
    # model_evaluation()
7 changes: 4 additions & 3 deletions models/house_price/predict.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pickle
import pandas as pd
from models.house_price.model import get_evaluator
# from models.house_price.model import get_evaluator
from models.house_price.ImprovedModel import get_evaluator

"""
Predict.py file:
Expand Down Expand Up @@ -102,8 +103,8 @@ def get_prediction(
)

# Load the model and the scaler
model = load_model("models/house_price/saved_models/model_01.pkl")
scaler = load_model("models/house_price/saved_models/scaler_01.pkl")
model = load_model("models/house_price/saved_models/model_02.pkl")
scaler = load_model("models/house_price/saved_models/scaler_02.pkl")

# Scale the input data
input_scaled = scaler.transform(input_df)
Expand Down
Binary file added models/house_price/saved_models/model_02.pkl
Binary file not shown.
Binary file added models/house_price/saved_models/scaler_02.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion page_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def render_model_details(self, model_module,tab):
if model_details_function:
metrics, prediction_plot, error_plot, performance_plot = model_details_function().evaluate()

st.subheader(f"Model Accuracy: {metrics['Test_R2']:.2%}")
st.subheader(f"Model Accuracy: {metrics['Test_R2']:.2f}")

#mentioning the title of the scores
st.subheader(f"Scores: Training: {metrics['Train_R2']:.2f}, Testing: {metrics['Test_R2']:.2f}")
Expand Down

0 comments on commit a6cf6f8

Please sign in to comment.