Commit
Change House Price Model to Random Forest Regressor with Regularization
Showing 5 changed files with 201 additions and 4 deletions.
@@ -0,0 +1,196 @@
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import warnings
import pickle
from .ModelEvaluation import ModelEvaluation
import os
import logging
import streamlit as st
import numpy as np

warnings.filterwarnings("ignore")

# Define the directory for logs
log_directory = 'models/house_price/logs'
os.makedirs(log_directory, exist_ok=True)  # Create the directory if it doesn't exist

# Set up logging
log_file = os.path.join(log_directory, 'model_training.log')
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
df = pd.read_csv("models/house_price/data/housing.csv")
original_df = df.copy(deep=True)

# Target and Feature Identification
target = "price"
features = [col for col in df.columns if col != target]

# Separate numerical and categorical features based on unique-value counts
nu = df[features].nunique()
numerical_features = [col for col in features if nu[col] > 16]
categorical_features = [col for col in features if nu[col] <= 16]
# Removing outliers using IQR
def remove_outliers(df, numerical_features):
    for feature in numerical_features:
        Q1 = df[feature].quantile(0.25)
        Q3 = df[feature].quantile(0.75)
        IQR = Q3 - Q1
        df = df[(df[feature] >= (Q1 - 1.5 * IQR)) & (df[feature] <= (Q3 + 1.5 * IQR))]
    return df.reset_index(drop=True)
# Summarize missing values (counts and percentages); this reports nulls but does not impute them
def handle_missing_values(df):
    null_summary = df.isnull().sum()
    null_percentage = (null_summary / df.shape[0]) * 100
    return pd.DataFrame(
        {"Total Null Values": null_summary, "Percentage": null_percentage}
    ).sort_values(by="Percentage", ascending=False)


# Removes outliers from numerical features
df = remove_outliers(df, numerical_features)

# Keeps only categorical features without missing values
null_value_summary = handle_missing_values(df)
valid_categorical_features = [
    col
    for col in categorical_features
    if col not in null_value_summary[null_value_summary["Percentage"] != 0].index
]
# Encoding categorical features
def encode_categorical_features(df, categorical_features):
    for feature in categorical_features:
        # Binary encoding for features with 2 unique values
        if df[feature].nunique() == 2:
            df[feature] = pd.get_dummies(df[feature], drop_first=True, prefix=feature)
        # Dummy encoding for features with more than 2 unique values
        elif 2 < df[feature].nunique() <= 16:
            df = pd.concat(
                [
                    df.drop([feature], axis=1),
                    pd.get_dummies(df[feature], drop_first=True, prefix=feature),
                ],
                axis=1,
            )
    return df


df = encode_categorical_features(df, valid_categorical_features)

# Renames columns to avoid invalid characters
df.columns = [col.replace("-", "_").replace(" ", "_") for col in df.columns]
# Splitting the data into training & testing sets
X = df.drop([target], axis=1)
Y = df[target]
Train_X, Test_X, Train_Y, Test_Y = train_test_split(
    X, Y, train_size=0.8, test_size=0.2, random_state=100
)

# Feature Scaling (Standardization)
std = StandardScaler()
Train_X_std = pd.DataFrame(std.fit_transform(Train_X), columns=X.columns)
Test_X_std = pd.DataFrame(std.transform(Test_X), columns=X.columns)
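# Added note (not in the original commit): Random Forests are largely insensitive to
# feature scaling, so the StandardScaler is presumably kept here so that downstream
# prediction code can reuse the same fitted scaler on user input.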
# Random Forest Regressor, regularized via max_depth and min_samples_split
rf_model = RandomForestRegressor(random_state=42, n_estimators=200, max_depth=8, min_samples_split=12)
rf_model.fit(Train_X_std, Train_Y)

pred_train = rf_model.predict(Train_X_std)
pred_test = rf_model.predict(Test_X_std)

# Calculate RMSE for train and test sets (requires: from sklearn.metrics import mean_squared_error)
# train_rmse = np.sqrt(mean_squared_error(Train_Y, pred_train))
# test_rmse = np.sqrt(mean_squared_error(Test_Y, pred_test))
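# Illustrative sketch (added, not part of the original commit): a cross-validated RMSE
# check using the already-imported cross_val_score; "neg_mean_squared_error" is a
# standard scikit-learn scoring string. Call this manually if a robustness check is wanted.
def log_cv_rmse(cv_folds=5):
    neg_mse_scores = cross_val_score(
        rf_model, Train_X_std, Train_Y, scoring="neg_mean_squared_error", cv=cv_folds
    )
    cv_rmse = np.sqrt(-neg_mse_scores).mean()
    logging.info("Cross-validated RMSE (%d-fold): %.2f", cv_folds, cv_rmse)
    return cv_rmse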
def prepare_input_data(
    area,
    mainroad,
    guestroom,
    basement,
    hotwaterheating,
    airconditioning,
    prefarea,
    additional_bedrooms,
    bathrooms,
    stories,
    parking,
    furnishingstatus,
):
    # Creates a dictionary whose keys mirror the dummy-encoded training columns
    input_data = {
        "area": [area],
        "mainroad": True if mainroad == "Yes" else False,
        "guestroom": True if guestroom == "Yes" else False,
        "basement": True if basement == "Yes" else False,
        "hotwaterheating": True if hotwaterheating == "Yes" else False,
        "airconditioning": True if airconditioning == "Yes" else False,
        "prefarea": True if prefarea == "Yes" else False,
        "bedrooms_2": additional_bedrooms == 2,
        "bedrooms_3": additional_bedrooms == 3,
        "bedrooms_4": additional_bedrooms == 4,
        "bedrooms_5": additional_bedrooms == 5,
        "bedrooms_6": additional_bedrooms == 6,
        "bathrooms_2": bathrooms == 2,
        "bathrooms_3": bathrooms == 3,
        "bathrooms_4": bathrooms == 4,
        "stories_2": stories == 2,
        "stories_3": stories == 3,
        "stories_4": stories == 4,
        "parking_1": parking == 1,
        "parking_2": parking == 2,
        "parking_3": parking == 3,
        "furnishingstatus_semi_furnished": furnishingstatus == "semi_furnished",
        "furnishingstatus_unfurnished": furnishingstatus == "unfurnished",
    }

    return pd.DataFrame(input_data)


# Note: keeping this function because of the warning in the predict.py file
### Final Endpoint ###
def get_predicted(area=0, mainroad=False, guestroom=False, basement=False, hotwaterheating=False,
                  airconditioning=False, prefarea=False, bedrooms=0, bathrooms=2, stories=1, parking=1,
                  furnishingstatus="semi_furnished",):
    # Note: prepare_input_data treats the yes/no flags as true only when they equal the string "Yes"
    input_df = prepare_input_data(area, mainroad, guestroom, basement, hotwaterheating, airconditioning, prefarea,
                                  bedrooms, bathrooms, stories, parking, furnishingstatus)

    input_std = pd.DataFrame(std.transform(input_df), columns=input_df.columns)
    predicted_price = rf_model.predict(input_std)
    return round(predicted_price[0], 2)
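# Example call (illustrative values only, not from the original commit); the yes/no
# arguments are passed as the strings expected by prepare_input_data:
# price = get_predicted(area=7420, mainroad="Yes", guestroom="No", basement="No",
#                       hotwaterheating="No", airconditioning="Yes", prefarea="Yes",
#                       bedrooms=4, bathrooms=2, stories=3, parking=2,
#                       furnishingstatus="furnished")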
def save_model():
    # todo: Ask the user for the model name, and warn that the model will be overwritten
    with open("./saved_models/model_02.pkl", "wb") as file:
        pickle.dump(rf_model, file)


def save_scaler():
    with open("./saved_models/scaler_02.pkl", "wb") as file:
        pickle.dump(std, file)
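# Hedged sketch (added, not in the original commit): how a consumer such as predict.py
# might reload the artifacts written above; the paths simply mirror save_model() and
# save_scaler(), and the actual loading code in predict.py is not shown in this diff.
def load_artifacts():
    with open("./saved_models/model_02.pkl", "rb") as file:
        loaded_model = pickle.load(file)
    with open("./saved_models/scaler_02.pkl", "rb") as file:
        loaded_scaler = pickle.load(file)
    return loaded_model, loaded_scaler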
def get_evaluator():
    evaluator = ModelEvaluation(rf_model, Train_X_std, Train_Y, Test_X_std, Test_Y)
    return evaluator


if __name__ == "__main__":
    save_model()
    save_scaler()
    # model_evaluation()
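# Added note: because of the relative import ("from .ModelEvaluation import ..."), this
# file must be run as a module from the project root rather than as a plain script,
# e.g. something like `python -m models.house_price.<module_name>` (the package path is
# assumed from the models/house_price/ paths used above; the module name is a placeholder).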