Merge branch 'master' into Insurance_Cost_Predictor

yashasvini121 · Oct 26, 2024 · 3a9e7b4 · 3a9e7b4
2 parents 47a4c78 + a6cf6f8
commit 3a9e7b4
Show file tree

Hide file tree

Showing 8 changed files with 408 additions and 141 deletions.
diff --git a/App.py b/App.py
diff --git a/assets/images/machine_learning.png b/assets/images/machine_learning.png
diff --git a/models/house_price/ImprovedModel.py b/models/house_price/ImprovedModel.py
@@ -0,0 +1,196 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split, cross_val_score
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.feature_selection import RFE
+from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import RandomForestRegressor
+import warnings
+import pickle
+from .ModelEvaluation import ModelEvaluation
+import os
+import logging
+import streamlit as st
+import numpy as np
+warnings.filterwarnings("ignore")
+
+# Everything below runs at import time: importing this module creates the log
+# directory, configures logging, and loads the dataset.
+# All paths are relative, so they resolve against the current working directory.
+
+# Define the directory for logs
+log_directory = 'models/house_price/logs'
+os.makedirs(log_directory, exist_ok=True) # Create the directory if it doesn't exist
+
+# Set up logging
+log_file = os.path.join(log_directory, 'model_training.log')
+logging.basicConfig(
+	filename=log_file,
+	level=logging.INFO,
+	format='%(asctime)s - %(levelname)s - %(message)s'
+)
+
+# Load the raw dataset and keep an untouched deep copy alongside the working frame
+df = pd.read_csv("models/house_price/data/housing.csv")
+original_df = df.copy(deep=True)
+
+# Target and Feature Identification
+target = "price"
+features = [col for col in df.columns if col != target]
+
+# Separates numerical and categorical features based on unique values:
+# more than 16 distinct values -> numerical, 16 or fewer -> categorical
+nu = df[features].nunique()
+numerical_features = [col for col in features if nu[col] > 16]
+categorical_features = [col for col in features if nu[col] <= 16]
+
+# Removing outliers using IQR
+def remove_outliers(df, numerical_features):
+	"""Drop rows that fall outside the 1.5 * IQR fences of each numerical feature.
+
+	Features are handled one after another, so the quartiles of each feature
+	are computed on the rows that survived the previous features.
+
+	Args:
+		df: DataFrame to filter. The caller's object is not modified.
+		numerical_features: Column names the IQR filter is applied to.
+
+	Returns:
+		The filtered DataFrame with a fresh 0..n-1 index.
+	"""
+	filtered = df
+	for column in numerical_features:
+		lower_q, upper_q = (filtered[column].quantile(q) for q in (0.25, 0.75))
+		spread = upper_q - lower_q
+		low_fence = lower_q - 1.5 * spread
+		high_fence = upper_q + 1.5 * spread
+		# between() is inclusive on both ends by default, i.e. >= low and <= high.
+		filtered = filtered[filtered[column].between(low_fence, high_fence)]
+	return filtered.reset_index(drop=True)
+
+
+# Handling missing values
+def handle_missing_values(df):
+	"""Summarise the missing values of every column.
+
+	Despite the name, nothing is imputed or dropped: the function only reports.
+
+	Args:
+		df: DataFrame to inspect.
+
+	Returns:
+		A DataFrame indexed by column name with the columns
+		"Total Null Values" (null count) and "Percentage" (null count as a
+		percentage of the row count), sorted by "Percentage" descending.
+	"""
+	null_counts = df.isna().sum()
+	summary = pd.DataFrame(
+		{
+			"Total Null Values": null_counts,
+			"Percentage": null_counts.div(df.shape[0]).mul(100),
+		}
+	)
+	return summary.sort_values(by="Percentage", ascending=False)
+
+
+# Removes outliers from numerical features (rebinds df to the filtered frame)
+df = remove_outliers(df, numerical_features)
+
+# Filters categorical features without missing values:
+# a categorical column is kept only if its null percentage in the filtered frame is 0
+null_value_summary = handle_missing_values(df)
+valid_categorical_features = [
+	col
+	for col in categorical_features
+	if col not in null_value_summary[null_value_summary["Percentage"] != 0].index
+]
+
+# Encoding categorical features
+def encode_categorical_features(df, categorical_features):
+	"""Replace categorical columns with dummy (indicator) columns.
+
+	Columns with exactly 2 distinct values are overwritten in place with a
+	single indicator column, keeping their name and position. Note that this
+	writes into the DataFrame passed by the caller. Columns with 3 to 16
+	distinct values are dropped and their indicator columns (first level
+	omitted, prefixed with the column name) are appended at the end.
+	Any other column is left untouched.
+
+	Args:
+		df: DataFrame holding the columns to encode.
+		categorical_features: Names of the columns to encode.
+
+	Returns:
+		The encoded DataFrame.
+	"""
+	for column in categorical_features:
+		distinct = df[column].nunique()
+		if distinct < 2 or distinct > 16:
+			# Constant or high-cardinality columns are not encoded.
+			continue
+		dummies = pd.get_dummies(df[column], drop_first=True, prefix=column)
+		if distinct == 2:
+			# Binary feature: one indicator column replaces the original values.
+			df[column] = dummies
+		else:
+			df = pd.concat([df.drop([column], axis=1), dummies], axis=1)
+	return df
+
+# Encode the categorical columns that passed the missing-value filter
+df = encode_categorical_features(df, valid_categorical_features)
+
+# Renames columns to avoid invalid characters ("-" and " " become "_")
+df.columns = [col.replace("-", "_").replace(" ", "_") for col in df.columns]
+
+# Splitting the data into training & testing sets (80/20, fixed seed)
+X = df.drop([target], axis=1)
+Y = df[target]
+Train_X, Test_X, Train_Y, Test_Y = train_test_split(
+	X, Y, train_size=0.8, test_size=0.2, random_state=100
+)
+
+# Feature Scaling (Standardization)
+# The scaler is fitted on the training split only and then applied to the test split.
+std = StandardScaler()
+Train_X_std = pd.DataFrame(std.fit_transform(Train_X), columns=X.columns)
+Test_X_std = pd.DataFrame(std.transform(Test_X), columns=X.columns)
+
+# Random Forest Algorithm (trained at import time on the standardized training split)
+rf_model = RandomForestRegressor(random_state=42, n_estimators=200, max_depth=8, min_samples_split=12)
+rf_model.fit(Train_X_std, Train_Y)
+
+
+# Predictions on both splits; only referenced by the commented-out RMSE lines below
+pred_train = rf_model.predict(Train_X_std)
+pred_test = rf_model.predict(Test_X_std)
+
+# Calculate RMSE for train and test sets
+# NOTE: mean_squared_error is not among this file's imports; import it from
+# sklearn.metrics before re-enabling these lines.
+# train_rmse = np.sqrt(mean_squared_error(Train_Y, pred_train))
+# test_rmse = np.sqrt(mean_squared_error(Test_Y, pred_test))
+
+
+def prepare_input_data(
+	area,
+	mainroad,
+	guestroom,
+	basement,
+	hotwaterheating,
+	airconditioning,
+	prefarea,
+	additional_bedrooms,
+	bathrooms,
+	stories,
+	parking,
+	furnishingstatus,
+):
+	"""Build the single-row feature frame for one house.
+
+	Args:
+		area: Value of the "area" feature, passed through unchanged.
+		mainroad, guestroom, basement, hotwaterheating, airconditioning,
+			prefarea: Yes/no flags. Both the string "Yes" and the boolean
+			True count as yes; every other value counts as no.
+		additional_bedrooms: Compared against 2..6 to fill the "bedrooms_*"
+			indicator columns.
+		bathrooms: Compared against 2..4 for the "bathrooms_*" columns.
+		stories: Compared against 2..4 for the "stories_*" columns.
+		parking: Compared against 1..3 for the "parking_*" columns.
+		furnishingstatus: "semi_furnished" or "unfurnished" set the matching
+			indicator; any other value leaves both indicators False.
+
+	Returns:
+		A one-row DataFrame with 23 columns, in the order listed below.
+	"""
+
+	def is_yes(flag):
+		# Callers pass either the UI string "Yes"/"No" or a real boolean
+		# (get_predicted defaults these flags to False), so accept both.
+		return flag is True or flag == "Yes"
+
+	# Every value is wrapped in a one-element list so all columns share length 1.
+	input_data = {
+		"area": [area],
+		"mainroad": [is_yes(mainroad)],
+		"guestroom": [is_yes(guestroom)],
+		"basement": [is_yes(basement)],
+		"hotwaterheating": [is_yes(hotwaterheating)],
+		"airconditioning": [is_yes(airconditioning)],
+		"prefarea": [is_yes(prefarea)],
+		"bedrooms_2": [additional_bedrooms == 2],
+		"bedrooms_3": [additional_bedrooms == 3],
+		"bedrooms_4": [additional_bedrooms == 4],
+		"bedrooms_5": [additional_bedrooms == 5],
+		"bedrooms_6": [additional_bedrooms == 6],
+		"bathrooms_2": [bathrooms == 2],
+		"bathrooms_3": [bathrooms == 3],
+		"bathrooms_4": [bathrooms == 4],
+		"stories_2": [stories == 2],
+		"stories_3": [stories == 3],
+		"stories_4": [stories == 4],
+		"parking_1": [parking == 1],
+		"parking_2": [parking == 2],
+		"parking_3": [parking == 3],
+		"furnishingstatus_semi_furnished": [furnishingstatus == "semi_furnished"],
+		"furnishingstatus_unfurnished": [furnishingstatus == "unfurnished"],
+	}
+
+	return pd.DataFrame(input_data)
+
+# Note: This function is intentionally kept because of the warning in the predict.py file.
+
+
+### Final Endpoint ###
+def get_predicted(area=0, mainroad=False, guestroom=False, basement=False, hotwaterheating=False,
+	airconditioning=False, prefarea=False, bedrooms=0, bathrooms=2, stories=1, parking=1,
+	furnishingstatus="semi_furnished",):
+	"""Predict the price of one house with the module-level random forest.
+
+	The arguments are forwarded unchanged to prepare_input_data (with
+	`bedrooms` passed as its `additional_bedrooms` parameter). The resulting
+	row is standardized with the module-level scaler `std` and fed to
+	`rf_model`.
+
+	Returns:
+		The predicted price rounded to 2 decimal places.
+	"""
+	features_row = prepare_input_data(
+		area=area,
+		mainroad=mainroad,
+		guestroom=guestroom,
+		basement=basement,
+		hotwaterheating=hotwaterheating,
+		airconditioning=airconditioning,
+		prefarea=prefarea,
+		additional_bedrooms=bedrooms,
+		bathrooms=bathrooms,
+		stories=stories,
+		parking=parking,
+		furnishingstatus=furnishingstatus,
+	)
+
+	# Re-wrap the scaled values in a DataFrame under the same column names.
+	scaled_row = pd.DataFrame(std.transform(features_row), columns=features_row.columns)
+	return round(rf_model.predict(scaled_row)[0], 2)
+
+def save_model():
+	"""Pickle the trained random forest to saved_models/model_02.pkl.
+
+	The target directory is resolved relative to this module's own location
+	rather than the current working directory, and is created if missing.
+	An existing model_02.pkl is overwritten without warning.
+	"""
+	# todo: Ask the user for the model name, and warn that the model will be overwritten
+
+	# Anchor to this file so the pickle lands where predict.py loads it from
+	# ("models/house_price/saved_models/"), whatever the working directory is.
+	target_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "saved_models")
+	os.makedirs(target_dir, exist_ok=True)
+
+	with open(os.path.join(target_dir, "model_02.pkl"), "wb") as file:
+		pickle.dump(rf_model, file)
+
+
+def save_scaler():
+	"""Pickle the fitted StandardScaler to saved_models/scaler_02.pkl.
+
+	The target directory is resolved relative to this module's own location
+	rather than the current working directory, and is created if missing.
+	An existing scaler_02.pkl is overwritten without warning.
+	"""
+	# Anchor to this file so the pickle lands where predict.py loads it from
+	# ("models/house_price/saved_models/"), whatever the working directory is.
+	target_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "saved_models")
+	os.makedirs(target_dir, exist_ok=True)
+
+	with open(os.path.join(target_dir, "scaler_02.pkl"), "wb") as file:
+		pickle.dump(std, file)
+
+
+def get_evaluator():
+	"""Return a ModelEvaluation built from the trained model and both standardized splits."""
+	return ModelEvaluation(
+		rf_model,
+		Train_X_std,
+		Train_Y,
+		Test_X_std,
+		Test_Y,
+	)
+
+# When executed as a script, persist the trained model and the fitted scaler.
+if __name__ == "__main__":
+	save_model()
+	save_scaler()
+	# model_evaluation()
diff --git a/models/house_price/predict.py b/models/house_price/predict.py
@@ -1,6 +1,7 @@
 import pickle
 import pandas as pd
-from models.house_price.model import get_evaluator
+# from models.house_price.model import get_evaluator
+from models.house_price.ImprovedModel import get_evaluator
 
 """
 Predict.py file:
@@ -102,8 +103,8 @@ def get_prediction(
 	)
 
 	# Load the model and the scaler
-	model = load_model("models/house_price/saved_models/model_01.pkl")
-	scaler = load_model("models/house_price/saved_models/scaler_01.pkl")
+	model = load_model("models/house_price/saved_models/model_02.pkl")
+	scaler = load_model("models/house_price/saved_models/scaler_02.pkl")
 
 	# Scale the input data
 	input_scaled = scaler.transform(input_df)

diff --git a/models/house_price/saved_models/model_02.pkl b/models/house_price/saved_models/model_02.pkl
diff --git a/models/house_price/saved_models/scaler_02.pkl b/models/house_price/saved_models/scaler_02.pkl
diff --git a/page_handler.py b/page_handler.py
@@ -78,7 +78,7 @@ def render_model_details(self, model_module,tab):
 		if model_details_function:
 			metrics, prediction_plot, error_plot, performance_plot = model_details_function().evaluate()
 
-			st.subheader(f"Model Accuracy: {metrics['Test_R2']:.2%}")
+			st.subheader(f"Model Accuracy: {metrics['Test_R2']:.2f}")
 
 			#mentioning the title of the scores 
 			st.subheader(f"Scores: Training: {metrics['Train_R2']:.2f}, Testing: {metrics['Test_R2']:.2f}")

diff --git a/readme.md b/readme.md
@@ -13,7 +13,7 @@ The project has been successfully tested in local environments, and current effo
 2. Or create new issues to discuss new ideas, suggest features, or report bugs.
 3. Fork the repository and create a new branch for your contribution.
 4. Implement your changes and submit a pull request with a clear description.
-5. Futher details can be found in the [contributing.md](contributing.md) file.
+5. Further details can be found in the [contributing.md](contributing.md) file.
 
 ## Setup Instructions
 1. Fork or clone the repository.