Skip to content

Commit

Permalink
added .py file #206
Browse files Browse the repository at this point in the history
  • Loading branch information
SimranShaikh20 committed Nov 8, 2024
1 parent f66a554 commit 80fb3be
Showing 1 changed file with 222 additions and 0 deletions.
222 changes: 222 additions & 0 deletions pages/car_price_predictor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
# -*- coding: utf-8 -*-
"""car_price_predictor.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1rnMmmgkB7PlOcx__mXoXdJ9HYTBxE_TP
# Car Price Predictor
"""

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# %matplotlib inline
mpl.style.use('ggplot')

car=pd.read_csv('/content/drive/MyDrive/Data Sets/car.csv')

car.head()

car.shape

car.info()

"""##### Creating backup copy"""

backup=car.copy()

car['year'].unique()

car['Price'].unique()

car['kms_driven'].unique()

car['fuel_type'].unique()

"""## Quality
- names are pretty inconsistent
- names have company names attached to it
- some names are spam like 'Maruti Ertiga showroom condition with' and 'Well mentained Tata Sumo'
- company: many of the names are not of any company like 'Used', 'URJENT', and so on.
- year has many non-year values
- year is in object. Change to integer
- Price has Ask for Price
- Price has commas in its prices and is in object
- kms_driven has object values with kms at last.
- It has nan values and two rows have 'Petrol' in them
- fuel_type has nan values
## Cleaning Data
#### year has many non-year values
"""

car=car[car['year'].str.isnumeric()]

"""#### year is in object. Change to integer"""

car['year']=car['year'].astype(int)

"""#### Price has Ask for Price"""

car=car[car['Price']!='Ask For Price']

"""#### Price has commas in its prices and is in object"""

car['Price']=car['Price'].str.replace(',','').astype(int)

"""#### kms_driven has object values with kms at last."""

car['kms_driven']=car['kms_driven'].str.split().str.get(0).str.replace(',','')

"""#### It has nan values and two rows have 'Petrol' in them"""

car=car[car['kms_driven'].str.isnumeric()]

car['kms_driven']=car['kms_driven'].astype(int)

"""#### fuel_type has nan values"""

car=car[~car['fuel_type'].isna()]

car.shape

"""### name and company had spammed data...but with the previous cleaning, those rows got removed.
#### Company does not need any cleaning now. Changing car names. Keeping only the first three words
"""

car['name']=car['name'].str.split().str.slice(start=0,stop=3).str.join(' ')

"""#### Resetting the index of the final cleaned data"""

car=car.reset_index(drop=True)

"""## Cleaned Data"""

car

car.to_csv('Cleaned_Car_data.csv')

car.info()

car.describe(include='all')

"""### Checking relationship of Company with Price"""

car['company'].unique()

import seaborn as sns

plt.subplots(figsize=(15,7))
ax=sns.boxplot(x='company',y='Price',data=car)
ax.set_xticklabels(ax.get_xticklabels(),rotation=40,ha='right')
plt.show()

"""### Checking relationship of Year with Price"""

plt.subplots(figsize=(20,10))
ax=sns.swarmplot(x='year',y='Price',data=car)
ax.set_xticklabels(ax.get_xticklabels(),rotation=40,ha='right')
plt.show()

"""### Checking relationship of kms_driven with Price"""

sns.relplot(x='kms_driven',y='Price',data=car,height=7,aspect=1.5)

"""### Checking relationship of Fuel Type with Price"""

plt.subplots(figsize=(14,7))
sns.boxplot(x='fuel_type',y='Price',data=car)

"""### Relationship of Price with FuelType, Year and Company mixed"""

ax=sns.relplot(x='company',y='Price',data=car,hue='fuel_type',size='year',height=7,aspect=2)
ax.set_xticklabels(rotation=40,ha='right')

"""### Extracting Training Data"""

X=car[['name','company','year','kms_driven','fuel_type']]
y=car['Price']

X

y.shape

"""### Applying Train Test Split"""

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

"""#### Creating an OneHotEncoder object to contain all the possible categories"""

ohe=OneHotEncoder()
ohe.fit(X[['name','company','fuel_type']])

"""#### Creating a column transformer to transform categorical columns"""

column_trans=make_column_transformer((OneHotEncoder(categories=ohe.categories_),['name','company','fuel_type']),
remainder='passthrough')

"""#### Linear Regression Model"""

lr=LinearRegression()

"""#### Making a pipeline"""

pipe=make_pipeline(column_trans,lr)

"""#### Fitting the model"""

pipe.fit(X_train,y_train)

y_pred=pipe.predict(X_test)

"""#### Checking R2 Score"""

r2_score(y_test,y_pred)

scores=[]
for i in range(1000):
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=i)
lr=LinearRegression()
pipe=make_pipeline(column_trans,lr)
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
scores.append(r2_score(y_test,y_pred))

np.argmax(scores)

scores[np.argmax(scores)]

pipe.predict(pd.DataFrame(columns=X_test.columns,data=np.array(['Maruti Suzuki Swift','Maruti',2019,100,'Petrol']).reshape(1,5)))

"""#### The best model is found at a certain random state"""

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=np.argmax(scores))
lr=LinearRegression()
pipe=make_pipeline(column_trans,lr)
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
r2_score(y_test,y_pred)

import pickle

pickle.dump(pipe,open('LinearRegressionModel.pkl','wb'))

pipe.predict(pd.DataFrame(columns=['name','company','year','kms_driven','fuel_type'],data=np.array(['Maruti Suzuki Swift','Maruti',2019,100,'Petrol']).reshape(1,5)))

pipe.steps[0][1].transformers[0][1].categories[0]

0 comments on commit 80fb3be

Please sign in to comment.