# modeldoneinpy.py

# -*- coding: utf-8 -*-
"""train.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1tXCMlLyiCLHQ-5oRiihWAffyVTBUEuVl
"""

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Commented out IPython shell escape to ensure Python compatibility (a leading
# "!" is a SyntaxError outside a notebook). Install seaborn beforehand, e.g.
# with `pip install seaborn`, if it is not already available.
# !pip install seaborn

# Commented out IPython magic to ensure Python compatibility.
import seaborn as sns
import math
import matplotlib.pyplot as plt
# %matplotlib inline

# Load the training and test splits straight from the hackathon repository.
# NOTE(review): both URLs embed a GitHub raw-content token; presumably these
# reads stop working once the tokens expire -- confirm and switch to a stable path.
train_data=pd.read_csv('https://raw.githubusercontent.com/nimishbongale/devengers-hackathon/master/trainms.csv?token=AKLHGWLWGVVVQGQRKTZATRK5ONMUM')
test_data=pd.read_csv('https://raw.githubusercontent.com/nimishbongale/devengers-hackathon/master/testms.csv?token=AKLHGWNGNCHEF4H5ZPQWUWC5ONM5Q')
train_data

test_data

# Stack the test rows under the training rows so that both receive identical
# preprocessing; they are separated again by row position further down.
train_data=pd.concat([train_data,test_data])
train_data

train_data.columns

# Remove the columns that are excluded from the feature set.
train_data.drop(['s.no', 'Timestamp', 'leave', 'coworkers', 'benefits', 'care_options', 'anonymity', ],axis=1,inplace=True)

train_data.info()

train_data.isnull()

# Treat a missing self-employment answer as "No". The result is assigned back
# to the column rather than calling fillna(inplace=True) on the selection:
# the chained in-place form emits a FutureWarning on recent pandas and does
# not update the frame once Copy-on-Write is enabled.
train_data["self_employed"]=train_data["self_employed"].fillna("No")
train_data.isnull()

# Visualise where missing values remain.
sns.heatmap(train_data.isnull(),yticklabels=False,cmap="viridis")

def rem(train_data,field):
    """Replace a categorical column with its one-hot indicator columns.

    The indicators are built with ``pd.get_dummies(..., drop_first=True)``,
    appended to the right of the frame, and the source column is dropped.

    Args:
        train_data: DataFrame containing the column ``field``. It is not
            modified.
        field: Name of the categorical column to encode.

    Returns:
        A new DataFrame with ``field`` removed and its indicator columns
        appended. (Previously the function built this frame but returned
        ``None``, so calling it had no effect.)
    """
    dummies=pd.get_dummies(train_data[field],drop_first=True)
    encoded=pd.concat([train_data,dummies],axis=1)
    # Non-inplace drop: hand the finished frame back to the caller.
    return encoded.drop([field],axis=1)

# Expand the two categorical columns into indicator columns; drop_first leaves
# out the first category of each.
state=pd.get_dummies(train_data["state"],drop_first=True)
workinter=pd.get_dummies(train_data["work_interfere"],drop_first=True)
state.head()
workinter.head()

# Attach the indicators and retire the source columns in one step.
train_data=pd.concat([train_data,state,workinter],axis=1).drop(columns=['state','work_interfere'])
train_data.head()

# Plot the remaining gaps twice: once with the viridis palette, once without
# a colour bar.
sns.heatmap(train_data.isnull(),yticklabels=False,cmap="viridis")
sns.heatmap(train_data.isnull(),yticklabels=False,cbar=False)

# Total number of missing cells, followed by the columns that still hold any.
print(sum(train_data.isnull().sum()))
train_data.loc[:, train_data.isna().any()]

# The comments column is removed from the frame.
train_data=train_data.drop(columns=['comments'])
train_data.head()

train_data.isnull().sum()

def convert(str):
    """Collapse known free-text gender spellings into 'M' or 'F'.

    Matching is exact (case- and whitespace-sensitive) against the spellings
    listed below. Any other value, including non-string values, is returned
    unchanged.
    """
    male_spellings=('m','Male','male','Male ','Mal','Mail','Make','Guy (-ish) ^_^','maile','something kinda male?')
    female_spellings=('f','Female','female','Femake','Female ','Woman','woman')
    if str in male_spellings:
        return 'M'
    if str in female_spellings:
        return 'F'
    return str
# Normalise the free-text gender answers before encoding them.
train_data["Gender"]=train_data["Gender"].apply(convert)

# Each step below follows the same recipe: build indicator columns for one
# categorical column (leaving out its first category), append them to the
# frame, and drop the source column.
Gender=pd.get_dummies(train_data["Gender"],drop_first=True)
print(Gender.columns)
train_data=pd.concat([train_data,Gender],axis=1).drop(columns=['Gender'])

country=pd.get_dummies(train_data["Country"],drop_first=True)
train_data=pd.concat([train_data,country],axis=1).drop(columns=['Country'])

se=pd.get_dummies(train_data["self_employed"],drop_first=True)
train_data=pd.concat([train_data,se],axis=1).drop(columns=['self_employed'])

famhis=pd.get_dummies(train_data["family_history"],drop_first=True)
train_data=pd.concat([train_data,famhis],axis=1).drop(columns=['family_history'])

remwork=pd.get_dummies(train_data["remote_work"],drop_first=True)
train_data=pd.concat([train_data,remwork],axis=1).drop(columns=['remote_work'])
train_data.head()

print(train_data.columns)

# The remaining categorical columns, encoded in this exact order so the
# resulting column layout is unchanged.
# NOTE(review): the indicator columns are named after the category values
# only, so several of them presumably share labels across fields -- confirm
# whether a per-field prefix is wanted.
for field in (
    'tech_company',
    'no_employees',
    'wellness_program',
    'seek_help',
    'mental_health_consequence',
    'phys_health_consequence',
    'supervisor',
    'mental_health_interview',
    'phys_health_interview',
    'mental_vs_physical',
    'obs_consequence',
):
    remwork=pd.get_dummies(train_data[field],drop_first=True)
    train_data=pd.concat([train_data,remwork],axis=1).drop(columns=[field])

train_data.head()

# Encode the target: "Yes" -> 1, "No" -> 0, anything else (e.g. a missing
# answer) -> -1.
train_data["treatment"]=train_data["treatment"].apply(lambda x: 1 if x=='Yes' else 0 if x=='No' else -1)
train_data

test_data

# Undo the earlier concatenation by row position. At this point test_data
# still refers to the raw test frame, and the preprocessing above only added
# or removed columns, so the number of training rows is the combined length
# minus the raw test length. Deriving it here avoids relying on a hard-coded
# row count of 1000.
n_train=len(train_data)-len(test_data)
test_data=train_data.iloc[n_train:]
train_data=train_data.iloc[:n_train]

train_data.columns

test_data.columns

# Separate the target from the feature columns.
Y=train_data["treatment"]
X=train_data.drop(['treatment'], axis=1)

# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# keeps the split the same between runs.
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0)

# First model: a random forest classifier (the variable is named "regressor",
# but both models fitted here are classifiers).
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
regressor=RandomForestClassifier(n_estimators=1000,criterion = 'entropy',random_state=0)
regressor.fit(X_train,Y_train)

# NOTE(review): mean_squared_error is applied to class labels; if the labels
# are only 0/1 this presumably equals the misclassification rate -- confirm.
Y_pred=regressor.predict(X_test)
mean_squared_error(Y_test,Y_pred)

# Second model: extra-trees classifier. It rebinds "regressor", so this is the
# model used for the final predictions below.
# NOTE(review): no random_state is passed here, so results presumably vary
# between runs; max_features=100 assumes at least 100 feature columns -- TODO confirm.
from sklearn.ensemble import ExtraTreesClassifier
regressor=ExtraTreesClassifier(n_estimators=1000,max_features= 100,criterion= 'entropy',min_samples_split= 2,max_depth= 50, min_samples_leaf= 5)
regressor.fit(X_train,Y_train)

Y_pred=regressor.predict(X_test)
mean_squared_error(Y_test,Y_pred)

# Accuracy of the extra-trees model on the held-out rows.
from sklearn.metrics import accuracy_score
accuracy_score(Y_test,Y_pred)

# Drop the target column from the test rows so that the feature columns match
# those the model was fitted on.
test_data=test_data.drop(['treatment'], axis=1)

Y_final_pred=regressor.predict(test_data)

# Print each prediction as a label: 1 -> "Yes", anything else -> "No".
for item in Y_final_pred:
    print("Yes" if item==1 else "No")

import csv

# Convert the numeric predictions back into labels: 1 -> "Yes", anything
# else -> "No". All predictions are written as a single CSV row.
row = ["Yes" if i==1 else "No" for i in Y_final_pred]
# newline='' lets the csv module control line endings itself (otherwise extra
# blank lines can appear on some platforms). The file is opened in append
# mode, so every run adds another row to an existing result.csv. The with
# statement closes the file, so no explicit close() call is needed.
with open('result.csv', 'a', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(row)

# Commented out IPython shell escape to ensure Python compatibility (a leading
# "!" is a SyntaxError outside a notebook).
# !ls

# Offer result.csv as a download through the Colab helper.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime -- confirm before running this script elsewhere.
from google.colab import files
files.download('result.csv')