%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Stylish Article
% LaTeX Template
% Version 2.1 (1/10/15)
%
% This template has been downloaded from:
% http://www.LaTeXTemplates.com
%
% Original author:
% Mathias Legrand ([email protected])
% With extensive modifications by:
% Vel ([email protected])
%
% License:
% CC BY-NC-SA 3.0 (http://creativecommons.org/licenses/by-nc-sa/3.0/)
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%----------------------------------------------------------------------------------------
% PACKAGES AND OTHER DOCUMENT CONFIGURATIONS
%----------------------------------------------------------------------------------------
\documentclass[fleqn,10pt]{SelfArx} % Document font size and equations flushed left
\usepackage[english]{babel} % Specify a different language here - english by default
\usepackage{lipsum} % Required to insert dummy text. To be removed otherwise
\usepackage{underscore}
%----------------------------------------------------------------------------------------
% COLUMNS
%----------------------------------------------------------------------------------------
\setlength{\columnsep}{0.55cm} % Distance between the two columns of text
\setlength{\fboxrule}{0.75pt} % Width of the border around the abstract
%----------------------------------------------------------------------------------------
% COLORS
%----------------------------------------------------------------------------------------
\definecolor{color1}{RGB}{0,0,90} % Color of the article title and sections
\definecolor{color2}{RGB}{0,20,20} % Color of the boxes behind the abstract and headings
%----------------------------------------------------------------------------------------
% HYPERLINKS
%----------------------------------------------------------------------------------------
\usepackage{hyperref} % Required for hyperlinks
\hypersetup{hidelinks,colorlinks,breaklinks=true,urlcolor=color2,citecolor=color1,linkcolor=color1,bookmarksopen=false,pdftitle={Title},pdfauthor={Author}}
%----------------------------------------------------------------------------------------
% ARTICLE INFORMATION
%----------------------------------------------------------------------------------------
\JournalInfo{Applied Machine Learning Fall 2018} % Journal information
\Archive{} % Additional notes (e.g. copyright, DOI, review/research article)
\PaperTitle{Semester Project - Home Credit Default Risk} % Article title
\Authors{Author: Utsav Patel} % Author
\Keywords{} % Keywords - if you don't want any simply remove all the text between the curly brackets
%\newcommand{\keywordname}{Keywords} % Defines the keywords heading name
%----------------------------------------------------------------------------------------
% ABSTRACT
%----------------------------------------------------------------------------------------
\Abstract{A bank typically grants a loan only if the customer's credit history is good. When no credit history is available, the bank has to rely on other evidence to decide whether to lend to such customers. Our model, trained on historical loan application data, predicts whether a customer will default. The main challenge is that this dataset contains far more information on non-defaulters than on defaulters. We apply several models and techniques, compare their performance, and finally select the best model.}
%----------------------------------------------------------------------------------------
\begin{document}
\flushbottom % Makes all text pages the same height
\maketitle % Print the title and abstract box
\tableofcontents % Print the contents section
\thispagestyle{empty} % Removes page numbering from the first page
%----------------------------------------------------------------------------------------
% ARTICLE CONTENTS
%----------------------------------------------------------------------------------------
\section*{Introduction} % The \section*{} command stops section numbering
\addcontentsline{toc}{section}{Introduction} % Adds this section to the table of contents
%\lipsum[1-3] % Dummy text
Our project aims to use historical loan application data to predict whether an underserved applicant (a person with insufficient or no credit history) will be able to repay a loan. With an efficient model of this kind, banks and financial institutions can target only the promising customers. This not only saves the banks from spending resources unnecessarily but also provides a positive and safe borrowing [5] experience for the customer. The objective is especially compelling considering the growth in the number of financial institutions over the years. We implement several supervised learning techniques, namely Logistic Regression, Random Forest, K Nearest Neighbors, Decision Tree, LightGBM and XGBoost, compare their results on the chosen evaluation metrics, and then select the most effective technique.
\pagebreak
%------------------------------------------------
\section{Models and Methodology}
%\begin{figure*}[ht]\centering % Using \begin{figure*} makes the figure take up the entire %width of the page
%\includegraphics[width=\linewidth]{view}
%\caption{Wide Picture}
%\label{fig:view}
%\end{figure*}
%\lipsum[4] % Dummy text
We use only the application train data for most models, since including features from the other tables reduced their performance. LightGBM, on the other hand, uses the other tables and achieves better accuracy than the XGBoost model. \\
\noindent
We also mentioned SVM in the proposal but implement a Decision Tree instead, since decision trees perform better on unbalanced data.\\
\noindent
Below are the models we have implemented:\\
Logistic Regression\\
Random Forest\\
Decision Tree\\
K Nearest Neighbors\\
XGBoost\\
\subsection{Exploratory Data Analysis:}
The training data has 307511 observations (each one a separate loan) and 122 features including the TARGET (the label we want to predict). \\
\noindent
1) \textbf{Imbalance}\\
For EDA we first examine the distribution of the TARGET column: 0 (282686 samples) and 1 (24825 samples), where 0 indicates the loan was repaid on time and 1 indicates the [4] client had payment difficulties. There are far more loans that were repaid on time than loans that were not repaid, i.e. there are far more samples from class 0 than from class 1.
\\
Below are considerations to tackle this imbalance issue:\\
\begin{itemize}
\item We analyse (later) various imbalance techniques with respect to each model and see which technique works for which model.
\item We use only the f1_score and recall to determine and compare the different models and find the best-performing one. This is because classifying a defaulter as a non-defaulter (i.e. 1 as 0) is much worse than classifying a non-defaulter as a defaulter (i.e. 0 as 1); we are therefore most interested in catching more true positives while still keeping one of the better f1_scores. A minimal sketch of how these metrics are computed follows this list.
\end{itemize}
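\noindent
To make the metric choice concrete, here is a minimal sketch (assuming the standard scikit-learn metric functions; the labels are made up purely for illustration) that computes the confusion matrix, recall and f1_score for a toy prediction:
\begin{verbatim}
import numpy as np
from sklearn.metrics import (confusion_matrix,
                             recall_score, f1_score)

# 1 = defaulter (minority), 0 = repaid on time.
y_true = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])
y_pred = np.array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0])

# Rows: true class, columns: predicted class.
print(confusion_matrix(y_true, y_pred))
# Recall: share of true defaulters we catch.
print("recall:", recall_score(y_true, y_pred))
# F1: harmonic mean of precision and recall.
print("f1    :", f1_score(y_true, y_pred))
\end{verbatim}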
\noindent
\textbf{Handle the imbalance issue: }
When the two classes are not balanced, the model is biased towards learning the majority class. This results in poorer classification of the minority class, since not enough data is available for it. The effect becomes more severe as the ratio of majority to minority samples grows, which is exactly our case. Some models, such as decision trees, may be partial exceptions to this behaviour.\\
\noindent
We have applied four approaches to handle this issue:\\
\begin{itemize}
\item Oversampling:\\
In this method the data from the minority class is replicated so that a balance can be established between the two classes.
\item Undersampling:\\
In this method the data from the majority class is removed in order to balance the data.
\item Synthetic minority oversampling technique:\\
The minority class is oversampled synthetically using an algorithm, rather than by replicating existing minority samples. This reduces the overfitting that plain oversampling is prone to.
\item Improve the cost function:\\
There are several approaches [1]; the one we use is class weights. A higher weight is given to the minority class so that the cost function penalises errors on the minority class more heavily.\\
To determine the best f1_score at each noise level we run a grid search and evaluate the best grid model against the test data from Step 2 of the approaches section. A minimal sketch of all four approaches follows this list.
\end{itemize}
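\noindent
The sketch below illustrates the four approaches on synthetic data, assuming the imbalanced-learn (imblearn) and scikit-learn APIs; the dataset, sampling ratios and class weights are illustrative only:
\begin{verbatim}
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import (RandomOverSampler,
                                    SMOTE)
from imblearn.under_sampling import RandomUnderSampler

# Toy imbalanced data standing in for application_train.
X, y = make_classification(n_samples=5000,
                           n_features=20,
                           weights=[0.92, 0.08],
                           random_state=42)

# 1) Oversampling: replicate minority rows until
#    minority/majority = 0.6.
X_os, y_os = RandomOverSampler(
    sampling_strategy=0.6,
    random_state=42).fit_resample(X, y)

# 2) Undersampling: drop majority rows instead.
X_us, y_us = RandomUnderSampler(
    sampling_strategy=0.6,
    random_state=42).fit_resample(X, y)

# 3) SMOTE: synthesize new minority samples rather
#    than replicating existing ones.
X_sm, y_sm = SMOTE(sampling_strategy=0.6,
                   random_state=42).fit_resample(X, y)

# 4) Cost function approach: keep the data as is and
#    weight the minority-class errors more heavily.
clf = DecisionTreeClassifier(
    class_weight={0: 0.1, 1: 0.9},
    random_state=42).fit(X, y)
\end{verbatim}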
\textbf{Noise levels for each imbalance technique: }
\begin{itemize}
\item Sampling strategy: the ratio of the number of samples in the minority class to the number of samples in the majority class after resampling. This parameter is used for oversampling, undersampling and SMOTE.
\item Class weights: the weights given to the errors from each class in the cost function. We put more weight on the minority class to make the model more sensitive to errors on that class. This noise level applies to the cost function based approach.\\
\end{itemize}
\noindent
2) \textbf{Anomalies} [4]:
Using the analysis of the unique values in each column, we treat the anomalies as follows (a short pandas sketch follows this list):\\
\begin{itemize}
\item DAYS_EMPLOYED: the maximum value, 365243 days, corresponds to roughly 1000 years of employment, so we replace all such anomalous values with nan.
\item CODE_GENDER: replace the value XNA with nan, since M and F are the only valid values.
\item DAYS_LAST_PHONE_CHANGE: replace 0 with nan, since 0 is not a possible value for this column.
\end{itemize}
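\noindent
A minimal pandas sketch of this anomaly treatment, applied to a few made-up rows (the column names match application_train, the values are illustrative):
\begin{verbatim}
import numpy as np
import pandas as pd

app = pd.DataFrame({
    "DAYS_EMPLOYED": [-1200, 365243, -300],
    "CODE_GENDER": ["M", "XNA", "F"],
    "DAYS_LAST_PHONE_CHANGE": [-800.0, 0.0, -15.0],
})

# 365243 days (~1000 years) is a sentinel, not real.
app["DAYS_EMPLOYED"] = (
    app["DAYS_EMPLOYED"].replace(365243, np.nan))
# Only M and F are valid genders.
app["CODE_GENDER"] = (
    app["CODE_GENDER"].replace("XNA", np.nan))
# 0 is not a plausible value for this column.
app["DAYS_LAST_PHONE_CHANGE"] = (
    app["DAYS_LAST_PHONE_CHANGE"].replace(0, np.nan))
\end{verbatim}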
\noindent
3) \textbf{Label Encoding:}\\
\noindent
Next we encode the Categorical Variables:\\
\begin{itemize}
\item Label Encoding for any categorical variables with only 2 categories
\item One-Hot Encoding for any categorical variables with more than 2 categories. In total, 3 columns were label encoded. A minimal encoding sketch follows this list.
\end{itemize}
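\noindent
The sketch below illustrates this split between label encoding and one-hot encoding, assuming the usual scikit-learn and pandas APIs; the columns and values are illustrative:
\begin{verbatim}
import pandas as pd
from sklearn.preprocessing import LabelEncoder

app = pd.DataFrame({
    # 2 categories -> label encode
    "FLAG_OWN_CAR": ["Y", "N", "Y"],
    "NAME_CONTRACT_TYPE": ["Cash loans",
                           "Revolving loans",
                           "Cash loans"],
    # more than 2 categories -> one-hot encode
    "OCCUPATION_TYPE": ["Laborers", "Core staff",
                        "Managers"],
})

le = LabelEncoder()
for col in app.select_dtypes(include="object"):
    if app[col].nunique() <= 2:
        app[col] = le.fit_transform(app[col])

# Remaining object columns (>2 categories) are
# one-hot encoded.
app = pd.get_dummies(app)
\end{verbatim}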
4) \textbf{Missing Values:} \\
\noindent
Next we look at the number and percentage of missing values in each column. There are 67 columns with missing values, and we used 3 different techniques for handling them. ([4] Note that the code for finding the missing values is referenced, but the strategies and how we treat them are our own approach.)\\
Below are the strategies we tried for handling the missing values.
\begin{itemize}
\item -999 strategy:\\
We replace every missing value with -999.
\item Mean/mode approach:\\
For every column with missing values (say col):\\
Step 1) Create a new column that has value 1 where col is missing and 0 otherwise.\\
Step 2) Replace every missing value in col with the column mean if it is numerical, and with the column mode if it is categorical.\\
This works well partly because the new indicator column gives the model extra information about where values were missing, which it can use to improve accuracy. (A short sketch of the -999 and mean/mode strategies follows this list.)\\
\item Strategic imputer:\\
Below are the steps. We first identify two groups of columns:\\
Dataset A: 76 continuous, non-categorical columns.\\
Dataset B: 46 columns containing categorical and continuous data.\\
We treat the two groups differently:\\
Dataset A:\\
- The values lie between 0 and 1; the missing values are replaced with the mode of the column.\\
Dataset B:\\
- If the column is an object, perform one-hot encoding.\\
- If it is a float, impute with the mode of that column.\\
\item Failed Attempts:\\
1) Aggregation \\
We aggregate the numeric columns over group-by columns using mean or max, as listed below (group-by columns - numeric columns - method):\\
1) 'CODE_GENDER', 'NAME_EDUCATION_TYPE' - AMT_ANNUITY - max;\\
'CODE_GENDER', 'ORGANIZATION_TYPE' - AMT_INCOME_TOTAL, DAYS_REGISTRATION - mean\\
2) 'CODE_GENDER', 'REG_CITY_NOT_WORK_CITY' - CNT_CHILDREN - mean\\
3) 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE' - AMT_CREDIT, AMT_REQ_CREDIT_BUREAU_YEAR, APARTMENTS_AVG, BASEMENTAREA_AVG, NONLIVINGAREA_AVG, OWN_CAR_AGE, YEARS_BUILD_AVG - mean\\
4) 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY' - ELEVATORS_AVG - mean\\
The resulting aggregated columns are more correlated with the target column and help some models such as Random Forest, but they may also increase the correlation with the group-by columns.\\
2) Normalization\\
We tried normalizing the data in application train, but it reduced performance and hence was not used.\\
\end{itemize}
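\noindent
The following minimal pandas sketch shows the -999 strategy and the mean/mode-with-indicator strategy on a couple of made-up columns (column names and values are illustrative):
\begin{verbatim}
import numpy as np
import pandas as pd

app = pd.DataFrame({
    "AMT_ANNUITY": [24700.5, np.nan, 6750.0],
    "OCCUPATION_TYPE": ["Laborers", np.nan,
                        "Managers"],
})

# Strategy 1: replace every missing value with -999.
app_999 = app.fillna(-999)

# Strategy 2: flag missingness, then impute with the
# mean (numeric) or the mode (categorical).
app_mm = app.copy()
for col in list(app_mm.columns):
    if app_mm[col].isna().any():
        app_mm[col + "_missing"] = (
            app_mm[col].isna().astype(int))
        if pd.api.types.is_numeric_dtype(app_mm[col]):
            app_mm[col] = app_mm[col].fillna(
                app_mm[col].mean())
        else:
            app_mm[col] = app_mm[col].fillna(
                app_mm[col].mode()[0])
\end{verbatim}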
\subsection{APPROACHES:}
\textbf{Experiment to find the best missing value strategy:}
\begin{itemize}
\item Step1) Treat the anomalies.
\item Step2) Perform the label encoding and one hot encoding on the categorical columns
\item Step3) Find the columns containing missing values (this includes the newly one-hot encoded columns).
\item Step4) Apply the -999 missing value strategy
\item Step5) Split the data into train and test set
\item Step6) Perform tree-based feature selection on the train data to find the feature importances. Based on these importances we subset both the train data and the test data.
\item Step7) Perform a grid search for each model and find the best possible accuracy and the best hyperparameters (a sketch of Steps 6 and 7 follows this list).
\item Step8) Repeat Steps 1 to 6 for the other missing value strategies.
\item Step9) Reuse the hyperparameters found in Step7; only if the performance is poorer than with the -999 strategy do we then search for the best hyperparameters for that missing value strategy.
\item Step10) Based on the results obtained we decide which method best works for which model.
\end{itemize}
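\noindent
As an illustration of Steps 6 and 7, the sketch below uses scikit-learn's SelectFromModel for the tree-based feature selection and GridSearchCV for the hyperparameter search, on synthetic data; the parameter grid is illustrative, not the exact one used in the report:
\begin{verbatim}
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import (GridSearchCV,
                                     train_test_split)

X, y = make_classification(n_samples=2000,
                           n_features=50,
                           weights=[0.92, 0.08],
                           random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Step 6: tree-based feature selection fitted on the
# train split, then applied to train and test.
sel = SelectFromModel(
    RandomForestClassifier(n_estimators=50,
                           random_state=42))
sel.fit(X_tr, y_tr)
X_tr_s = sel.transform(X_tr)
X_te_s = sel.transform(X_te)

# Step 7: grid search for the best hyperparameters,
# scored with f1 as in the report.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid={"n_estimators": [5, 13, 17],
                "max_depth": [20, 50, 90]},
    scoring="f1", cv=3)
grid.fit(X_tr_s, y_tr)
print(grid.best_params_, grid.score(X_te_s, y_te))
\end{verbatim}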
\textbf{Experiment on the techniques to handle imbalance:}
\begin{itemize}
\item Step1) Perform the EDA of the dataset (anomalies and label encoding) and use the best performing missing value strategy for each model.
\item Step2) Split the dataset into train and test.
\item Step3) Apply Step4 onwards to the train dataset only.
\item Step4) For each model, iterate over the different noise levels and repeat Steps 5 to 6 for each one.
\item Step5) Perform feature engineering on the resampled train set.
\item Step6) Run a grid search at each sampling strategy to obtain the best possible accuracy (f1_score) at that noise level. To determine the best f1_score for each noise level we evaluate the best grid model against the test data from Step2.
\item Step7) Finally, find the sweet spot: the value of the sampling strategy at which the model gives the highest f1_score (see the sketch after this list).
\item Step8) Perform steps 2 to 7 for each imbalance technique and observe the best solutions for each model.
\end{itemize}
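\noindent
A compact sketch of this loop, assuming the scikit-learn and imbalanced-learn APIs and using undersampling as the example technique; the data, grid and noise levels are illustrative:
\begin{verbatim}
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.model_selection import (GridSearchCV,
                                     train_test_split)
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(n_samples=5000,
                           n_features=20,
                           weights=[0.92, 0.08],
                           random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, stratify=y, random_state=42)

best = None
for ratio in [0.15, 0.3, 0.45, 0.6, 0.75, 0.9]:
    # Resample the train split only; the test split
    # stays untouched.
    X_rs, y_rs = RandomUnderSampler(
        sampling_strategy=ratio,
        random_state=42).fit_resample(X_tr, y_tr)
    grid = GridSearchCV(
        DecisionTreeClassifier(random_state=42),
        param_grid={"max_depth": [20, 50],
                    "criterion": ["gini", "entropy"]},
        scoring="f1", cv=3).fit(X_rs, y_rs)
    # Judge each noise level on the untouched test set.
    score = f1_score(y_te, grid.predict(X_te))
    if best is None or score > best[1]:
        best = (ratio, score, grid.best_params_)

print("sweet spot:", best)
\end{verbatim}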
\subsection{Approach to combine all 7 tables:}
Combining all the tables was a real challenge for us, because they contain a very large number of missing values. The missing value \% for a column was as high as 70\% in many cases. Below is an image illustrating this.\\
\includegraphics[width=\linewidth]{utsav1.png}
% \caption{In-text Picture}
\label{fig:results}
\noindent
This was true for all 7 tables. Moreover, combining those tables into one would leave the merged table with an even higher share of missing values.\\
\noindent
• Adding useful features: For almost all tables we added our own columns by applying logical operations to pairs of columns that were already present. As a result, many of these self-created columns ended up among the top 50 features out of the 798 features obtained after combining all tables. \\
\noindent
• Using the mean of a person's entries: Consider a normally distributed variable such as age, with 1 million observed values and another 1 million missing values to estimate. The best single estimate for the missing values is the current mean. We used this idea to fill NA's indirectly: we group by SK_ID_BUREAU, take the mean of the other columns, and store the result as one row per ID in a separate dataframe that is used later. Thus, if there are 50 entries for a particular SK_ID_BUREAU, all of them are collapsed into the mean of each of the other columns. A minimal pandas sketch follows the figure below. \\
\includegraphics[width=\linewidth]{utsav2.png}
% \caption{In-text Picture}
\label{fig:results}
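\noindent
A minimal groupby sketch of this step, on made-up bureau_balance-style rows (the columns other than SK_ID_BUREAU are illustrative):
\begin{verbatim}
import pandas as pd

# Several entries per SK_ID_BUREAU.
bb = pd.DataFrame({
    "SK_ID_BUREAU": [100, 100, 100, 101, 101],
    "MONTHS_BALANCE": [-1, -2, -3, -1, -2],
    "STATUS_DPD": [0, 1, 0, 2, 2],
})

# Collapse all rows of each SK_ID_BUREAU into one row
# of column means, kept in a separate dataframe that
# is merged back later.
bb_mean = (bb.groupby("SK_ID_BUREAU").mean()
             .add_suffix("_MEAN")
             .reset_index())
\end{verbatim}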
\noindent
• Min, mean, max, var: For some tables we also computed the mean, minimum, maximum and variance of the numeric columns by grouping on SK_ID_CURR (for other tables this ID may differ); a sketch follows the figure below.\\
\includegraphics[width=\linewidth]{utsav3.png}
% \caption{In-text Picture}
\label{fig:results}
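\noindent
A short sketch of this aggregation, on made-up bureau-style rows (the column values are illustrative):
\begin{verbatim}
import pandas as pd

# Several previous credits per SK_ID_CURR.
bureau = pd.DataFrame({
    "SK_ID_CURR": [1, 1, 1, 2, 2],
    "AMT_CREDIT_SUM": [45000.0, 90000.0, 30000.0,
                       120000.0, 60000.0],
    "DAYS_CREDIT": [-500, -300, -1200, -60, -900],
})

# One row per applicant with min / mean / max / var
# of every numeric column.
agg = (bureau.groupby("SK_ID_CURR")
             .agg(["min", "mean", "max", "var"]))
# Flatten the resulting MultiIndex column names.
agg.columns = ["_".join(c).upper() for c in agg.columns]
agg = agg.reset_index()
\end{verbatim}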
\noindent
• Replacing anomalies: For some tables we had to replace anomalous entries, such as an age of 365243 days, with nan. We used nan because XGBoost and LightGBM handle missing values efficiently, internally choosing how to treat them so as to reduce the log loss, rather than us imputing values that might increase it. \\
\includegraphics[width=\linewidth]{utsav4.png}
% \caption{In-text Picture}
\label{fig:results}
\noindent
• Using the Credit Active and Credit Inactive entries: For the bureau table we treated the active and inactive credits as separate, useful pieces of information and then handled each in the manner explained under Min, mean, max, var (for the numerical columns only). \\
\noindent
STEP 1:\\
\includegraphics[width=\linewidth]{utsav5.png}
% \caption{In-text Picture}
\label{fig:results}
\noindent
STEP 2:\\
\includegraphics[width=\linewidth]{utsav6.png}
% \caption{In-text Picture}
\label{fig:results}
\noindent
We did the same for the inactive credit entries.\\
\noindent
• We applied the five techniques mentioned above to all the tables and, at the end, exported the resulting dataframe to a csv file.\\
\includegraphics[width=\linewidth]{utsav7.png}
% \caption{In-text Picture}
\label{fig:results}
\subsection{MODEL IMPLEMENTATION: }
Below are the models we have implemented:\\
Logistic Regression\\
Random Forest\\
Decision Tree\\
K Nearest Neighbors\\
XGBoost\\
\noindent
\textbf{Performance issues:}
There are two key factors that influence the performance of the models:\\
\begin{itemize}
\item The data is extremely unbalanced: class 0 has more than 10 times as many samples as class 1 in the training set ({0: 226132, 1: 19876}).
\item The strategy applied to handle the missing values.
\end{itemize}
\pagebreak
\section{EXPERIMENTS AND RESULTS:}
\subsection{Experiment to find the best missing value strategy:}
1) Replace missing with -999\\
\includegraphics[width=\linewidth]{raja3.png}
% \caption{In-text Picture}
\label{fig:results}
2) Replace missing values with -999 after aggregation technique\\
\includegraphics[width=\linewidth]{raja4.png}
% \caption{In-text Picture}
\label{fig:results}
\pagebreak
3) Mean Mode Imputation:\\
\includegraphics[width=\linewidth]{raja5.png}
% \caption{In-text Picture}
\label{fig:results}
4) Mean Mode imputation after aggregation technique\\
\includegraphics[width=\linewidth]{raja6.png}
% \caption{In-text Picture}
\label{fig:results}
\pagebreak
5) Strategic Imputer:\\
\includegraphics[width=\linewidth]{raja7.png}
% \caption{In-text Picture}
\label{fig:results}
\noindent
OBSERVATIONS:\\
\noindent
1) The K Nearest Neighbors grid search selects 1 neighbor as the best model, so we also report the second best, i.e. 3 neighbors. 1 neighbor is still a valid hyperparameter and works reasonably well, especially in binary classification problems.\\
\noindent
2) The KNN model performs best with the -999 imputer, the strategic imputer (though not for 3 neighbors) and the mean/mode imputer, while performing poorly on the strategies with aggregation.\\ It is possible that the aggregation increased the correlation between the categorical group-by features and the aggregated columns (which is not desirable), even though the new aggregated columns are more correlated with the output Y (which is desirable).\\
\noindent
3) The Random Forest performs best with the mean/mode and strategic imputers. Note that here -999 with aggregation performs better than without aggregation. Our guess is that the correlation issue explained in point 2 did not affect RF because of the bootstrapping involved in building it.\\
\noindent
4) The Decision tree performs best with mean mode imputer.\\
\noindent
5) Logistic Regression: the -999 approach works best, closely followed by the mean/mode technique. \\
\noindent
6) Overall, the mean/mode imputation technique works best across the models, hence we choose it as the missing value strategy for all models.\\
\pagebreak
\subsection{Experiment on the techniques to handle imbalance:}
Below are the results and observations for each model:\\
\bigbreak
\noindent
\textbf{ DECISION TREE:}\\
\bigbreak
\noindent
\textbf{1) Oversampling:}\\
\noindent
F1_score at different levels of noise (noise-f1_score) :\\
(0.15, 0.145716),( 0.3,0.147995),(0.45, 0.142224),\\
( 0.75, 0.144683), (0.9, 0.135496)\\
\noindent
Best Sampling strategy-- 0.3\\
\noindent
Y train after resampling Counter({0: 226132, 1: 67839})\\
Improved number of features-- 90\\
\noindent
Best parameter on grid-- {'random_state': 42, 'max_features': 'auto', 'max_depth': 50, 'criterion': 'gini'}\\
\noindent
DECISION TREE model:\\
Train Confusion Matrix:
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $226130$ & $2$ & $$\\
\midrule
L = 1 & $3$ &$67836$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
Test Confusion Matrix:
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $52080$ & $4474$ & $$\\
\midrule
L = 1 & $4196$ &$753$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
\begin{tabular}{lllll}
& PRECISION & RECALL & F1\_SCORE & AUC \\
Train & 0.999 & 0.999 & 0.999 & 1.00 \\
Test & 0.144 & 0.152 & 0.147 & 0.536
\end{tabular}
\bigbreak
\noindent
\textbf{2) SMOTE:}\\
\bigbreak
\noindent
F1_score at different levels of noise (noise- f1_score) :\\
(0.15, 0.073491), (0.45, 0.083753), (0.75, 0.083657), (0.9, 0.091882)\\
\noindent
Sampling strategy-- 0.9\\
\noindent
Y train after resampling Counter({0: 226132, 1: 203518})\\
Improved number of features- 111\\
Best parameter on grid-- {'random_state': 42, 'max_features': 'auto', 'max_depth': 20, 'criterion': 'gini'}\\
\bigbreak
\noindent
DECISION TREE model:\\
\bigbreak
\noindent
Train Confusion Matrix:
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $225182$ & $950$ & $$\\
\midrule
L = 1 & $14191$ &$189327$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
Test Confusion Matrix:
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $54463$ & $2091$ & $$\\
\midrule
L = 1 & $4610$ &$339$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
\begin{tabular}{lllll}
& PRECISION & RECALL & F1\_SCORE & AUC \\
Train & 0.995 & 0.932 & 0.961 & 0.990 \\
Test & 0.139 & 0.068 & 0.091 & 0.589
\end{tabular}
\bigbreak
\noindent
\textbf{3) Under Sampling:}\\
F1_score at different levels of noise (noise-f1_score) :\\
(0.15, 0.163427),(0.6, 0.194770),( 0.75, 0.193454), (0.9, 0.185412) \\
\bigbreak
\noindent
Sampling strategy-- 0.6\\
Y train after resampling Counter({0: 33126, 1: 19876})\\
Improved number of features- 94\\
Best parameter on grid-- {'random_state': 42, 'max_features': 'auto', 'max_depth': 20, 'criterion': 'gini'}\\
\noindent
DECISION TREE model:\\
\noindent
Train Confusion Matrix:
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $30749 $ & $2377$ & $$\\
\midrule
L = 1 & $3446 $ &$16430$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
Test Confusion Matrix:
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $40102 $ & $16452$ & $$\\
\midrule
L = 1 & $2640 $ &$2309$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
\begin{tabular}{lllll}
& PRECISION & RECALL & F1\_SCORE & AUC \\
Train & 0.873 & 0.826 & 0.849 & 0.963 \\
Test & 0.123 & 0.466 & 0.194 & 0.583
\end{tabular}
\bigbreak
\noindent
\textbf{4)COST FUNCTION BASED APPROACH:}
\bigbreak
\noindent
Class weights-- {0: 0.1, 1: 0.9}\\
Improved number of features- 95\\
Best parameter on grid-- {'random_state': 42, 'max_features': 'auto', 'max_depth': 20, 'criterion': 'gini'}\\
DECISION TREE model:\\
Train Confusion Matrix:
\bigbreak
\noindent
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $187596 $ & $38536$ & $$\\
\midrule
L = 1 & $1955 $ &$17921$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
Test Confusion Matrix:
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $44608 $ & $11946$ & $$\\
\midrule
L = 1 & $2921 $ &$2028$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
\begin{tabular}{lllll}
& PRECISION & RECALL & F1\_SCORE & AUC \\
Train & 0.317 & 0.901 & 0.469 & 0.941 \\
Test & 0.145 & 0.409 & 0.214 & 0.598
\end{tabular}
\textbf{Observations:}
\begin{itemize}
\item Oversampling: we observed that introducing more noise, i.e. replicating the minority class further, only leads to overfitting, and the decision tree performs poorly. This is possibly why the best result is found at such a low sampling strategy (0.3).
\item Interestingly, the best f1_score obtained at 0.3 with oversampling is even worse than that of the BASE model. This is because replication makes the model train on the same minority samples multiple times, so it eventually fits the samples themselves rather than the class characteristics.
\item One of the better results is with undersampling, and the worst is with SMOTE. Similar to RF, the tree is unable to learn from and benefit from the new synthetic samples, whereas it does benefit from the undersampling strategy: with the balance set at 0.6, a recall of 0.46 and an f1_score of 0.19 are achieved. The f1_scores with undersampling and oversampling are clearly better than with SMOTE (roughly twice as high in the undersampling case).
\item The decision tree works best with the cost function based approach, with an f1_score above 0.2 and recall above 0.4.
\end{itemize}
\bigbreak
\noindent
\textbf{ Random Forest:}\\
\newline
\noindent
\textbf{1)Oversampling:}\\
\newline
\noindent
F1_score at different levels of noise (noise-f1_score) :\\
(0.15, 0.039771), (0.45, 0.063275), (0.75, 0.067977), (0.9, 0.069861)\\
\noindent
Best Sampling strategy-- 0.9\\
\newline
Y train after resampling Counter({0: 226132, 1: 203518})\\
Improved number of features- 85\\
Best parameter on grid-- {'n_estimators': 17, 'max_features': 'auto', 'max_depth': 90, 'bootstrap': True}\\
Random Forest model:\\
\bigbreak
\noindent
Train Confusion Matrix:
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $226131$ & $1$ & $$\\
\midrule
L = 1 & $0$ &$203518$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
Test Confusion Matrix:
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $56226$ & $328$ & $$\\
\midrule
L = 1 & $4758$ &$191$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
\begin{tabular}{lllll}
& PRECISION & RECALL & F1\_SCORE & AUC \\
Train & 0.999 & 1.00 & 0.999 & 1.00 \\
Test & 0.368 & 0.0385 & 0.069 & 0.682
\end{tabular}
\bigbreak
\noindent
\textbf{SMOTE: }
F1_score at different levels of noise (noise-f1_score) :\\
(0.15, 0.201405), (0.45, 0.021696), (0.75, 0.017765), (0.9, 0.015032)\\
\bigbreak
\noindent
Y train before resampling Counter({0: 226132, 1: 19876})\\
Sampling strategy-- 0.15\\
\bigbreak
\noindent
Y train after resampling Counter({0: 226132, 1: 33919})\\
Improved number of features- 105\\
Best parameter on grid-- {'n_estimators': 13, 'max_features': 'auto', 'max_depth': 90, 'bootstrap': True}\\
\bigbreak
\noindent
Random Forest model:
\bigbreak
\noindent
Train Confusion Matrix:\\
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $226129$ & $3$ & $$\\
\midrule
L = 1 & $1513 $ &$32406 $ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
Test Confusion Matrix:
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $56395 $ & $159$ & $$\\
\midrule
L = 1 & $4859 $ &$90$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\begin{tabular}{lllll}
& PRECISION & RECALL & F1\_SCORE & AUC \\
Train & 0.999 & 0.955 & 0.977 & 0.999 \\
Test & 0.361 & 0.018 & 0.034 & 0.651
\end{tabular}
\bigbreak
\noindent
\pagebreak
\noindent
\textbf{UNDERSAMPLING:}
\newline
\newline
\noindent
F1_score at different levels of noise (noise-f1_score) :\\
(0.15, 0.034629), (0.6, 0.255757), (0.75, 0.246021), (0.9, 0.239339)\\
\bigbreak
\noindent
Sampling strategy-- 0.6\\
Y train after resampling Counter({0: 33126, 1: 19876})\\
Improved number of features- 96\\
Best parameter on grid-- {'n_estimators': 17, 'max_features': 'auto', 'max_depth': 90, 'bootstrap': True}\\
\bigbreak
\noindent
Random Forest model:\\
\bigbreak
\noindent
Train Confusion Matrix:\\
\bigbreak
\noindent
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $33099 $ & $27$ & $$\\
\midrule
L = 1 & $148 $ &$19728 $ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
Test Confusion Matrix:
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $46656 $ & $9898$ & $$\\
\midrule
L = 1 & $2772 $ &$2177$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
\begin{tabular}{lllll}
& PRECISION & RECALL & F1\_SCORE & AUC \\
Train & 0.998 & 0.992 & 0.995 & 0.999 \\
Test & 0.180 & 0.439 & 0.255 & 0.700
\end{tabular}
\bigbreak
\noindent
Sampling strategy-- 0.9\\
Y train after resampling Counter({0: 22084, 1: 19876})\\
Improved number of features- 95\\
Best parameter on grid-- {'n_estimators': 17, 'max_features': 'auto', 'max_depth': 90, 'bootstrap': True}\\
\bigbreak
\noindent
\noindent
Random Forest model:\\
\bigbreak
\noindent
Train Confusion Matrix:\\
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $22050$ & $34$ & $$\\
\midrule
L = 1 & $68 $ &$19808$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
\noindent
Test Confusion Matrix:
\bigbreak
\noindent
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $39331$ & $17223$ & $$\\
\midrule
L = 1 & $1935$ &$3014$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
\begin{tabular}{lllll}
& PRECISION & RECALL & F1\_SCORE & AUC \\
Train & 0.998 & 0.996 & 0.997 & 0.999 \\
Test & 0.148 & 0.609 & 0.239 & 0.705
\end{tabular}
\bigbreak
\noindent
\textbf{COST FUNCTION BASED APPROACH:}\\
\bigbreak
\noindent
Class weights-- {0: 0.6, 1: 0.4}\\
Improved number of features- 95\\
Best parameter on grid-- {'n_estimators': 5, 'max_features': 'auto', 'max_depth': 90, 'bootstrap': True}\\
\bigbreak
\noindent
Random Forest model:\\
\bigbreak
\noindent
Train Confusion Matrix:\\
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $225899 $ & $233$ & $$\\
\midrule
L = 1 & $3616 $ &$16260$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
Test Confusion Matrix:
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $55684 $ & $870$ & $$\\
\midrule
L = 1 & $4652 $ &$297$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\begin{tabular}{lllll}
& PRECISION & RECALL & F1\_SCORE & AUC \\
Train & 0.958 & 0.818 & 0.894 & 0.993 \\
Test & 0.254 & 0.060 & 0.097 & 0.607
\end{tabular}
\bigbreak
\noindent
\textbf{Observation:}\\
\begin{itemize}
\item Oversampling:\\
The performance of Random Forest improves as more samples from the minority class are added. It has much less tendency to overfit than the decision tree, thanks to the bootstrapping in random forest, which resamples the data at random with replacement.\\
\item We also observe that the best number of trees (estimators) is very low. This is possibly because, with the data being imbalanced, the forest tends to behave like a single decision tree, i.e. there are not enough minority class samples in the individual trees of the random forest.\\
\item The highest f1_score obtained with oversampling is still lower than the f1_score without oversampling.\\
\item Smote:\\
The performance with SMOTE is the lowest; the random forest is unable to learn from the newly added synthetic samples since they differ from the actual dataset.\\
\item Undersampling:\\
RF performs best with undersampling, with the f1_score peaking at about 0.26 at a 0.6 sampling strategy and recall reaching 0.6 at a 0.9 sampling strategy. This shows how sensitive RF is to imbalanced data: with more balanced data its f1_score is several times higher than before resampling.\\
\item The cost function based approach is not very helpful for RF.
\end{itemize}
\textbf{ Logistic Regression:}\\
\noindent
\textbf{1)OVER SAMPLING:}\\
\noindent
F1_score at different levels of noise (noise-f1_score):\\
(0.15, 0.101225),( 0.45, 0.291804),(0.6, 0.293157),
(0.75, 0.283418),(0.9, 0.266181) \\
Recall score at different levels of noise (noise-recall):\\
(0.15, 0.057587 ),( 0.45, 0.358254),(0.6, 0.478683),\\
(0.75,0.577288),(0.9,0.642756)\\
\noindent
Sampling strategy-- 0.9\\
Y train after resampling Counter({0: 226132, 1: 203518})\\
Improved number of features- 86\\
Best parameter on grid-- {'penalty': 'l1', 'C': 0.5}\\
\newline
\noindent
Logistic Regression model:
\bigbreak
\noindent
Train Confusion Matrix:
\bigbreak
\noindent
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $163385 $ & $62747$ & $$\\
\midrule
L = 1 & $72562 $ &$130956$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
Test Confusion Matrix:
\bigbreak
\noindent
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $40783 $ & $15771$ & $$\\
\midrule
L = 1 & $1768 $ &$3181$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\begin{tabular}{lllll}
& PRECISION & RECALL & F1\_SCORE & AUC \\
Train & 0.676 & 0.643 & 0.659 & 0.748 \\
Test & 0.167 & 0.642 & 0.266 & 0.748
\end{tabular}
\noindent
Sampling strategy-- 0.6\\
\noindent
Y train after resampling Counter({0: 226132, 1: 135679})\\
Improved number of features- 87\\
Best parameter on grid-- {'penalty': 'l1', 'C': 0.2}\\
\bigbreak
\noindent
Logistic Regression model:\\
\bigbreak
\noindent
Train Confusion Matrix:
\bigbreak
\noindent
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $190761 $ & $35371$ & $$\\
\midrule
L = 1 & $71021 $ &$64658$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
Test Confusion Matrix:
\bigbreak
\noindent
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $47710 $ & $8844$ & $$\\
\midrule
L = 1 & $2580 $ &$2369$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
\begin{tabular}{lllll}
& PRECISION & RECALL & F1\_SCORE & AUC \\
Train & 0.646 & 0.476 & 0.548 & 0.748 \\
Test & 0.211 & 0.478 & 0.293 & 0.748
\end{tabular}
\bigbreak
\noindent
\textbf{2) SMOTE:}
\bigbreak
\noindent
F1_score at different levels of noise (noise-f1_score):\\
(0.15, 0.115375),( 0.45, 0.295989),(0.6, 0.293079),\\
(0.75, 0.283418),(0.9, 0.279562) \\
\noindent
Sampling strategy-- 0.45\\
Y train after resampling Counter({0: 226132, 1: 101759})\\
Improved number of features- 112\\
Best parameter on grid-- {'penalty': 'l1', 'C': 0.8}\\
\bigbreak
\noindent
Logistic Regression model:\\
\bigbreak
\noindent
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $201569 $ & $24563$ & $$\\
\midrule
L = 1 & $64445 $ &$37314$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
Test Confusion Matrix:
\bigbreak
\noindent
\begin{tabular}{c|c|c||c}
$$ & $\hat{\mathrm{L}} = 0$ & $\hat{\mathrm{L}} = 1$ \\
\toprule
L = 0 & $50415 $ & $6139$ & $$\\
\midrule
L = 1 & $3023 $ &$1926$ & $$\\ \hline \hline
& $$ & $$ &\end{tabular}
\bigbreak
\noindent
\begin{tabular}{lllll}
& PRECISION & RECALL & F1\_SCORE & AUC \\
Train & 0.603 & 0.366 & 0.456 & 0.764 \\
Test & 0.238 & 0.398 & 0.295 & 0.747
\end{tabular}
\bigbreak
\noindent
\textbf{UNDERSAMPLING:}
\bigbreak
\noindent
F1_score at different levels of noise (noise- f1_score) :\\
(0.15- 0.039771), (0.45, 0.063275), (0.75, 0.067977), ( 0.9, 0.069861)\\
\noindent