---
title: "Analyzing Indonesian rice farms"
author: "Jonas Kernebeck, Alexander Flick, Felix Lehner"
date: "01/11/2022"
output:
  pdf_document:
    toc: yes
    toc_depth: '2'
  html_document:
    toc: yes
    toc_depth: 2
  word_document:
    toc: yes
    toc_depth: '2'
---
<!--
packages
install.packages("plm")
install.packages("splm")
install.packages("GGally")
install.packages("heatmaply")
install.packages("tidyverse")
install.packages("corrr")
install.packages("devtools")
devtools::install_github("hadley/productplots")
-->
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo=FALSE, warning=FALSE, message=FALSE)
```
\newpage
```{r imported libraries}
library(plm)
library(GGally)
library(heatmaply)
library(tidyverse)
library(tidyr)
library(ggmosaic)
library(gridExtra)
library(gam)
library(patchwork)
library(ggcorrplot)
library(kableExtra)
library(formattable)
library(ggplot2)
library(productplots)
library(glmnet)
library(leaps)
```
```{r constants}
data("RiceFarms", package = "plm")
col_to_remove = !names(RiceFarms) %in% c("id", "noutput")
RiceFarms = RiceFarms[,col_to_remove]
```
```{r data}
# some code here
```
# 1 Introduction
The present data set contains production data for 171 Indonesian rice farms and consists of 1026 observations in total. The data frame contains the following variables:
| variable | description | expressions |
|:----------:|:-----------------------------------------------:|:-------------------------:|
| id | unique identifier for a farm | unique id |
| size | total production area in hectares | 0.01 - 5.322 |
| status | status of property rights | "owner", "share", "mixed" |
| varieties | rice seed varieties | "trad", "high", "mixed" |
| bimas | bimas-status of the farmers | "no", "yes", "mixed" |
| seed | seed in kilogram | 1 - 1250 kg |
| urea | urea in kilogram | 1 - 1250 kg |
| phosphate | phosphate in kilogram | 0 - 700 kg |
| pesticide | pesticide cost in Rupiah | 0 - 62600 r |
| pseed | price of seed in Rupiah per kg | 40 - 375 r/kg |
| purea | price of urea in Rupiah per kg | 50 - 100 r/kg |
| pphosph | price of phosphate in Rupiah per kg | 60 - 120 r/kg |
| hiredlabor | hired labor in hours | 1 - 4536 h |
| famlabor | family labor in hours | 1 - 1526 h |
| totlabor | total labor (excluding harvest labor) | 1 - 4774 h |
| wage | labor wage in Rupiah per hour | 30 - 175.35 r/h |
| goutput | gross output of rice in kg | 42 - 20960 kg |
| noutput | gross output minus harvesting cost | 42 - 17610 kg |
| price | price of rough rice in Rupiah per kg | 50 - 190 r/kg |
| region | region of the farm | unique region |
As shown in the table, the data set consists of 16 numeric variables and 4 categorical variables. The target variable for the regression modeling will be *goutput*, which represents the gross output of rice in *kg* for the respective rice farm.
In the following, some explorative data analysis is carried out to get a first impression of the distribution of the individual variables.
## 1.1 Numerical Variables
The following figure shows boxplots for the materials used and the prices paid for them by the respective rice farms. The boxplots for the materials show that the distribution of all materials is right-skewed. The spread of seed is the lowest, followed by phosphate and urea. Accordingly, *urea* also has the highest variance with 16166, followed by *phosphate* with 2264 and *seed* with 2048. The distribution of *urea* indicates that rice farms in Indonesia may use urea very differently, caused e.g. by the bimas status. The bimas program is a rice intensification program by the government to support local rice production by providing high-yield rice seeds as well as technical assistance.
If we look at the prices for phosphate (*pphosph*) and urea (*purea*), we can see slightly left-skewed distributions with low variance (75 for *purea* and 86 for *pphosph*). In contrast, the prices for seeds scatter much more. The distribution of *pseed* is strongly right-skewed, as is the distribution of the rice price *price*. The rice price also scatters, but less than *pseed*. The two prices have a correlation of 0.67. Naturally, the price of seeds affects the selling price of rice; the prices may fluctuate due to seasonal or regional factors and influence each other.
The distribution of labor hours is also slightly skewed to the right. Overall, the dispersion is lowest for *famlabor*. *hiredlabor* and *totlabor* have a similar spread, but *totlabor* is at a higher level overall, since *hiredlabor* is a subset of *totlabor*.
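The reported correlation of 0.67 between *pseed* and *price* can be checked with a one-liner such as the following (assuming a plain Pearson correlation on the raw values):
```{r eval=FALSE}
# Pearson correlation between seed price and rice price
cor(RiceFarms$pseed, RiceFarms$price)
```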
\newpage
```{r, fig.width=10, fig.height=9}
df_boxplot_materials = data.frame(
feature=c(rep("seed", 1026), rep("urea", 1026),rep("phosphate", 1026)),
material=c(RiceFarms$seed, RiceFarms$urea, RiceFarms$phosphate))
bpl_materials = ggplot(df_boxplot_materials, aes(x=feature, y=material, fill=feature)) +
geom_boxplot() +
ylim(0, 600) +
ggtitle('Distribution of materials') +
theme(plot.title = element_text(hjust = 0.5))
df_boxplot_price_materials = data.frame(
feature=c(rep("price seed", 1026), rep("price urea", 1026),rep("price phosphate", 1026), rep("price rice", 1026)),
rupiah_per_kg=c(RiceFarms$pseed, RiceFarms$purea, RiceFarms$pphosph, RiceFarms$price))
bpl_price_materials = ggplot(df_boxplot_price_materials, aes(x=feature, y=rupiah_per_kg, fill=feature)) +
geom_boxplot() +
ggtitle('Distribution of material prices') +
theme(plot.title = element_text(hjust = 0.5))
# + ylim(0, 400)
df_boxplot_labor = data.frame(
labor=c(rep("hired labor", 1026), rep("family labor", 1026),rep("total labor", 1026)),
hours=c(RiceFarms$hiredlabor, RiceFarms$famlabor, RiceFarms$totlabor))
bpl_labor = ggplot(df_boxplot_labor, aes(x=labor, y=hours, fill=labor)) +
geom_boxplot() +
ggtitle('Distribution of labor hours') +
theme(plot.title = element_text(hjust = 0.5)) +
ylim(0,500)
grid.arrange(bpl_materials, bpl_price_materials, bpl_labor, ncol=1, nrow =3)
```
\newpage
## 1.2 Categorical Variables
The following mosaic plot shows the distribution of the categorical variables *varieties*, *region* and *bimas*. Overall, all regions are roughly equally represented in the data set. We can see that most of the farmers with the bimas status *yes* or *mixed* are located in the region *ciwangi*. The distribution of the different varieties is strongly dependent on the region. While the *high* varieties have the biggest share in the regions *wargabinangun* and *langan*, the *traditional* varieties dominate the regions *gunungwangi*, *malausma* and *ciwangi*. The *mixed* varieties are only used to a small extent in all regions.
\
\
\
```{r, fig.width=7, fig.height=5}
df_cat = RiceFarms[,c("region", "varieties", "bimas")]
test = ggplot(df_cat) +
ggtitle("Mosaic Plot for varieties, region and bimas") +
theme(plot.title = element_text(hjust = 0.5)) +
geom_mosaic(aes(x = product(varieties, region), fill=bimas, offset = 0.2))
breaks = ggplot_build(test)$layout$panel_params[[1]]$x$get_breaks()
labels=c("wargabinangun","", "","","langan", "","gunungwangi","", "",
"malausma","", "","sukaambit","", "",
"ciwangi","","")
ggplot(df_cat) +
ggtitle("Mosaic Plot for varieties, region and bimas") +
theme(plot.title = element_text(hjust = 0.5)) +
geom_mosaic(aes(x = product(varieties, region), fill=bimas, offset = 0.2)) +
scale_x_productlist(breaks=breaks, labels=labels) +
xlab("region")
```
\
\
\
To test whether the categorical variables have an impact on our target variable *goutput*, one-way ANOVAs as well as ANOVAs including interaction terms are performed. The results are summarized in the following table:
\
\
| formula | F-value | p-value | significant |
|:----------------------:|:-------:|:--------:|:-----------:|
| region | 22.981 | < 2e-16 | yes |
| varieties | 11.764 | 8.94e-06 | yes |
| bimas | 14.817 | 4.57e-07 | yes |
| region+varieties | 3.847 | 3.96e-05 | yes |
| region+bimas | 5.651 | 2.94e-08 | yes |
| varieties+bimas | 0.791 | 0.531 | no |
| region+varieties+bimas | 0.860 | 0.580 | no |
The ANOVA outputs show that all of the categorical variables have a significant main effect on *goutput*; the null hypothesis that the mean of *goutput* is the same across the groups is rejected. The interaction terms of *region* with *varieties* and of *region* with *bimas* are also significant, whereas the interaction of *varieties* and *bimas* and the three-way interaction of all three variables are not.
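A minimal sketch of how such ANOVAs could be run with base R (the exact formulas shown here are an assumption, not necessarily the original analysis):
```{r eval=FALSE}
# One-way ANOVA for a single factor ...
summary(aov(goutput ~ region, data = RiceFarms))
# ... and a two-way ANOVA including the interaction term
summary(aov(goutput ~ region * varieties, data = RiceFarms))
```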
## 1.3 Variable selection and transformation
The performance of the regression modeling is highly dependent on the variable selection and transformation, so a suitable choice is very important.
The variable *noutput* is a linear transformation of *goutput*, as it represents *goutput* reduced by the harvesting costs. It is therefore not used for the modeling, because it would introduce severe multicollinearity. \
The variable *size* also correlates *strongly* with the target variable. This can be intuitively explained by the fact that a larger rice field naturally produces a higher yield. Since the variables *seed*, *urea*, *phosphate* and *pesticide* are dependent on size, they are transformed into per-hectare quantities by dividing them by the respective farm size in hectares. \
The variables *famlabor* and *hiredlabor* are subsets of the variable *totlabor* and are therefore transformed into shares of *totlabor* by dividing them by *totlabor*. The variable *totlabor* is then itself transformed into a per-hectare quantity by dividing it by *size*.
The variable *wage* follows a bimodal distribution. It is therefore transformed into a binary variable that indicates whether the wage is above or below 100 Rupiah per hour.
## 1.4 Model evaluation
The data set is split into 60-20-20 parts, where 60% of the data is used for training the model and 20% each for validation and testing. Cross-validation is also used in the modeling part. Different metrics are used to evaluate and compare the models: the *MSE* (mean squared error) and the *AIC* (Akaike information criterion). Besides these numeric metrics, graphical diagnostics such as residual plots are used for evaluation.
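For reference, the MSE on a data split with $n$ observations is computed on the original *goutput* scale as
$$
\mathrm{MSE}=\frac{1}{n} \sum_{i=1}^{n}\left(y_{i}-\hat{y}_{i}\right)^{2}
$$
where $y_{i}$ is the observed gross output and $\hat{y}_{i}$ the model prediction (back-transformed with the exponential function when a model is fit on the log scale).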
```{r}
#reorder levels
factor_bimas <- c("no", "mixed", "yes")
RiceFarms$bimas <- factor(RiceFarms$bimas, levels = factor_bimas)
factor_varieties <- c("trad", "mixed", "high")
RiceFarms$varieties <- factor(RiceFarms$varieties, levels = factor_varieties)
```
Three outliers, most likely due to typos, have been identified in observations 110, 947 and 1004 and were manually corrected.
```{r echo=FALSE}
#outliers
#RiceFarms[RiceFarms$seed>1000,]
#RiceFarms[RiceFarms$id==102220,]
RiceFarms$seed[110] <- 125 #instead of 1250
#RiceFarms[RiceFarms$pesticide>60000,]
#RiceFarms[RiceFarms$id==607168,]
RiceFarms$pesticide[947] <- 6260 #instead of 62600
#outlier?
#RiceFarms[RiceFarms$totlab_size>14000,]
#RiceFarms[RiceFarms$id==609241,]
RiceFarms$size[1004] <- 0.1 #instead of 0.01
```
```{r}
#divide by size
RiceFarms$seed_size <- RiceFarms$seed/RiceFarms$size
RiceFarms$phosph_size <- RiceFarms$phosphate/RiceFarms$size
RiceFarms$urea_size <- RiceFarms$urea/RiceFarms$size
RiceFarms$totlab_size <- RiceFarms$totlabor/RiceFarms$size
RiceFarms$pest_size <- RiceFarms$pesticide/RiceFarms$size
```
```{r}
RiceFarms$fam_ratio <- RiceFarms$famlabor/RiceFarms$totlabor
```
```{r}
#wage
#plot(wage,goutput)
RiceFarms$wage_cat <- ifelse(RiceFarms$wage > 100, ">100", "<=100") # label wages above vs. at most 100 r/h
RiceFarms$wage_cat <- factor(RiceFarms$wage_cat)
```
```{r}
#Split the data into a 60-20-20 split for training, validation and testing.
set.seed(20211207)
n <- nrow(RiceFarms)
train <- sample(1:n,0.8*n)
test <- setdiff(1:n,train)
set.seed(20211207)
val <- sample(train,length(test))
train <- setdiff(train,val)
```
```{r}
MSE <- function(model, data, split=train){
  # MSE on the original goutput scale; assumes the model predicts log(goutput)
  pred.split <- predict(model, newdata = data[split,])
  return(mean((exp(pred.split)-RiceFarms$goutput[split])^2))
}
```
\newpage
# 2 First Model
The first model to present and evaluate is called Lasso Regression.
## 2.1 Lasso Regression
As with most regression approaches, it minimizes the residual sum of squares (RSS), but in addition a penalty term is included in the objective which shrinks some of the coefficient estimates towards, and possibly exactly to, zero.
$$
R S S+\lambda \sum_{j=1}^{p}\left|\widehat{\beta}_{j}\right|
$$
where $\lambda$ is a hyperparameter, $p$ is the number of predictors and $\widehat{\beta}_{j}$ are the coefficient estimates. The sum starts at $j=1$ because the intercept $\beta_{0}$ is not a predictor coefficient and is therefore not penalized. The penalty sums the absolute values of the coefficient estimates. As can be seen from the equation, the lasso regression reduces to ordinary least squares regression if $\lambda$ is zero. \newline
The goal of the lasso regression is to create a sparse model, which makes it easier to interpret.
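As a small illustration of this sparsity (a sketch with arbitrarily chosen predictors and penalty values, separate from the model selection carried out later), the number of non-zero coefficients can be compared for a small and a large $\lambda$:
```{r eval=FALSE}
library(glmnet)
# Sketch: the same design matrix fitted with a small and a large penalty;
# the larger lambda forces more coefficients to be exactly zero
x_demo <- model.matrix(goutput ~ size + seed + urea + phosphate + totlabor, RiceFarms)[, -1]
y_demo <- log(RiceFarms$goutput)
fit_small <- glmnet(x_demo, y_demo, alpha = 1, lambda = 0.001)
fit_large <- glmnet(x_demo, y_demo, alpha = 1, lambda = 0.5)
sum(as.vector(coef(fit_small)) != 0)  # most coefficients remain non-zero
sum(as.vector(coef(fit_large)) != 0)  # far fewer coefficients survive the larger penalty
```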
## 2.2 Feature Selection
Before starting with the actual regression, we can investigate which of the features could be important for predicting *goutput*. This step is helpful, as the number of features in the data set is over twenty. \newline
The method used for selecting the variables is forward selection. The leaps package provides the function regsubsets for linear models for this purpose. It takes the target variable and the features of the data set as input and indicates which variables are likely to predict *goutput* best. The underlying algorithm optimizes Mallows' $C_p$ statistic, which is related to the AIC (James, p. 79). \newline
The function offers further input parameters, e.g. the maximum subset size, a weight vector, the number of best subsets to report and the method of variable selection (forward selection, backward selection, etc.); an illustrative call is sketched below. \newline
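A minimal sketch of such a call (the arguments shown, e.g. `nvmax`, `nbest` and `method`, are illustrative choices rather than the settings used below):
```{r eval=FALSE}
library(leaps)
# Illustrative call: best subsets of up to 10 variables via forward selection,
# keeping only the single best subset of each size (nbest = 1)
fit <- regsubsets(goutput ~ ., data = RiceFarms[train, ],
                  nvmax = 10, nbest = 1, method = "forward")
summary(fit)
```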
The following graphics show the results of the analysis.
```{r}
feature_selection<-function(data){
regfit.full<-regsubsets(goutput~.,data,nvmax=15)
reg.sum<<-summary(regfit.full)
dat<-data.frame(rss=reg.sum$rss, adjr2=reg.sum$adjr2,cp=reg.sum$cp,bic=reg.sum$bic)
rss_plot<- ggplot(dat)+
aes(x=seq(1:length(rss)),y=rss)+
geom_line()+
xlab('Number of Variables')+
ylab('RSS')
adj_r2_max = which.max(reg.sum$adjr2) # 11
adj_r_plot<-ggplot(dat)+
aes(x=seq(1:length(adjr2)),y=adjr2)+
geom_line()+
geom_point(aes(x=adj_r2_max,y=adjr2[adj_r2_max]))+
xlab('Number of Variables')+
ylab('Adjusted RSq')
cp_min = which.min(dat$cp)
cp_plot<-ggplot(dat)+
aes(x=seq(1:length(cp)),y=cp)+
geom_line()+
geom_point(aes(x=cp_min,y=cp[cp_min]))+
xlab('Number of Variables')+
ylab('Cp')
bic_min = which.min(reg.sum$bic)
bic_plot<-ggplot(dat)+
aes(x=seq(1:length(bic)),y=bic)+
geom_line()+
geom_point(aes(x=bic_min,y=bic[bic_min]))+
xlab('Number of Variables')+
ylab('BIC')
patch<-(rss_plot + adj_r_plot)/(cp_plot + bic_plot)
patch + plot_annotation(title='Feature Selection')
}
####3.
feature_selection(RiceFarms[train,])
```
**Explanation:**
The graphic shows the RSS, the adjusted $R^2$, the $C_p$ statistic and the BIC of the models. \newline
The metrics help to identify the overall best models for the problem. Each of them suggests that a 3-variable model could already be sufficient, as the metrics only improve slightly when more variables are added. Only for the BIC can it be seen that the metric gets worse beyond 7 variables.\newline
Investigating the output of regsubsets shows which variables are selected to give the best results.
```{r}
#reg.sum
```
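Since the summary object is stored globally as `reg.sum` inside `feature_selection()`, the variables of, for example, the best 3-variable subset can be listed with a short snippet like the following (a sketch; `which` is the logical inclusion matrix of the `summary.regsubsets` object):
```{r eval=FALSE}
# Names of the (dummy-encoded) variables included in the best 3-variable model
names(which(reg.sum$which[3, ]))
```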
The best 3-variable model selects size, phosphate and totlabor as the best performing variables. These variables are also contained in every larger model, so they are kept in mind when searching for an optimal model for the lasso regression. Some other variables such as pesticide, varieties, wage and region are also likely to have an influence on the model. \newline
## 2.3 Preprocessing
Since a specific package, glmnet, is used for the lasso regression, the data needs to be preprocessed. One important step is to transform the qualitative variables, i.e. factor variables, into dummy variables so the model can use them. The method is relatively simple: extra features with binary values are created. model.matrix does this transformation automatically by creating a design matrix out of the data frame. It requires a formula specifying which variables to include, which also offers the opportunity to select the desired variables and to apply mathematical transformations such as logarithms or polynomial terms before the model matrix is built.\newline
In addition, a scaling option is implemented to test whether the model improves when the data is scaled.
```{r}
prepare_x<-function(expr,scaler=FALSE,center=FALSE){
if(scaler){
x<-scale(model.matrix(expr,RiceFarms)[,-1],center=center) #creates a matrix and transforms qualitative variables to dummy variables
}
else{
x<-model.matrix(expr,RiceFarms)[,-1] #creates a matrix and transforms qualitative variables to dummy variables
}
return(x)
}
x<-prepare_x(goutput~.)
y<-log(RiceFarms$goutput)
```
## 2.4 Training and Evaluation
Once the input features are transformed properly, the lasso regression can be trained. After training, the function returns a whole sequence of models, one for each value of the hyperparameter $\lambda$. This allows us to fine-tune the model and improve its performance. To do this, a simple 10-fold cross-validation on the validation data is applied, which is also included in the glmnet package. Before starting the validation, a grid of possible lambdas is prepared so that the range of the $\lambda$ parameter can easily be changed. Afterwards, a plot can be obtained showing the mean cross-validated error depending on $\lambda$. The dotted lines in this plot mark the $\lambda$ that minimizes the mean cross-validated error and the $\lambda$ that gives the most regularized model whose cross-validated error is still within one standard error of the minimum. In this task the $\lambda$ with the minimum mean cross-validated error is chosen. \newline
To evaluate the selected model on the validation data with the chosen $\lambda$, the metrics MSE, BIC, AIC, AICc (a modification of the AIC that corrects for small sample sizes) and $R^2$ are used. \newline
Since the glmnet package does not provide ready-made functions for these metrics, BIC, AIC, AICc and $R^2$ are implemented manually. The formulas used are:
$$
\mathrm{BIC}=-\chi^{2}+k \ln (n)
$$
with
$$
\chi^{2}=\text { Null deviance }-\text { Residual deviance }
$$
and k as the number of parameters estimated by the model and n the number of observations.
$$
\mathrm{AIC}=-\chi^{2}+ 2k
$$
$$
\mathrm{AICc}= AIC+\frac{2k(k+1)}{n-k-1}
$$
and
$$
\begin{aligned}
R^{2} &=1-\frac{\text { sum of squared errors (SSE) }}{\text { total sum of squares (SST) }} \\
&=1-\frac{\sum\left(y_{i}-\hat{y}_{i}\right)^{2}}{\sum\left(y_{i}-\bar{y}\right)^{2}}
\end{aligned}
$$
where $y$ are the actual values, $\hat{y}$ the predicted values and $\bar{y}$ the mean value of the $y$.
```{r}
BICAIC<-function(fit){
tLL <- fit$nulldev - deviance(fit)
k <- fit$df
n <- fit$nobs
AICc <- -tLL+2*k+2*k*(k+1)/(n-k-1)
AIC_ <- -tLL+2*k
BIC<-log(n)*k - tLL
return(list('AIC'=AIC_,'BIC'=BIC,'AICc'=AICc))
}
```
```{r}
prepare_grid<-function(grid_param=c(0,-4,100)){
grid <- 10^ seq(grid_param[1],grid_param[2], length =grid_param[3])
return(list('grid'=grid))
}
train_lasso_with_feature<-function(x,y,grid){
#Fit train and predict on test set to get the MSE
lasso.mod <-glmnet(x[train,],y[train],alpha=1, lambda =grid , thresh =1e-12)
lasso.pred<-predict(lasso.mod ,s=1, newx=x[val,])#s is the lambda because we have a grid of lasso.models
#cat('----MSE----:',mean((lasso.pred -y[val])^2))#MSE
return(lasso.mod)
}
train_lasso_cv<-function(lasso.mod,x,y,grid){
##Use Cross-Validation to find the best lambda
set.seed(1)
cv.out <-cv.glmnet(x[val,],y[val],lambda=grid,alpha =1) #does 10-fold-CV as default
#plot(cv.out)
bestlam <-cv.out$lambda.min
best.model<-glmnet(x[train,],y[train],alpha=1, lambda =bestlam , thresh =1e-12)
res<-BICAIC(best.model)
#Now what is the test MSE with this best lambda
lasso.pred.train<-predict(lasso.mod ,s=bestlam ,newx=x[train,])
lasso.pred<-predict(lasso.mod ,s=bestlam ,newx=x[val,])
mse.train<-mean((lasso.pred.train -y[train])^2)
mse.val<-mean((lasso.pred -y[val])^2)
rss<-(lasso.pred-y[val])^2
#plot(rss)
sst <- sum((y[val] - mean(y[val]))^2)
sse <- sum((lasso.pred - y[val])^2)
#find R-Squared
rsq <- 1 - sse/sst
return(list('bestlam'=bestlam,'BIC'=res,'RSS'=rss,'RSq'=rsq,'FITTED'=lasso.pred,'df'=best.model$df,'mse.val'=mse.val,'mse.train'=mse.train))
}
```
## 2.5 Model Selection
After setting up the environment and the criteria for training and evaluating models, the best model can be selected from the possible feature subsets. As indicated at the beginning of the feature selection section, the variables size, totlabor and phosphate are used as a starting point. This is done sequentially in order to obtain the metrics of each model, and the preselection is then continued with the other variables mentioned. Additional mathematical transformations of the variables are also considered and included. In order to provide as much variation as possible, the variables are also compared with their size-scaled counterparts. The results of the evaluation are stored in a table and assessed.
```{r}
prep<-prepare_grid()
x<-prepare_x(goutput~log(size)+log(seed))
lasso.model.1<-train_lasso_with_feature(x,y,prep$grid)
fitted.1<-train_lasso_cv(lasso.model.1,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+seed)
lasso.model.2<-train_lasso_with_feature(x,y,prep$grid)
fitted.2<-train_lasso_cv(lasso.model.2,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+seed_size)
lasso.model.3<-train_lasso_with_feature(x,y,prep$grid)
fitted.3<-train_lasso_cv(lasso.model.3,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size))
lasso.model.4<-train_lasso_with_feature(x,y,prep$grid)
fitted.4<-train_lasso_cv(lasso.model.4,x,y,prep$grid)
####log(seed_size) better
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+log(totlabor))
lasso.model.5<-train_lasso_with_feature(x,y,prep$grid)
fitted.5<-train_lasso_cv(lasso.model.5,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlabor)
lasso.model.6<-train_lasso_with_feature(x,y,prep$grid)
fitted.6<-train_lasso_cv(lasso.model.6,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+log(totlab_size))
lasso.model.7<-train_lasso_with_feature(x,y,prep$grid)
fitted.7<-train_lasso_cv(lasso.model.7,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size)
lasso.model.8<-train_lasso_with_feature(x,y,prep$grid)
fitted.8<-train_lasso_cv(lasso.model.8,x,y,prep$grid)
###normal totlab_size better
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+urea)
lasso.model.9<-train_lasso_with_feature(x,y,prep$grid)
fitted.9<-train_lasso_cv(lasso.model.9,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea))
lasso.model.10<-train_lasso_with_feature(x,y,prep$grid)
fitted.10<-train_lasso_cv(lasso.model.10,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+urea_size)
lasso.model.11<-train_lasso_with_feature(x,y,prep$grid)
fitted.11<-train_lasso_cv(lasso.model.11,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea_size))
lasso.model.12<-train_lasso_with_feature(x,y,prep$grid)
fitted.12<-train_lasso_cv(lasso.model.12,x,y,prep$grid)
####log(urea) better
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat)
lasso.model.13<-train_lasso_with_feature(x,y,prep$grid)
fitted.13<-train_lasso_cv(lasso.model.13,x,y,prep$grid)
####better than before
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+phosphate)
lasso.model.14<-train_lasso_with_feature(x,y,prep$grid)
fitted.14<-train_lasso_cv(lasso.model.14,x,y,prep$grid)
####worse than before
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed)+log(totlabor)+log(urea)+log(phosphate+1))
lasso.model.15<-train_lasso_with_feature(x,y,prep$grid)
fitted.15<-train_lasso_cv(lasso.model.15,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed)+log(totlabor)+log(urea)+phosph_size)
lasso.model.16<-train_lasso_with_feature(x,y,prep$grid)
fitted.16<-train_lasso_cv(lasso.model.16,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed)+log(totlabor)+log(urea)+log(phosph_size+1))
lasso.model.17<-train_lasso_with_feature(x,y,prep$grid)
fitted.17<-train_lasso_cv(lasso.model.17,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat+region)
lasso.model.18<-train_lasso_with_feature(x,y,prep$grid)
fitted.18<-train_lasso_cv(lasso.model.18,x,y,prep$grid)
####better than before
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat+region+pesticide)
lasso.model.19<-train_lasso_with_feature(x,y,prep$grid)
fitted.19<-train_lasso_cv(lasso.model.19,x,y,prep$grid)
```
```{r}
####better
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat+region+log(pesticide+1))
lasso.model.20<-train_lasso_with_feature(x,y,prep$grid)
fitted.20<-train_lasso_cv(lasso.model.20,x,y,prep$grid)
####slightly better
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat+region+pest_size)
lasso.model.21<-train_lasso_with_feature(x,y,prep$grid)
fitted.21<-train_lasso_cv(lasso.model.21,x,y,prep$grid)
###better
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat+region+log(pest_size+1))
lasso.model.22<-train_lasso_with_feature(x,y,prep$grid)
fitted.22<-train_lasso_cv(lasso.model.22,x,y,prep$grid)
###worse
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat+region+pest_size+price)
lasso.model.23<-train_lasso_with_feature(x,y,prep$grid)
fitted.23<-train_lasso_cv(lasso.model.23,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat+region+pest_size+log(price))
lasso.model.24<-train_lasso_with_feature(x,y,prep$grid)
fitted.24<-train_lasso_cv(lasso.model.24,x,y,prep$grid)
####slightly better
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat+region+pest_size+log(price)+fam_ratio)
lasso.model.25<-train_lasso_with_feature(x,y,prep$grid)
fitted.25<-train_lasso_cv(lasso.model.25,x,y,prep$grid)
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat+region+pest_size+log(price)+I(sqrt(-log(fam_ratio))))
lasso.model.26<-train_lasso_with_feature(x,y,prep$grid)
fitted.26<-train_lasso_cv(lasso.model.26,x,y,prep$grid)
####worse
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat+region+pest_size+log(price)+varieties)
lasso.model.27<-train_lasso_with_feature(x,y,prep$grid)
fitted.27<-train_lasso_cv(lasso.model.27,x,y,prep$grid)
####better
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat+region+pest_size+log(price)+varieties+bimas)
lasso.model.28<-train_lasso_with_feature(x,y,prep$grid)
fitted.28<-train_lasso_cv(lasso.model.28,x,y,prep$grid)
####worse
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat+region+pest_size+log(price)+varieties+purea)
lasso.model.29<-train_lasso_with_feature(x,y,prep$grid)
fitted.29<-train_lasso_cv(lasso.model.29,x,y,prep$grid)
####better
```
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat+region+pest_size+log(price)+varieties+log(purea))
lasso.model.30<-train_lasso_with_feature(x,y,prep$grid)
fitted.30<-train_lasso_cv(lasso.model.30,x,y,prep$grid)
####better
```
```{r}
options(tinytex.verbose = TRUE)
lassos <- seq(1:30)
l <- length(lassos)
var <- rep(NA, l)
df <- rep(NA, l)
MSE.train <- rep(NA, l)
MSE.val <- rep(NA, l)
aic <- rep(NA, l)
aicc<-rep(NA, l)
bic<- rep(NA, l)
r_sq<- rep(NA, l)
for (i in 1:l){
fitted<-get(eval(paste0('fitted.',lassos[[i]],sep='')))
model.name<-get(eval(paste0('lasso.model.',lassos[[i]],sep='')))
var[i] <- dimnames(tail(coefficients(model.name,s=fitted$bestlam), n=1))[[1]]
df[i] <- fitted$df
MSE.train[i] <- round(fitted$mse.train,digits=4)
MSE.val[i] <- round(fitted$mse.val,digits=4)
aic[i] <- round(fitted$BIC$AIC,1)
aicc[i] <- round(fitted$BIC$AICc,1)
bic[i] <- round(fitted$BIC$BIC,1)
r_sq[i] <- round(fitted$RSq,digits=4)
}
model_perf <- data.frame(var, df, MSE.train, MSE.val, aic, aicc, bic, r_sq)
#colnames(model_perf) <- c("var","df","MSE.train","MSE.val","dev","aic","p_val_p", "p_val_np", "df_np")
#model_perf$var[17]<- "s(pphosph)+s(purea)"
#model_perf$var[22:23] <- c("bimas", "bimas+varieties", "bimas+status", "bimas+region")
#model_perf$var[25] <- "s(purea)+varieties"
kbl(model_perf, caption = "Lasso comparison for variable selection", longtable = TRUE,booktabs=TRUE) %>%
kable_styling(latex_options = c("hold_position", "repeat_header","striped"),stripe_index=c(5:8,13,18,23:24,27,29:30)) %>%
kable_paper("hover", full_width = FALSE) %>%
pack_rows("size+seed", 1, 4,latex_gap_space="1em") %>%
pack_rows("size+seed+totlabor", 5, 8,latex_gap_space="1em") %>%
pack_rows("size+seed+totlabor+urea", 9, 12,latex_gap_space="1em") %>%
pack_rows("size+seed+totlabor+urea+wage_cat", 13, 13,latex_gap_space="1em") %>%
pack_rows("size+seed+totlabor+urea+phosphate", 14, 17,latex_gap_space="1em") %>%
pack_rows("size+seed+totlabor+urea+wage_cat+region", 18, 18,latex_gap_space="1em") %>%
pack_rows("size+seed+totlabor+urea+wage_cat+region+pesticide", 19, 22,latex_gap_space="1em") %>%
pack_rows("size+seed+totlabor+urea+wage_cat+region+pesticide+price", 23, 24,latex_gap_space="1em") %>%
pack_rows("size+seed+totlabor+urea+wage_cat+region+pesticide+price+fam_ratio", 25, 26,latex_gap_space="1em") %>%
pack_rows("size+seed+totlabor+urea+wage_cat+region+pesticide+price+varieties", 27, 27,latex_gap_space="1em") %>%
pack_rows("size+seed+totlabor+urea+wage_cat+region+pesticide+price+varieties+bimas", 28, 28,latex_gap_space="1em") %>%
pack_rows("size+seed+totlabor+urea+wage_cat+region+pesticide+price+varieties+purea", 29, 30,latex_gap_space="1em")
```
\newpage
**Table columns:**
* var: the variable that is added
* df: degrees of freedom
* MSE.train: MSE on training data
* MSE.val: MSE on validation data
* aic: Akaike information criterion
* aicc: corrected Akaike information criterion (AIC with a small-sample penalty term)
* bic: Bayesian information criterion
* r_sq: R-squared
As described in the feature selection, the RSS and the $R^2$ improve as more features are selected. However, the other metrics (AIC, BIC and AICc) behave as the feature selection predicts: beyond approximately 6 features they generally get worse. This means that the additional features do not justify the improvement of the RSS and the $R^2$; the penalty for the more complex model outweighs its benefit. Therefore the smaller model is preferred. \newline
```{r}
lassos <- seq(1:30)
l <- length(lassos)
var <- rep(NA, l)
df <- rep(NA, l)
MSE.train <- rep(NA, l)
MSE.val <- rep(NA, l)
aic <- rep(NA, l)
aicc<-rep(NA, l)
bic<- rep(NA, l)
r_sq<- rep(NA, l)
for (i in 1:l){
fitted<-get(eval(paste0('fitted.',lassos[[i]],sep='')))
model.name<-get(eval(paste0('lasso.model.',lassos[[i]],sep='')))
var[i] <- dimnames(tail(coefficients(model.name,s=fitted$bestlam), n=1))[[1]]
MSE.train[i] <- round(fitted$mse.train,digits=4)
MSE.val[i] <- round(fitted$mse.val,digits=4)
bic[i] <- round(fitted$BIC$BIC,1)
aicc[i] <- round(fitted$BIC$AICc,1)
aic[i] <- round(fitted$BIC$AIC,1)
r_sq[i] <- round(fitted$RSq,digits=4)
}
model_perf <- data.frame(MSE.train, MSE.val, bic,aic,aicc,r_sq, var,row.names = var)
df.long <- pivot_longer(model_perf, cols=1:6, names_to = "metric", values_to = "value")
df.long$var <- factor(df.long$var, levels =var)
mse_plot <- ggplot(df.long[df.long$metric %in% c('MSE.train','MSE.val'),])+
aes(x= var, y=value, group=metric, color = metric)+
geom_line()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
axis.title.x = element_blank(), legend.position = c(0.8, 0.8))
bic_plot <- ggplot(df.long[df.long$metric %in% c("bic","aic","aicc"),])+
aes(x=var, y=value, group=metric, color=metric)+
geom_line()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
axis.title.x = element_blank(), legend.position = c(0.8, 0.8))
r_sq_plot<-ggplot(df.long[df.long$metric=="r_sq",])+
aes(x=var, y=value, group=1)+
geom_line()+
ylab("R-squared")+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
axis.title.x = element_blank())
patch1 <- r_sq_plot + mse_plot
patch1 + plot_annotation(
title = 'Model performance'
)
bic_plot + plot_annotation(title='Model metrics')
```
The best model to be chosen is:
$$
log(g) = \beta_1*log(s) + \beta_2*log(e) + \beta_3*t + \beta_4 *log(u) + \beta_5 * w + \beta_0
$$
where:
* g: goutput
* s: size
* e: seed per hectare (seed_size)
* t: total labor per hectare (totlab_size)
* u: urea
* w: wage_cat
* $\beta_i$: coefficients and intercept
For the best model we can also look at how the coefficients evolve over the grid of lambdas.
```{r}
x<-prepare_x(goutput~log(size)+log(seed_size)+totlab_size+log(urea)+wage_cat)
lasso.model.13<-train_lasso_with_feature(x,y,prep$grid)
fitted.13<-train_lasso_cv(lasso.model.13,x,y,prep$grid)
beta=coef(lasso.model.13)
tmp <- as.data.frame(as.matrix(beta))
tmp$coef <- row.names(tmp)
tmp <- reshape::melt(tmp, id = "coef")
tmp$variable <- as.numeric(gsub("s", "", tmp$variable))
tmp$lambda <- lasso.model.13$lambda[tmp$variable+1] # extract the lambda values
tmp$norm <- apply(abs(beta[-1,]), 2, sum)[tmp$variable+1] # compute L1 norm
# x11(width = 13/2.54, height = 9/2.54)
ggplot(tmp[tmp$coef != "(Intercept)",], aes(lambda, value, color = coef, linetype = coef)) +
geom_line() +
geom_vline(xintercept=fitted.13$bestlam)+
scale_x_log10() +
xlab("Lambda (log scale)") +
guides(color = guide_legend(title = ""),
linetype = guide_legend(title = "")) +
theme_bw() +
theme(legend.key.width = unit(3,"lines"))
```
\newpage
# 3 Second Model
The second model to present and evaluate is the Generalized Additive Model (GAM).
## 3.1 Generalized Additive Model (GAM)
Generalized additive models (GAMs) provide a general framework for extending a standard linear model by allowing non-linear functions of each of the variables while maintaining additivity. Just like linear models, GAMs can be applied to both quantitative and qualitative responses.
GAMs allow us to fit a non-linear predictor $f_{j}$ to each variable $x_{ij}$, so that we will find non-linear relationships that standard multiple linear regression will miss. We do not need to manually try out many different transformations on each variable individually. The general GAM formula is as follows:
$$
Y_{i}=\beta_{0}+f_{1}\left(x_{i 1}\right)+f_{2}\left(x_{i 2}\right)+\ldots+f_{p}\left(x_{i p}\right)+\epsilon_{i}
$$
There is no constraint that each $f_{j}$ has to be the same type of function.
So $f_{1}$ could be a quadratic function, $f_{2}$ a smoothing spline function and $f_{3}$ a
loess function. And the smoothness of the function $f_{j}$ for the variable $X_{j}$ can be controlled independently for each variable.
* **Regression splines** are more flexible than polynomials and step functions, and in fact are an extension of the two. They involve dividing the range of X into K distinct regions. Within each region, a polynomial function is fit to the data. However, these polynomials are constrained so that they join smoothly at the region boundaries, or knots.
* **Smoothing splines** are similar to regression splines, but arise in a slightly different situation. Smoothing splines result from minimizing a residual sum of squares criterion subject to a smoothness penalty.
* **Local regression** is similar to splines, but differs in an important way. The regions are allowed to overlap, and indeed they do so in a very smooth way.
A GAM is restricted to be additive. Important interactions might be missed, but we can manually add an interaction term to the GAM model by adding a predictor for $X_{j}X_{k}$. Fitting a GAM with a smoothing spline is not quite as simple as fitting a GAM with a natural spline, since for smoothing splines least squares cannot be used. However, standard software such as the gam() function from the gam library in R can be used to fit GAMs with smoothing splines via an approach known as backfitting. There is thus an important difference between smoothing splines and natural splines: in the first case a penalized spline model is fitted, while in the second case only regression splines, i.e. splines without a penalty, are fitted.
The s() function, which is part of the gam library, is used to indicate that we would like to use a smoothing spline. Qualitative variables are automatically converted into dummy variables by the gam function according to the number of levels they have.
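A minimal sketch of such a call (the chosen predictors and degrees of freedom are purely illustrative):
```{r eval=FALSE}
library(gam)
# Illustrative GAM: smoothing splines with explicit degrees of freedom for two
# numeric predictors, plus a factor that gam() dummy-encodes automatically
gam_sketch <- gam(log(goutput) ~ s(log(size), df = 4) + s(totlab_size, df = 4) + region,
                  data = RiceFarms[train, ])
summary(gam_sketch)
```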
**Explanation of GAM visualization**
The visualization below is an example from our second GAM step, including the two variables size and totlab_size. For both of them a smoothing spline is used together with a log transformation. Because the model is additive, we can examine the effect of each $f_{j}$ on $Y$ individually while holding all of the other variables fixed. Both panels below have the same vertical scale, which allows us to visually assess the relative contributions of the variables. We observe that size and totlab_size have a large effect on goutput.
The left-hand panel indicates that, holding totlab_size fixed, goutput increases with size; the increase is very steep up to a size of about 0.5 hectares and then becomes flatter and flatter.
The right-hand panel indicates that, holding size fixed, goutput increases drastically with the proportion of labor per size up to about 500 hours/hectare and then flattens out. We can also see from the strip chart on the x-axis that most of the farms have sizes of up to 1 hectare and invest between 300 and 2000 hours of labor per hectare.
```{r}
#size
gam1 <- gam(log(goutput)~s(size), data = RiceFarms[train,])
gam2 <- gam(log(goutput)~s(log(size)), data = RiceFarms[train,])
```
```{r}
#par(mfrow=c(1,1))
#plot.Gam(gam2, residuals = TRUE, col="lightblue")
```
```{r fig.asp=0.5}
#labour
gam3 <- gam(log(goutput)~s(log(size))+s(log(totlab_size)), data = RiceFarms[train,])
gam4 <- gam(log(goutput)~s(log(size))+s(totlab_size), data = RiceFarms[train,])
par(mfrow=c(1,2))
plot.Gam(gam3, terms = "s(log(size))", residuals = TRUE, col="lightblue")
title("Example GAM Visualization", font.main= 1)
plot.Gam(gam3, terms = "s(log(totlab_size))", residuals = TRUE, col="lightblue")
```
```{r}
#urea
gam5 <- gam(log(goutput)~
s(log(size))+
s(log(totlab_size))+
s(log(urea_size)), data = RiceFarms[train,])
gam5.1 <- gam(log(goutput)~
s(log(size))+
s(log(totlab_size))+
s(urea_size), data = RiceFarms[train,])
#anova(gam4, gam5, gam5.1) #use gam5 with log(urea) slightly better
#par(mfrow=c(1,3))
#plot.Gam(gam5, residuals = TRUE, col="lightblue")
```
```{r}
#phosphor
gam6 <- gam(log(goutput)~
s(log(size))+
s(log(totlab_size))+
s(log(urea_size))+
s(log(phosph_size+1)), data = RiceFarms[train,])
gam6.1 <- gam(log(goutput)~
s(log(size))+
s(log(totlab_size))+
s(log(urea_size))+
s(phosph_size), data = RiceFarms[train,])
#anova(gam5,gam6, gam7) #use gam6
```
```{r}
#seed
gam8 <- gam(log(goutput)~
s(log(size))+
s(log(totlab_size))+
s(log(urea_size)) +
s(log(phosph_size+1))+
s(log(seed_size)), data = RiceFarms[train,])
gam9 <- gam(log(goutput)~
s(log(size))+
s(log(totlab_size))+
s(log(urea_size)) +
s(log(phosph_size+1))+
s(seed_size), data = RiceFarms[train,])
#anova(gam6, gam8, gam9) #use gam9
```
```{r}
#pesticide
gam10 <- gam(log(goutput)~
s(log(size))+
s(log(totlab_size))+
s(log(urea_size)) +
s(log(phosph_size+1))+
s(seed_size)+
s(pest_size), data = RiceFarms[train,])
#tail(summary(gam10)$parametric.anova$Pr, n=2)[1]
```
```{r}
gam10.1 <- gam(log(goutput)~
s(log(size))+
s(log(totlab_size))+
s(log(urea_size)) +
s(log(phosph_size+1))+
s(seed_size)+
s(pest_size, df=1), data = RiceFarms[train,])
gam10.2 <- gam(log(goutput)~
s(log(size))+
s(log(totlab_size))+
s(log(urea_size)) +
s(log(phosph_size+1))+
s(seed_size)+
pest_size, data = RiceFarms[train,])
#tail(summary(gam10.1)$parametric.anova$Pr, n=2)[1]
```
```{r}
#add price
gam11 <- gam(log(goutput)~
s(log(size))+
s(log(totlab_size))+
s(log(urea_size)) +
s(log(phosph_size+1))+
s(seed_size)+
pest_size+
s(price), data = RiceFarms[train,])
#anova(gam10,gam11)
```
```{r}
#add fam_ratio
gam12 <- gam(log(goutput)~
s(log(size))+
s(log(totlab_size))+
s(log(urea_size)) +
s(log(phosph_size+1))+
s(seed_size)+
pest_size+
s(price)+
s(fam_ratio), data = RiceFarms[train,])
gam12.1 <- gam(log(goutput)~
s(log(size))+
s(log(totlab_size))+
s(log(urea_size)) +
s(log(phosph_size+1))+
s(seed_size)+
pest_size+
s(price)+
s(fam_ratio, df=13), data = RiceFarms[train,])
#par(mfrow=c(2,4))
#plot.Gam(gam12.1, residuals = TRUE, col="lightblue")