diff --git a/vignettes/03_analysis_modelling.Rmd b/vignettes/03_analysis_modelling.Rmd index 38735c7..f46988a 100644 --- a/vignettes/03_analysis_modelling.Rmd +++ b/vignettes/03_analysis_modelling.Rmd @@ -24,6 +24,7 @@ library(ggplot2) library(gridExtra) library(grid) library(tidyverse) +library(ggpubr) data(iris) ``` @@ -32,11 +33,11 @@ data(iris) ## What we cover ->- Linear Regression ->- Multiple Linear Regression ->- Logistic Regression +- Linear Regression +- Multiple Linear Regression +- Logistic Regression -```{r echo=FALSE, error=FALSE, message=FALSE, warning=FALSE, 'class="centre", out.extra=, style="width:, warnings=FALSE} +```{r echo=FALSE, error=FALSE, message=FALSE, warning=FALSE, out.extra = 'class="centre" style="width: 500px;"', warnings=FALSE} setwd("/Users/srauschert/Desktop/Work/20.) Git_GitHub/RWorkshop/") tki_demo <- read_csv("data/demo.csv") @@ -58,9 +59,9 @@ ggplot( aes(day2, day3)) + In a linear regression, we aim to find a model:
->- that represents our data and +- that represents our data and ->- can give information about the association between our variables of interest. +- can give information about the association between our variables of interest. The command in R for a linear model is
@@ -73,19 +74,19 @@ The Iris data set consists of information about three different species of iris It holds information on: ->- Sepal length +- Sepal length ->- Sepal width +- Sepal width ->- Petal length +- Petal length ->- Petal width +- Petal width ## Data set summary Let's first have a look at the summary table of the Iris data set, by using the summary() command: -```{r, echo = FALSE, results='asis',out.extra = 'class="centre" style="width: 500px;"'} -kable(summary(iris[,c(1:4)])) +```{r, echo = FALSE, results='asis',out.extra = 'class="centre" style="width: 100px;"',warning=FALSE} +kable(summary(tki_demo[,c(6:8)])) ``` #Visualisation of data distributions @@ -93,38 +94,52 @@ kable(summary(iris[,c(1:4)])) Before we start with the linear regression model, we need to get an idea of the underlying data and its distribution. We know that the linear regression has the assumtptions: ->- +- ## QQ-plot: -```{r, echo=FALSE, out.extra = 'class="centre" style="width: 700px;"'} +```{r, echo=FALSE, out.extra = 'class="centre" style="width: 700px;"', warning=FALSE} library(tidyr) -data(iris) -iris_long <- gather(iris, Specification, measurement, Sepal.Length:Petal.Width, factor_key=TRUE) -ggplot(iris_long, aes(sample=measurement, color=Specification))+stat_qq() +tki_demo %>% + filter(day2 < 100) %>% + gather(Days, measurement, day1:day3, factor_key=TRUE) %>% + ggplot( aes(sample=measurement, color=Days))+stat_qq() ``` ## Boxplots to check for outliers -```{r echo = FALSE, out.extra = 'class="centre" style="width: 700px;"'} +```{r echo = FALSE, out.extra = 'class="centre" style="width: 700px;"',warning=FALSE} - +with_out <- tki_demo %>% + #filter(day2 < 100) %>% + gather(Days, measurement, day1:day3, factor_key=TRUE) %>% + ggplot(aes(y=measurement,x=Days, fill=Days)) + + labs(title = "Days: 1 to 3 with outlier", x = "", y = "Measurment") + + geom_boxplot() + + scale_color_telethonkids("light") + + theme_minimal() -ggplot(iris_long, aes(y=measurement,x=Specification, col=Specification)) + - labs(title = "Iris Specifications", x = "", y = "Measurment in cm") + +no_out <- tki_demo %>% + filter(day2 < 100) %>% + gather(Days, measurement, day1:day3, factor_key=TRUE) %>% + ggplot(aes(y=measurement,x=Days, fill=Days)) + + labs(title = "Days: 1 to 3 outlier removed", x = "", y = "Measurment") + geom_boxplot() + scale_color_telethonkids("light") + theme_minimal() + +ggarrange(with_out, no_out, ncol=2, common.legend = TRUE, legend=FALSE ) + ``` ## Plot the variables -```{r, echo = FALSE, out.extra = 'class="centre" style="width: 700px;"'} +```{r, echo = FALSE, out.extra = 'class="centre" style="width: 700px;"',warning=FALSE} data(iris) plot1 <- ggplot(iris, aes(Petal.Width, Petal.Length)) + labs(title = "Petal", x = "Petal Width", y = "Petal Length") + @@ -164,11 +179,11 @@ Let's now perform a linear regression model in R. lm(Petal.Length~Petal.Width, data=iris) ->- As said before, the first argument in the code is **y**, our outcome variable or dependent variable. In this case it is **Petal.Length**. +- As said before, the first argument in the code is **y**, our outcome variable or dependent variable. In this case it is **Petal.Length**. ->- The second Argument is **x**, the independent variable. In our case: **Petal.Width**. +- The second Argument is **x**, the independent variable. In our case: **Petal.Width**. ->- We also specify the data set that holds the variables we specified as **x** and **y**. +- We also specify the data set that holds the variables we specified as **x** and **y**. ##Linear Regression Results Now we want to look at the results of the linear regression. So how do we get the p-value and \(\beta\)-coefficient for the association?