diff --git a/R/03_data_manipulation.R b/R/02_data_manipulation.R similarity index 93% rename from R/03_data_manipulation.R rename to R/02_data_manipulation.R index 8a3df25..08db053 100644 --- a/R/03_data_manipulation.R +++ b/R/02_data_manipulation.R @@ -1,245 +1,247 @@ -#' 03_data_manipulation.R -#' This is the lab file for the Telethon Kids Institiute Introduction to R -#' workshop session Data Manipulation. -#' Last updated by Paul Stevenson on 11 April 2019 -#' - - - - - -#### Running code ---- - -#' To execute a section of code, highlight the desired chunk and press Ctrl+Enter -#' To execute the entire R Script, press Ctrl+Shift+Enter - - - - - -#### Import libraries ---- - -# the full tidyverse can loaded with library(tidyverse) - -library(dplyr) -library(lubridate) -library(readr) -library(tidyr) - - - - - -#### Getting help ---- - -#' if at any point you need help with a function, type "?" into -#' the console, for example: ?read_csv - - - - - -#### Read in data ---- - -# Read in a CSV files - -read_csv("data/demo.csv") - -# Read in a RData file - notice that this can contain multiple data sets - -load("data/dat.RData") - - - - - -#### Expand on classes (data/time [lubridate - year], string processing) ---- - -# Vectors - single class - -vec <- c("One", "Two", "Three") - -# data frame of vectors that may have different classes - -df <- tibble( - character = vec, - integer = 1:3, - numeric = integer * 1.0, - logical = c(T, T, F), - factor = factor(c("dog", "cat", "Dog"), levels = c("dog", "Dog", "cat", "Cat"), labels = c("Woof", "Woof", "Meow", "Meow")), - date = ymd(c("2019-04-11", "2019-05-11", "2019-06-11")), - missing = NA -) - -# Summary of data set - -str(df) - -# Find column class - -class(df$date) - -# Dates - lubridate can import dates in several different formats -# All dates should be in the same format! 
- -dmy("16/02/1985") -mdy("Feb 16 1985") -ymd("1985-February 16") - -ymd_hms("1985/02/05 12:30:00", tz = "Australia/Perth") - -# format can be used to change the date format - for example printing in tables/figuers - -format(tki_demo$dob, "%d %b %Y") %>% - head() - - - - -#### Summarise/tidy (cleaning) ---- - -summary(tki_demo) - - - - -#### Select ---- - -tki_demo %>% - select(id, dob, intervention) - -tki_demo %>% - select(-dob, -day1) - - - - - -#### Filter ---- - -tki_demo %>% - filter(dob > ymd("2005-01-01"), smoker, intervention == "Drug 2") - - - - - -#### Mutate (case_when, if/else) ---- - -tki_demo %>% - mutate(age = interval(dob, Sys.Date()) %>% as.duration() %>% as.numeric("years"), # using the lubridate package - teenager = ifelse(age >= 12, T, F), # ifelse - age_cat = case_when( # case_when (nested if/else) - age < 5 ~ "Younger than 5 years old", - age < 10 ~ "5 - 9 years old", - age < 15 ~ "10 - 14 years old", - age >= 15 ~ "Older than 15 years", - T ~ NA_character_ - )) %>% - select(id, dob, age, teenager, age_cat) - - - - - -#### Subset/merge/join ---- - -tki_demo_join <- left_join(tki_demo, - tki_demo_complications, - by = "id") - -tki_demo_join %>% - filter(!is.na(complications)) - - - - - -#### Gather/spread/melt/cast/separate/reshape ---- - -# Gather - wide to long - -tki_demo_long <- tki_demo %>% - gather(key = day, value = score, -id, -dob, -intervention, -male, -smoker) - -# Spread - long to wide (back to original format) - -tki_demo_long %>% - spread(key = day, value = score) - - - - - -#### Summarise ---- - -tki_demo %>% - summarise(n = n(), - day1_mean = mean(day1, na.rm = T), - day2_median = median(day2, na.rm = T), - day3_sd = sd(day3, na.rm = T)) - -# Summarise by a single group - -tki_demo %>% - group_by(intervention) %>% - summarise(mean = mean(day1, na.rm = T), - sd = sd(day1, na.rm = T)) - - - - - -# Summarise by multiple groups - -tki_demo %>% - group_by(intervention, smoker) %>% - summarise(mean = mean(day1, na.rm = T), - sd = 
sd(day1, na.rm = T)) - - - - - -#### Functions in R ---- - -adder <- function(x, y, z) x + y + z - -adder(5, 17, -1) - - - - - -#### Applying functions (apply/tidyverse) ---- - -tki_demo %>% - mutate(total = adder(day1, day2, day3)) - -# apply the same function to multiple columns - -square <- function(x) x^2 - -tki_demo %>% - mutate_at(c("day1", "day2", "day3"), list(~square(.))) - -# apply function to columns that meet a criteria - -tki_demo %>% - mutate_if(is.double, list(~square(.))) - -# apply a function to data frame subsets - -tki_demo %>% - split(f = tki_demo$intervention) %>% - lapply(function(x) { - # x is each split element of the data frame, which gets acted on one at a time - # the last item is returned - - x2 <- x %>% - mutate(new = ifelse(male & smoker, day1, day2 + day3)) - - x2 - - }) %>% - bind_rows() # combine split data frame back into one +#' 02_data_manipulation.R +#' This is the lab file for the Telethon Kids Institute Introduction to R +#' workshop session Data Manipulation. +#' Last updated by Paul Stevenson on 11 April 2019 +#' + + + + + +#### Running code ---- + +#' To execute a section of code, highlight the desired chunk and press Ctrl+Enter +#' To execute the entire R Script, press Ctrl+Shift+Enter + + + + + +#### Import libraries ---- + +# the full tidyverse can be loaded with library(tidyverse) + +library(dplyr) +library(lubridate) +library(readr) +library(tidyr) + + + + + +#### Getting help ---- + +#' if at any point you need help with a function, type "?" 
into +#' the console, for example: ?read_csv + + + + + +#### Read in data ---- + +# Read in a CSV files + +read_csv("data/demo.csv") + +# Read in a RData file - notice that this can contain multiple data sets + +load("data/dat.RData") + + + + + +#### Expand on classes (data/time [lubridate - year], string processing) ---- + +# Vectors - single class + +vec_character <- c("One", "Two", "Three") + +vec_integer <- c(1, 2, 3) + +vec_logical <- c(T, T, F) + +# data frame of vectors that may have different classes + +df <- tibble( + character = c("One", "Two", "Three"), + integer = 1:3, + numeric = integer * 1.0, + logical = c(T, T, F), + factor = factor(c("dog", "cat", "Dog"), levels = c("dog", "Dog", "cat", "Cat"), labels = c("Woof", "Woof", "Meow", "Meow")), + date = ymd(c("2019-04-11", "2019-05-11", "2019-06-11")), + missing = NA +) + +# Vector class + +class(vec_logical) + +class(df$date) + +# Summary of data set + +str(df) + +# Dates - lubridate can import dates in several different formats +# All dates should be in the same format! 
+ +dmy("16/02/1985") +mdy("Feb 16 1985") +ymd("1985-February 16") + +ymd_hms("1985/02/05 12:30:00", tz = "Australia/Perth") + +# format can be used to change the date format - for example printing in tables/figures + +format(tki_demo$dob, "%d %b %Y") %>% + head() + + + + +#### Summarise/tidy (cleaning) ---- + +summary(tki_demo) + + + + +#### Filter ---- + +tki_demo %>% + filter(dob > ymd("2005-01-01"), smoker, intervention == "Drug 2") + + + + + +#### Select ---- + +tki_demo %>% + select(id, dob, intervention) + +tki_demo %>% + select(-dob, -day1) + + + + + +#### Mutate (case_when, if/else) ---- + +tki_demo %>% + mutate(age = interval(dob, Sys.Date()) %>% as.duration() %>% as.numeric("years"), # using the lubridate package + teenager = ifelse(age >= 12, T, F), # ifelse + age_cat = case_when( # case_when (nested if/else) + age < 5 ~ "Younger than 5 years old", + age < 10 ~ "5 - 9 years old", + age < 15 ~ "10 - 14 years old", + age >= 15 ~ "Older than 15 years", + T ~ NA_character_ + )) %>% + select(id, dob, age, teenager, age_cat) + + + + + +#### Subset/merge/join ---- + +tki_demo_join <- left_join(tki_demo, + tki_demo_complications, + by = "id") + +tki_demo_join %>% + filter(!is.na(complications)) + + + + + +#### Gather/spread/melt/cast/separate/reshape ---- + +# Gather - wide to long + +tki_demo_long <- tki_demo %>% + gather(key = day, value = score, -id, -dob, -intervention, -male, -smoker) + +# Spread - long to wide (back to original format) + +tki_demo_long %>% + spread(key = day, value = score) + + + + + +#### Summarise ---- + +tki_demo %>% + summarise(n = n(), + day1_mean = mean(day1, na.rm = T), + day2_median = median(day2, na.rm = T), + day3_sd = sd(day3, na.rm = T)) + +# Summarise by a single group + +tki_demo %>% + group_by(intervention) %>% + summarise(mean = mean(day1, na.rm = T), + sd = sd(day1, na.rm = T)) + +# Summarise by multiple groups + +tki_demo %>% + group_by(intervention, smoker) %>% + summarise(mean = mean(day1, na.rm = T), + sd = sd(day1, 
na.rm = T)) + + + + + +#### Functions in R ---- + +adder <- function(x, y, z) x + y + z + +adder(5, 17, -1) + + + + + +#### Applying functions (apply/tidyverse) ---- + +tki_demo %>% + mutate(total = adder(day1, day2, day3)) + +# apply the same function to multiple columns + +square <- function(x) x^2 + +tki_demo %>% + mutate_at(c("day1", "day2", "day3"), list(~square(.))) + +# apply function to columns that meet a criterion + +tki_demo %>% + mutate_if(is.double, list(~square(.))) + +# apply a function to data frame subsets + +tki_demo %>% + split(f = tki_demo$intervention) %>% + lapply(function(x) { + # x is each split element of the data frame, which gets acted on one at a time + # the last item is returned + + x2 <- x %>% + mutate(new = ifelse(male & smoker, day1, day2 + day3)) + + x2 + + }) %>% + bind_rows() # combine split data frame back into one diff --git a/vignettes/02_data_manipulation.Rmd b/vignettes/02_data_manipulation.Rmd index 0e3630f..c089781 100644 --- a/vignettes/02_data_manipulation.Rmd +++ b/vignettes/02_data_manipulation.Rmd @@ -17,295 +17,556 @@ vignette: > --- ```{r init, include = FALSE, echo = FALSE} +library(knitr) library(biometrics) library(lubridate) library(tidyverse) library(kableExtra) + +source("assets/R/hooks.R") + +load("../data/dat.RData") + ``` -## Session resources +## Session Resources + +All resources for this Introduction to R Workshop are available on GitHub: [https://github.com/TelethonKids/RWorkshop](https://github.com/TelethonKids/RWorkshop) -[https://github.com/TelethonKids/RWorkshop](https://github.com/TelethonKids/RWorkshop) +**Introduction to Data Manipulation** -[Worked example](https://github.com/TelethonKids/RWorkshop/tree/master/R/03_data_manipulation) -[Slides](https://github.com/TelethonKids/RWorkshop/tree/master/inst/doc/reproducable-research.html) +Download the slides from this link: 
[https://github.com/TelethonKids/RWorkshop/tree/master/inst/doc/reproducable-research.html](https://github.com/TelethonKids/RWorkshop/tree/master/inst/doc/reproducable-research.html) -## Data manipulation (FD 2h) - PS +Download the Worked example from here: [https://github.com/TelethonKids/RWorkshop/tree/master/R/03_data_manipulation](https://github.com/TelethonKids/RWorkshop/tree/master/R/03_data_manipulation) -- Overview of base v tidyverse [history/evolution] -- Read in data -- Expand on classes (data/time [lubridate - year], string processing) -- Summarise/tidy (cleaning) -- Create columns/conditional create (case_when, if/else) -- Subset/merge/join -- Gather/spread/melt/cast/separate/reshape -- Functions -- Applying functions (apply/tidyverse) +# Session Overview +## Overview + +>- Overview of base *vs.* tidyverse [history/evolution] +>- Read in data +>- Expand on classes (date/time [lubridate - year], string processing) +>- Summarise/tidy (cleaning) +>- Create columns/conditional create (case_when, if/else) +>- Subset/merge/join +>- Gather/spread/melt/cast/separate/reshape +>- Functions +>- Applying functions (apply/tidyverse) # Tidyverse ## What is the Tidyverse? -![https://www.tidyverse.org/](assets/images/tidyverse.PNG) -
The tidyverse is an opinionated collection of R packages designed for data science. All packages share an underlying design philosophy, grammar, and data structures.
[https://www.tidyverse.org/](https://www.tidyverse.org/)
+  -Install the complete tidyverse with: +Install the complete Tidyverse collection in R (or RStudio) with: install.packages("tidyverse") -## Overview of base v tidyverse [history/evolution] +## Tidyverse Packages -# Worked example +
-## Files +```{r, echo = F, out.extra = "figure"} +include_graphics("assets/images/tidyverse.PNG") -Data is available from GitHub repository ...... +``` -## Read in raw data +>- **dplyr** (Data wrangling) +>- **ggplot2** (Data visualisation) +>- **readr** (Import raw data formats) +>- **tibble** (Adds to `data.frame`) +>- **tidyr** (Functions to reshape data) +>- **purrr** (Functional programming tools) - library(readr) - - read_csv("/path/to/file.csv", options ...) - -## read_csv() options - -```{r read_csv_options, echo = F} -tibble( - Option = c("col_names", "na", "trim_ws", "skip", "n_max", "guess_max", "skip_empty_rows"), - Description = c("If TRUE, the first row of the input will be used as the column names, and will not be included in the data frame.", - "Character vector of strings to interpret as missing values.", - "Should leading and trailing whitespace be trimmed from each field before parsing it?", - "Number of lines to skip before reading data.", - "Maximum number of records to read.", - "Maximum number of records to use for guessing column types.", - "Should blank rows be ignored altogether?")) %>% - mutate(Option = paste0("
", Option, "
"), - Description = paste0("
", Description, "
")) %>% # add tag to all options - kable("html", escape = F) %>% - kable_styling("hover", full_width = F) +
-``` +## Overview of base *vs.* tidyverse [history/evolution] -More help can be found by typing `?read_delim` into the R console. +TO DO +# Introduction to Data\nManipulation -## Other ways to read in data +## Worked Example - load("path/to/file.RData") - +Download the Worked example from here: [https://github.com/TelethonKids/RWorkshop/tree/master/R/03_data_manipulation](https://github.com/TelethonKids/RWorkshop/tree/master/R/03_data_manipulation) + +or type the code yourself! + +## Library Packages + + library(dplyr) + library(lubridate) library(readr) - read_rds("path/to/file.rds") + library(tidyr) + +# Reading In Data + +## Raw Data | Example + +`readr` has functions to import common data files. + + read_csv("data/demo.csv") + +## Raw Data | Options + + ?read_delim + +
+ +**file**
+**delim**
+quote = "\""
+escape_backslash = FALSE
+escape_double = TRUE
+col_names = TRUE
+col_types = NULL
+locale = default_locale()
+**na = c("", "NA")**
+quoted_na = TRUE
+comment = ""
+trim_ws = FALSE
+skip = 0
+n_max = Inf,
+**guess_max = min(1000, n_max)**
+progress = show_progress()
+skip_empty_rows = TRUE) + +
+ +## R Data Formats + + load("data/demo.RData") + + readRDS("path/to.file") + +RData files can store multiple datasets/variables; .rds files hold only one. + +## Other Types of Data + +Excel files library(readxl) read_xlxs("path/to/file.xlsx") +Stata + library(readstata13) readstata13("path/to/file.sta") - Foreign library (SAS, SPSS, ...) +Databases: refer to the `odbc` package + +Other data types - Google it! + +# Data Classes + +## Common Classes + +>- Numeric (numbers) +>- Character (letters) +>- Logical (true/false) +>- Factor (structured groupings) +>- Date (date and time data) + +>- Missing data is represented by **NA** (there is only one missing data code in R) + +## Vector + + vec_character <- c("One", "Two", "Three") + + vec_integer <- c(1, 2, 3) -Data can also be read directly from a database into R! + vec_logical <- c(T, T, F) -## Data frame (tibble) +Vectors hold a collection of data of the **same** class. -A data frame is a table (or 2 dimensional array-like object) where each column -stores the data of one variable and each row represents a single record. +## Data Frame (AKA Tibble) -These are the properties of a data frame: +A data frame is a table, or 2 dimensional array-like object. For tidy data, +each column represents a variable and each row represents a single record. -* All columns should have a unique name -* All data in one column will be the same class, however each column can be different -* Each column should have the same number of data items/rows -* Rows are not named +Data frames are a collection of vectors. -Vectors of data are stored in columns: +In a data frame: - c("A", "B", "C", "D") +>- all columns should have a unique name +>- all data in one column will be the same class, however each column can be different +>- each column should be the same length +>- rows are not named. 
-## Expand on classes (data/time [lubridate - year], string processing) +## Example of a Data Frame | Syntax -* Numeric -* Character -* Logical -* Factor -* Date + df <- tibble( + character = c("One", "Two", "Three"), + integer = 1:3, + numeric = integer * 1.0, + logical = c(T, T, F), + factor = factor(c("dog", "cat", "Dog"), + levels = c("dog", "Dog", "cat", "Cat"), + labels = c("Woof", "Woof", "Meow", "Meow")), + date = ymd(c("2019-04-11", "2019-05-11", "2019-06-11")), + missing = NA + ) -* Missing data - NA (there is only one missing data code in R) +## Example of a Data Frame | Looks like -# Summarise/tidy (cleaning) + # A tibble: 3 x 7 + character integer numeric logical factor date missing + + 1 One 1 1 TRUE Woof 2019-04-11 NA + 2 Two 2 2 TRUE Meow 2019-05-11 NA + 3 Three 3 3 FALSE Woof 2019-06-11 NA -```{r data} -load("../data/dat.RData") +## Vector Class + + class(vec_logical) + + [1] "logical" + +  + + class(df$date) + + [1] "Date" + +## Data Frame Structure + + str(df) + + Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 3 obs. of 7 variables: + $ character: chr "One" "Two" "Three" + $ integer : int 1 2 3 + $ numeric : num 1 2 3 + $ logical : logi TRUE TRUE FALSE + $ factor : Factor w/ 2 levels "Woof","Meow": 1 2 1 + $ date : Date, format: "2019-04-11" "2019-05-11" "2019-06-11" + $ missing : logi NA NA NA + +## Dates | Input + + library(lubridate) + +>- Dates - multiple formats (all items in vectors should be consistent) + + dmy("16/02/1985") + mdy("Feb 16 1985") + ymd("1985-February 16") + + [1] "1985-02-16" + +>- Date and time + + ymd_hms("1985/02/05 12:30:00", tz = "Australia/Perth") + +The default timezone is Coordinated Universal Time (UTC/GMT). 
+ +## Dates | Output + +```{r date_output_format, comment = NA} +format(tki_demo$dob, "%d %b %Y") %>% + head() ``` -```{r str_data} -str(tki_demo) +## Dates | Format Options + +```{r date_format_options, echo = F, include = F} +format_options <- tibble(code = c("%a", "%b", "%c", "%H", "%j", "%M", "%S", "%W", + "%x", "%y", "%z", "%A", "%B", "%d", "%I", "%m", + "%p", "%U", "%W", "%X", "%Y", "%Z"), + Description = c("Abbreviated weekday", "Abbreviated month","Locale-specific date and time", + "Decimal hours (24 hour)", "Decimal day of the year", "Decimal minute", + "Decimal second", "Decimal Weekday (0=Sunday)", "Locale-specific Date", + "2-digit year", "Offset from GMT", "Full weekday", + "Full month", "Decimal date", "Decimal hours (12 hour)", "Decimal month", + "Locale-specific AM/PM", "Decimal week of the year (starting on Sunday)", + "Decimal week of the year (starting on Monday)", "Locale-specific Time", + "4-digit year", "Time zone (character)"), + level = c(rep.int(1, 11), rep.int(2, 11))) ``` -## Data summary - high level descriptives +
+ +| Code | Description | Code | Description | +|------|-------------|------|-------------| +| %a | Abbreviated weekday | %A | Full weekday | +| %b | Abbreviated month | %B | Full month | +| %c | Locale-specific date and time | %d | Decimal day of the month | +| %H | Decimal hours (24 hour) | %I | Decimal hours (12 hour) | +| %j | Decimal day of the year | %m | Decimal month | +| %M | Decimal minute | %p | Locale-specific AM/PM | +| %S | Decimal second | %U | Decimal week of the year (starting on Sunday) | +| %w | Decimal Weekday (0=Sunday) | %W | Decimal week of the year (starting on Monday) | +| %x | Locale-specific Date | %X | Locale-specific Time | +| %y | 2-digit year | %Y | 4-digit year | +| %z | Offset from GMT | %Z | Time zone (character) | + +
+ +# Cleaning and Data Wrangling -```{r summary_data} +## High Level Data Summary + +```{r summary_data, comment = NA} summary(tki_demo) ``` -## Looking at the data +## Pipe Operator "%>%" + + library(dplyr) + +>- Allows a constant flow of data from one function to the next +>- The output data of the previous function is generally assumed to be the first argument +>- Helpful to think of `dplyr` as a series of verbs that are piped together + +> Data frame %>% filter() %>% select() %>% mutate() %>% summarise() %>% view() -```{r head_data} -head(tki_demo) +## Filter + +```{r filter, comment = NA} +tki_demo %>% + filter(dob > ymd("2005-01-01"), smoker, intervention == "Drug 2") %>% + head() ``` -## Create columns/conditional create (if/else) +## Select | Including + +```{r select_include, comment = NA} +tki_demo %>% + select(id, dob, intervention) %>% + head() -New column can be created with `mutate()` +``` + +## Select | Excluding -```{r ifelse} +```{r select_exclude, comment = NA} tki_demo %>% - mutate(age = interval(dob, Sys.Date()) %>% as.duration() %>% as.numeric("years") %>% round(1), - teenager = ifelse(age >= 13, T, F)) %>% + select(-dob, -day1) %>% + head() + +``` + +## Mutate | ifelse + +```{r mutate_ifelse, eval = F} +tki_demo %>% + mutate(age = interval(dob, Sys.Date()) %>% as.duration() %>% as.numeric("years"), + teenager = ifelse(age >= 12, T, F)) %>% select(id, dob, age, teenager) %>% head() ``` -## case_when() +## Mutate | ifelse -```{r case_when, eval = F} +```{r mutate_ifelse_2, echo = F, comment = NA} tki_demo %>% - mutate(age = interval(dob, Sys.Date()) %>% - as.duration() %>% - as.numeric("years") %>% - round(1), - age_category = case_when( - age < 12 ~ "Younger than 12 years", - age < 14 ~ "12 - 13 years", - age < 16 ~ "14 - 15 years", - age < 18 ~ "15 - 17 years", - T ~ "Older than 18 years" - ), - factor(age_category, - levels = c("Younger than 12 years","12 - 13 years", - "14 - 15 years", "15 - 17 years", - "Older than 18 years"))) %>% - 
select(id, dob, age, age_category) %>% + mutate(age = interval(dob, Sys.Date()) %>% as.duration() %>% as.numeric("years"), + teenager = ifelse(age >= 12, T, F)) %>% + select(id, dob, age, teenager) %>% head() ``` -## case_when() +## Mutate | case_when -```{r case_when_output, echo = F} +```{r mutate_casewhen, eval = F} tki_demo %>% - mutate(age = interval(dob, Sys.Date()) %>% - as.duration() %>% - as.numeric("years") %>% - round(1), - age_category = case_when( - age < 12 ~ "Younger than 12 years", - age < 14 ~ "12 - 13 years", - age < 16 ~ "14 - 15 years", - age < 18 ~ "15 - 17 years", - T ~ "Older than 18 years" - ), - factor(age_category, - levels = c("Younger than 12 years","12 - 13 years", - "14 - 15 years", "15 - 17 years", - "Older than 18 years"))) %>% - select(id, dob, age, age_category) %>% + mutate(age = interval(dob, Sys.Date()) %>%as.duration() %>% as.numeric("years"), + age_cat = case_when( + age < 5 ~ "Younger than 5 years old", + age < 10 ~ "5 - 9 years old", + age < 15 ~ "10 - 14 years old", + age >= 15 ~ "Older than 15 years", + T ~ NA_character_ + )) %>% + select(id, dob, age, age_cat) + +``` + +## Mutate | case_when + +```{r mutate_casewhen_2, echo = F, comment = NA} +tki_demo %>% + mutate(age = interval(dob, Sys.Date()) %>%as.duration() %>% as.numeric("years"), + age_cat = case_when( + age < 5 ~ "Younger than 5 years old", + age < 10 ~ "5 - 9 years old", + age < 15 ~ "10 - 14 years old", + age >= 15 ~ "Older than 15 years", + T ~ NA_character_ + )) %>% + select(id, dob, age, age_cat) %>% head() ``` -## Parsing dates with lubridate +## Combining Multiple Data Sources | Join -* Run some examples - adding day if YYYYMM data provided -* Converting char to date -* example with date and time +```{r join, eval = F} +tki_demo %>% + left_join(tki_demo_complications, + by = "id") %>% + filter(!is.na(complications)) %>% + head() -# Subset/merge/join +``` -## needs fixing +Data can be joined on multiple columns that can have different names. 
-So far all the tools you’ve learned have worked with complete data frames. If you want to pull out a single variable, you need some new tools, $ and [[. [[ can extract by name or position; $ only extracts by name but is a little less typing. +## Combining Multiple Data Sources | Join -df <- tibble( - x = runif(5), - y = rnorm(5) -) +```{r join_2, echo = F, comment = NA} +tki_demo %>% + left_join(tki_demo_complications, + by = "id") %>% + filter(!is.na(complications)) %>% + head() -## Extract by name -df$x +``` + +# Summarising Data + +## Summarise + +```{r summarise, comment = NA} +tki_demo %>% + summarise(n = n(), + day1_mean = mean(day1, na.rm = T), + day2_median = median(day2, na.rm = T), + day3_sd = sd(day3, na.rm = T)) %>% + head() -df[["x"]] +``` +## Summarise | Single Group -## Extract by position -df[[1]] +```{r summarise_single_group, comment = NA} +tki_demo %>% + group_by(intervention) %>% + summarise(mean = mean(day1, na.rm = T), + sd = sd(day1, na.rm = T)) %>% + head() -To use these in a pipe, you’ll need to use the special placeholder .: +``` -df %>% .$x +## Summarise | Multiple Groups -df %>% .[["x"]] +```{r summarise_multiple_groups, eval = F} +tki_demo %>% + group_by(intervention, smoker) %>% + summarise(mean = mean(day1, na.rm = T), + sd = sd(day1, na.rm = T)) %>% + head() -# Gather/spread/melt/cast/separate/reshape +``` -## Long/wide +## Summarise | Multiple Groups -Gather/spread +```{r summarise_multiple_groups_2, echo = F, comment = NA} +tki_demo %>% + group_by(intervention, smoker) %>% + summarise(mean = mean(day1, na.rm = T), + sd = sd(day1, na.rm = T)) %>% + head() -## Functions +``` -Don't repeat yourself (DRY) +# Functions in R -Use a function to replace frequently used code -* Call on the function many times -* automate a function over a list with the apply() family of functions +## Defining and Calling a Function -```{r funciton} -square <- function(x) { - x^2 -} +```{r function, comment = NA} +adder <- function(x, y, z) x + y + z 
-square(4) +adder(5, 17, -1) ``` -## Applying functions (apply/tidyverse) +## Applying Functions | mutate -```{r, eval = F} -day1_mean <- function(x) { - tibble(id = x$id, - day1_mean = mean(x$day1, na.rm = T) - ) -} +```{r function_mutate, comment = NA} +tki_demo %>% + mutate(total = adder(day1, day2, day3)) %>% + head() + +``` + +## Applying Functions | mutate_at + +```{r function_mutate_at, eval = F} +square <- function(x) x^2 + +tki_demo %>% + mutate_at(c("day1", "day2", "day3"), list(~square(.))) + +``` + +Individually apply the same function to multiple data frame columns. + +## Applying Functions | mutate_at + +```{r function_mutate_at_2, echo = F, comment = NA} +square <- function(x) x^2 + +tki_demo %>% + mutate_at(c("day1", "day2", "day3"), list(~square(.))) %>% + head() +``` + +## Applying Functions | mutate_if + +ERROR TO DO FIX + +```{r function_mutate_if, eval = F, comment = NA} +tki_demo %>% + mutate_if(is.double, list(~square(.))) + +``` + +## Applying Functions Subset Data + +```{r function_split, eval = F} tki_demo %>% - split(.$id) %>% - lapply(day1_mean) %>% - bind_rows() + split(f = tki_demo$intervention) %>% + lapply(function(x) { + + x2 <- x %>% + mutate(new = ifelse(male & smoker, day1, day2 + day3)) + x2 + + }) %>% + bind_rows() %>% + head() ``` -## Applying functions (apply/tidyverse) +>- `lapply` can be easily parallelised for multi-core computing +>- *x* is each split element of the data frame, which gets acted on one at a time +the last item is returned +>- `bind_rows()` combines split data back into a single data frame. 
-```{r, echo = F} -day1_mean <- function(x) { - tibble(id = x$id, - day1_mean = mean(x$day1, na.rm = T) - ) -} +## Applying Functions Subset Data +```{r function_split_2, echo = F, comment = NA} tki_demo %>% - split(.$id) %>% - lapply(day1_mean) %>% - bind_rows() + split(f = tki_demo$intervention) %>% + lapply(function(x) { + + x2 <- x %>% + mutate(new = ifelse(male & smoker, day1, day2 + day3)) + x2 + + }) %>% + bind_rows() %>% + head() -``` \ No newline at end of file +``` diff --git a/vignettes/assets/css/ioslides.css b/vignettes/assets/css/ioslides.css index b52a906..0a816c4 100644 --- a/vignettes/assets/css/ioslides.css +++ b/vignettes/assets/css/ioslides.css @@ -11,7 +11,7 @@ em { font-style: italic; } -.centre { +.center { display: block; margin-left: auto; margin-right: auto;