diff --git a/DESCRIPTION b/DESCRIPTION index f87e27ad4..7f0e48337 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.10.0.5 +Version: 0.10.0.6 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NEWS.md b/NEWS.md index 7340b179d..7f56b8bc1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -23,6 +23,8 @@ CHANGES If you recode into a numeric variable, and one of the recode values is `NA`, you no longer need to use `NA_real_` for numeric `NA` values. +* Improved documentation for some functions. + BUG FIXES * `data_to_long()` did not work for data frame where columns had attributes diff --git a/R/data_read.R b/R/data_read.R index b02b7ca87..5137a7735 100644 --- a/R/data_read.R +++ b/R/data_read.R @@ -70,7 +70,7 @@ #' factors, where imported value labels will be set as factor levels. If a #' numeric variable has _no_ value labels or less value labels than values, it #' is not converted to factor. In this case, value labels are preserved as -#' `"labels"` attribute. Character vectors are preserved. Use +#' `"labels"` attribute. Character vectors are preserved. Use #' `convert_factors = FALSE` to remove the automatic conversion of numeric #' variables to factors. #' @@ -105,7 +105,7 @@ data_read <- function(path, por = .read_spss(path, encoding, convert_factors, verbose, ...), dta = .read_stata(path, encoding, convert_factors, verbose, ...), sas7bdat = .read_sas(path, path_catalog, encoding, convert_factors, verbose, ...), - .read_unknown(path, convert_factors, verbose, ...) + .read_unknown(path, file_type, convert_factors, verbose, ...) 
) # tell user about empty columns @@ -178,20 +178,18 @@ data_read <- function(path, if (is.character(i)) { # we need this to drop haven-specific class attributes i <- as.character(i) - } else { + } else if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) { # if all values are labelled, we assume factor. Use labels as levels - if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) { - if (is.numeric(i)) { - i <- factor(i, labels = names(value_labels)) - } else { - i <- factor(as.character(i), labels = names(value_labels)) - } - value_labels <- NULL - attr(i, "converted_to_factor") <- TRUE + if (is.numeric(i)) { + i <- factor(i, labels = names(value_labels)) } else { - # else, fall back to numeric - i <- as.numeric(i) + i <- factor(as.character(i), labels = names(value_labels)) } + value_labels <- NULL + attr(i, "converted_to_factor") <- TRUE + } else { + # else, fall back to numeric + i <- as.numeric(i) } # drop unused value labels @@ -290,12 +288,18 @@ data_read <- function(path, } -.read_unknown <- function(path, convert_factors, verbose, ...) { - insight::check_if_installed("rio", reason = paste0("to read files of type '", .file_ext(path), "'")) +.read_unknown <- function(path, file_type, convert_factors, verbose, ...) { + insight::check_if_installed("rio", reason = paste0("to read files of type '", file_type, "'")) if (verbose) { insight::format_alert("Reading data...") } - out <- rio::import(file = path, ...) + # set up arguments. 
for RDS, we set trust = TRUE, to avoid warnings + rio_args <- list(file = path) + # check if we have RDS, and if so, add trust = TRUE + if (file_type == "rds") { + rio_args$trust <- TRUE + } + out <- do.call(rio::import, c(rio_args, list(...))) # for "unknown" data formats (like .RDS), which still can be imported via # "rio::import()", we must check whether we actually have a data frame or @@ -310,9 +314,8 @@ data_read <- function(path, ) } return(out) - } else { - out <- tmp } + out <- tmp } .post_process_imported_data(out, convert_factors, verbose) diff --git a/R/data_restoretype.R b/R/data_restoretype.R index 9b5eb71a9..d4119f340 100644 --- a/R/data_restoretype.R +++ b/R/data_restoretype.R @@ -1,5 +1,6 @@ #' Restore the type of columns according to a reference data frame #' +#' @param data A data frame for which to restore the column types. #' @inheritParams data_to_long #' @inheritParams data_rename #' @param reference A reference data frame from which to find the correct diff --git a/R/data_to_long.R b/R/data_to_long.R index 3d19e5bc2..deffcc0cb 100644 --- a/R/data_to_long.R +++ b/R/data_to_long.R @@ -4,43 +4,101 @@ #' the number of columns. This is a dependency-free base-R equivalent of #' `tidyr::pivot_longer()`. #' -#' @param data A data frame to pivot. -#' @param names_to The name of the new column that will contain the column -#' names. +#' @param data A data frame to convert to long format, so that it has more +#' rows and fewer columns after the operation. +#' @param names_to The name of the new column (variable) that will contain the +#' _names_ from columns in `select` as values, to identify the source of the +#' values. `names_to` can be a character vector with more than one column name, +#' in which case `names_sep` or `names_pattern` must be provided in order to +#' identify which parts of the column names go into newly created columns. +#' See also 'Examples'. 
#' @param names_prefix A regular expression used to remove matching text from #' the start of each variable name. #' @param names_sep,names_pattern If `names_to` contains multiple values, this -#' argument controls how the column name is broken up. -#' `names_pattern` takes a regular expression containing matching groups, i.e. "()". -#' @param values_to The name of the new column that will contain the values of -#' the pivoted variables. +#' argument controls how the column name is broken up. `names_pattern` takes a +#' regular expression containing matching groups, i.e. "()". +#' @param values_to The name of the new column that will contain the _values_ of +#' the columns in `select`. #' @param values_drop_na If `TRUE`, will drop rows that contain only `NA` in the -#' `values_to` column. This effectively converts explicit missing values to -#' implicit missing values, and should generally be used only when missing values -#' in data were created by its structure. +#' `values_to` column. This effectively converts explicit missing values to +#' implicit missing values, and should generally be used only when missing values +#' in data were created by its structure. #' @param rows_to The name of the column that will contain the row names or row -#' numbers from the original data. If `NULL`, will be removed. +#' numbers from the original data. If `NULL`, will be removed. #' @param ... Currently not used. #' @inheritParams extract_column_names #' @param cols Identical to `select`. This argument is here to ensure compatibility -#' with `tidyr::pivot_longer()`. If both `select` and `cols` are provided, `cols` -#' is used. +#' with `tidyr::pivot_longer()`. If both `select` and `cols` are provided, `cols` +#' is used. +#' +#' @details +#' Reshaping data into long format usually means that the input data frame is +#' in _wide_ format, where multiple measurements taken on the same subject are +#' stored in multiple columns (variables). 
The long format stores the same +#' information in a single column, with each measurement per subject stored in +#' a separate row. The values of all variables that are not in `select` will +#' be repeated. +#' +#' The necessary information for `data_to_long()` is: +#' +#' - The columns that contain the repeated measurements (`select`). +#' - The name of the newly created column that will contain the names of the +#' columns in `select` (`names_to`), to identify the source of the values. +#' `names_to` can also be a character vector with more than one column name, +#' in which case `names_sep` or `names_pattern` must be provided to specify +#' which parts of the column names go into the newly created columns. +#' - The name of the newly created column that contains the values of the +#' columns in `select` (`values_to`). +#' +#' In other words: repeated measurements that are spread across several columns +#' will be gathered into a single column (`values_to`), with the original column +#' names, that identify the source of the gathered values, stored in one or more +#' new columns (`names_to`). #' #' @return If a tibble was provided as input, `reshape_longer()` also returns a #' tibble. Otherwise, it returns a data frame. 
#' #' @examplesIf requireNamespace("psych") && requireNamespace("tidyr") -#' wide_data <- data.frame(replicate(5, rnorm(10))) +#' wide_data <- setNames( +#' data.frame(replicate(2, rnorm(8))), +#' c("Time1", "Time2") +#' ) +#' wide_data$ID <- 1:8 +#' wide_data #' -#' # Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:5)) +#' # Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:3)) +#' # probably doesn't make much sense to mix "time" and "id" #' data_to_long(wide_data) #' #' # Customizing the names -#' data_to_long(wide_data, -#' select = c(1, 2), -#' names_to = "Column", -#' values_to = "Numbers", -#' rows_to = "Row" +#' data_to_long( +#' wide_data, +#' select = c("Time1", "Time2"), +#' names_to = "Timepoint", +#' values_to = "Score" +#' ) +#' +#' # Reshape multiple columns into long format. +#' mydat <- data.frame( +#' age = c(20, 30, 40), +#' sex = c("Female", "Male", "Male"), +#' score_t1 = c(30, 35, 32), +#' score_t2 = c(33, 34, 37), +#' score_t3 = c(36, 35, 38), +#' speed_t1 = c(2, 3, 1), +#' speed_t2 = c(3, 4, 5), +#' speed_t3 = c(1, 8, 6) +#' ) +#' # The column names are split into two columns: "type" and "time". The +#' # pattern for splitting column names is provided in `names_pattern`. Values +#' # of all "score_*" and "speed_*" columns are gathered into a single column +#' # named "count". 
+#' data_to_long( +#' mydat, +#' select = 3:8, +#' names_to = c("type", "time"), +#' names_pattern = "(score|speed)_t(\\d+)", +#' values_to = "count" #' ) #' #' # Full example @@ -48,21 +106,22 @@ #' data <- psych::bfi # Wide format with one row per participant's personality test #' #' # Pivot long format -#' data_to_long(data, +#' very_long_data <- data_to_long(data, #' select = regex("\\d"), # Select all columns that contain a digit #' names_to = "Item", #' values_to = "Score", #' rows_to = "Participant" #' ) +#' head(very_long_data) #' -#' data_to_long( +#' even_longer_data <- data_to_long( #' tidyr::who, #' select = new_sp_m014:newrel_f65, #' names_to = c("diagnosis", "gender", "age"), #' names_pattern = "new_?(.*)_(.)(.*)", #' values_to = "count" #' ) -#' +#' head(even_longer_data) #' @inherit data_rename #' @export data_to_long <- function(data, @@ -223,7 +282,11 @@ data_to_long <- function(data, # if columns in data frame have attributes (e.g. labelled data), `cbind()` # won't work, so we need to remove them. We'll set them back later not_stacked[] <- lapply(not_stacked, function(i) { - attributes(i) <- NULL + # we can't remove *all* attributes, this will convert factors into integers + attr(i, "label") <- NULL + attr(i, "labels") <- NULL + attr(i, "format.spss") <- NULL + class(i) <- setdiff(class(i), c("haven_labelled", "vctrs_vctr")) i }) diff --git a/R/data_to_wide.R b/R/data_to_wide.R index e7aa8f7d1..151140490 100644 --- a/R/data_to_wide.R +++ b/R/data_to_wide.R @@ -4,11 +4,20 @@ #' the number of rows. This is a dependency-free base-R equivalent of #' `tidyr::pivot_wider()`. #' -#' @param data A data frame to pivot. -#' @param id_cols The name of the column that identifies the rows. If `NULL`, -#' it will use all the unique rows. -#' @param names_from The name of the column that contains the levels to be -#' used as future column names. 
+#' @param data A data frame to convert to wide format, so that it has more +#' columns and fewer rows post-widening than pre-widening. +#' @param id_cols The name of the column that identifies the rows in the data +#' by which observations are grouped and the gathered data is spread into new +#' columns. Usually, this is a variable containing an ID for observations that +#' have been repeatedly measured. If `NULL`, it will use all remaining columns +#' that are not in `names_from` or `values_from` as ID columns. `id_cols` can +#' also be a character vector with more than one name of identifier columns. See +#' also 'Details' and 'Examples'. +#' @param names_from The name of the column in the original data whose values +#' will be used for naming the new columns created in the widened data. Each +#' unique value in this column will become the name of one of these new columns. +#' In case `names_prefix` is provided, column names will be concatenated with +#' the string given in `names_prefix`. #' @param names_prefix String added to the start of every variable name. This is #' particularly useful if `names_from` is a numeric vector and you want to create #' syntactic variable names. @@ -19,17 +28,37 @@ #' [glue specification](https://glue.tidyverse.org/index.html) that uses the #' `names_from` columns to create custom column names. Note that the only #' delimiters supported by `names_glue` are curly brackets, `{` and `}`. -#' @param values_from The name of the column that contains the values to be used -#' as future variable values. +#' @param values_from The name of the column in the original data that contains +#' the values used to fill the new columns created in the widened data. #' @param values_fill Optionally, a (scalar) value that will be used to replace #' missing values in the new columns created. #' @param verbose Toggle warnings. #' @param ... Not used for now. 
#' -#' @return If a tibble was provided as input, `reshape_wider()` also returns a +#' @return If a tibble was provided as input, `data_to_wide()` also returns a #' tibble. Otherwise, it returns a data frame. #' -#' @examples +#' @details +#' Reshaping data into wide format usually means that the input data frame is +#' in _long_ format, where multiple measurements taken on the same subject are +#' stored in multiple rows. The wide format stores the same information in a +#' single row, with each measurement stored in a separate column. Thus, the +#' necessary information for `data_to_wide()` is: +#' +#' - The name of the column(s) that identify the groups or repeated measurements +#' (`id_cols`). +#' - The name of the column whose _values_ will become the new column names +#' (`names_from`). Since these values may not necessarily reflect appropriate +#' column names, you can use `names_prefix` to add a prefix to each newly +#' created column name. +#' - The name of the column that contains the values (`values_from`) for the +#' new columns that are created by `names_from`. +#' +#' In other words: repeated measurements, as indicated by `id_cols`, that are +#' saved into the column `values_from` will be spread into new columns, which +#' will be named after the values in `names_from`. See also 'Examples'. +#' +#' @examplesIf requireNamespace("lme4", quietly = TRUE) #' data_long <- read.table(header = TRUE, text = " #' subject sex condition measurement #' 1 M control 7.9 @@ -45,7 +74,7 @@ #' 4 M cond1 13.4 #' 4 M cond2 12.9") #' -#' +#' # converting long data into wide format #' data_to_wide( #' data_long, #' id_cols = "subject", @@ -53,6 +82,7 @@ #' values_from = "measurement" #' ) #' +#' # converting long data into wide format with custom column names #' data_to_wide( #' data_long, #' id_cols = "subject", @@ -62,13 +92,13 @@ #' names_sep = "." 
#' ) #' +#' # converting long data into wide format, combining multiple columns #' production <- expand.grid( #' product = c("A", "B"), #' country = c("AI", "EI"), #' year = 2000:2014 #' ) #' production <- data_filter(production, (product == "A" & country == "AI") | product == "B") -#' #' production$production <- rnorm(nrow(production)) #' #' data_to_wide( @@ -78,9 +108,59 @@ #' names_glue = "prod_{product}_{country}" #' ) #' +#' # using the "sleepstudy" dataset +#' data(sleepstudy, package = "lme4") +#' +#' # the sleepstudy data contains repeated measurements of average reaction +#' # times for each subject over multiple days, in a sleep deprivation study. +#' # It is in long-format, i.e. each row corresponds to a single measurement. +#' # The variable "Days" contains the timepoint of the measurement, and +#' # "Reaction" contains the measurement itself. Converting this data to wide +#' # format will create a new column for each day, with the reaction time as the +#' # value. +#' head(sleepstudy) +#' +#' data_to_wide( +#' sleepstudy, +#' id_cols = "Subject", +#' names_from = "Days", +#' values_from = "Reaction" +#' ) +#' +#' # clearer column names +#' data_to_wide( +#' sleepstudy, +#' id_cols = "Subject", +#' names_from = "Days", +#' values_from = "Reaction", +#' names_prefix = "Reaction_Day_" +#' ) +#' +#' # For unequal group sizes, missing information is filled with NA +#' d <- subset(sleepstudy, Days %in% c(0, 1, 2, 3, 4))[c(1:9, 11:13, 16:17, 21), ] +#' +#' # long format, different number of "Subjects" +#' d +#' +#' data_to_wide( +#' d, +#' id_cols = "Subject", +#' names_from = "Days", +#' values_from = "Reaction", +#' names_prefix = "Reaction_Day_" +#' ) +#' +#' # filling missing values with 0 +#' data_to_wide( +#' d, +#' id_cols = "Subject", +#' names_from = "Days", +#' values_from = "Reaction", +#' names_prefix = "Reaction_Day_", +#' values_fill = 0 +#' ) #' @inherit data_rename seealso #' @export - data_to_wide <- function(data, id_cols = NULL, 
values_from = "Value", @@ -238,7 +318,7 @@ data_to_wide <- function(data, # stop if some column names would be duplicated (follow tidyr workflow) if (any(unstacked$col_order %in% current_colnames)) { insight::format_error( - "Some values of the columns specified in 'names_from' are already present as column names.", + "Some values of the columns specified in `names_from` are already present as column names.", paste0( "Either use `names_prefix` or rename the following columns: ", text_concatenate(current_colnames[which(current_colnames %in% unstacked$col_order)]) diff --git a/R/data_write.R b/R/data_write.R index 7e4e543d6..83457d64b 100644 --- a/R/data_write.R +++ b/R/data_write.R @@ -244,22 +244,20 @@ data_write <- function(data, value_labels <- value_labels[value_labels %in% unique(i)] # guess variable type - if (!is.character(i)) { + if (is.character(i)) { + # we need this to drop haven-specific class attributes + i <- as.character(i) + } else if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) { # if all values are labelled, we assume factor. 
Use labels as levels - if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) { - if (is.numeric(i)) { - i <- factor(i, labels = names(value_labels)) - } else { - i <- factor(as.character(i), labels = names(value_labels)) - } - i <- as.character(i) + if (is.numeric(i)) { + i <- factor(i, labels = names(value_labels)) } else { - # else, fall back to numeric - i <- as.numeric(as.character(i)) + i <- factor(as.character(i), labels = names(value_labels)) } - } else { - # we need this to drop haven-specific class attributes i <- as.character(i) + } else { + # else, fall back to numeric + i <- as.numeric(as.character(i)) } # add back variable label attr(i, "label") <- variable_labels diff --git a/README.md b/README.md index 411ad4c72..54f91794d 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,9 @@ columns, can be achieved using `extract_column_names()` or # find column names matching a pattern extract_column_names(iris, starts_with("Sepal")) #> [1] "Sepal.Length" "Sepal.Width" +``` + +``` r # return data columns matching a pattern data_select(iris, starts_with("Sepal")) |> head() @@ -155,6 +158,9 @@ It is also possible to extract one or more variables: # single variable data_extract(mtcars, "gear") #> [1] 4 4 4 3 3 3 3 4 4 4 4 3 3 3 3 3 3 4 4 4 3 3 3 3 3 4 5 5 5 5 5 4 +``` + +``` r # more variables head(data_extract(iris, ends_with("Width"))) @@ -215,11 +221,17 @@ x #> 1 1 a 5 1 #> 2 2 b 6 2 #> 3 3 c 7 3 +``` + +``` r y #> c d e id #> 1 6 f 100 2 #> 2 7 g 101 3 #> 3 8 h 102 4 +``` + +``` r data_merge(x, y, join = "full") #> a b c id d e @@ -227,32 +239,50 @@ data_merge(x, y, join = "full") #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 #> 4 NA 8 4 h 102 +``` + +``` r data_merge(x, y, join = "left") #> a b c id d e #> 3 1 a 5 1 NA #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 +``` + +``` r data_merge(x, y, join = "right") #> a b c id d e #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 #> 3 NA 8 4 h 102 +``` + +``` r data_merge(x, y, join = "semi", by = "c") #> a b c id #> 
2 2 b 6 2 #> 3 3 c 7 3 +``` + +``` r data_merge(x, y, join = "anti", by = "c") #> a b c id #> 1 1 a 5 1 +``` + +``` r data_merge(x, y, join = "inner") #> a b c id d e #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 +``` + +``` r data_merge(x, y, join = "bind") #> a b c id d e @@ -323,13 +353,22 @@ tmp #> 3 3 3 NA 3 #> 4 NA NA NA NA #> 5 5 5 NA 5 +``` + +``` r # indices of empty columns or rows empty_columns(tmp) #> c #> 3 +``` + +``` r empty_rows(tmp) #> [1] 4 +``` + +``` r # remove empty columns or rows remove_empty_columns(tmp) @@ -339,12 +378,18 @@ remove_empty_columns(tmp) #> 3 3 3 3 #> 4 NA NA NA #> 5 5 5 5 +``` + +``` r remove_empty_rows(tmp) #> a b c d #> 1 1 1 NA 1 #> 2 2 NA NA NA #> 3 3 3 NA 3 #> 5 5 5 NA 5 +``` + +``` r # remove empty columns and rows remove_empty(tmp) @@ -365,6 +410,9 @@ table(x) #> x #> 1 2 3 4 5 6 7 8 9 10 #> 2 3 5 3 7 5 5 2 11 7 +``` + +``` r # cut into 3 groups, based on distribution (quantiles) table(categorize(x, split = "quantile", n_groups = 3)) @@ -398,6 +446,9 @@ summary(swiss) #> Mean : 41.144 Mean :19.94 #> 3rd Qu.: 93.125 3rd Qu.:21.70 #> Max. :100.000 Max. 
:26.60 +``` + +``` r # after summary(standardize(swiss)) @@ -436,6 +487,9 @@ anscombe #> 9 12 12 12 8 10.84 9.13 8.15 5.56 #> 10 7 7 7 8 4.82 7.26 6.42 7.91 #> 11 5 5 5 8 5.68 4.74 5.73 6.89 +``` + +``` r # after winsorize(anscombe) @@ -487,6 +541,9 @@ head(trees) #> 4 10.5 72 16.4 #> 5 10.7 81 18.8 #> 6 10.8 83 19.7 +``` + +``` r # after head(ranktransform(trees)) @@ -519,6 +576,9 @@ x #> Mazda RX4 21.0 6 160 110 #> Mazda RX4 Wag 21.0 6 160 110 #> Datsun 710 22.8 4 108 93 +``` + +``` r data_rotate(x) #> Mazda RX4 Mazda RX4 Wag Datsun 710 diff --git a/inst/WORDLIST b/inst/WORDLIST index 9d8f23406..a3dd80b42 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -83,6 +83,7 @@ partialization patilindrajeets platykurtic poorman +pre pth px readr diff --git a/man/data_read.Rd b/man/data_read.Rd index 6eda842ae..1ae3cea8a 100644 --- a/man/data_read.Rd +++ b/man/data_read.Rd @@ -123,7 +123,7 @@ their most appropriate type. The major difference to \code{rio::import()} is tha factors, where imported value labels will be set as factor levels. If a numeric variable has \emph{no} value labels or less value labels than values, it is not converted to factor. In this case, value labels are preserved as -\code{"labels"} attribute. Character vectors are preserved. Use +\code{"labels"} attribute. Character vectors are preserved. Use \code{convert_factors = FALSE} to remove the automatic conversion of numeric variables to factors. } diff --git a/man/data_restoretype.Rd b/man/data_restoretype.Rd index 39a745154..a0ddc5dd0 100644 --- a/man/data_restoretype.Rd +++ b/man/data_restoretype.Rd @@ -7,7 +7,7 @@ data_restoretype(data, reference = NULL, ...) } \arguments{ -\item{data}{A data frame to pivot.} +\item{data}{A data frame for which to restore the column types.} \item{reference}{A reference data frame from which to find the correct column types. 
If \code{NULL}, each column is converted to numeric if it doesn't diff --git a/man/data_to_long.Rd b/man/data_to_long.Rd index ea478c545..741725d25 100644 --- a/man/data_to_long.Rd +++ b/man/data_to_long.Rd @@ -38,7 +38,8 @@ reshape_longer( ) } \arguments{ -\item{data}{A data frame to pivot.} +\item{data}{A data frame to convert to long format, so that it has more +rows and fewer columns after the operation.} \item{select}{Variables that will be included when performing the required tasks. Can be either @@ -72,18 +73,22 @@ If \code{NULL}, selects all columns. Patterns that found no matches are silently ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} will just return \code{"Species"}.} -\item{names_to}{The name of the new column that will contain the column -names.} +\item{names_to}{The name of the new column (variable) that will contain the +\emph{names} from columns in \code{select} as values, to identify the source of the +values. \code{names_to} can be a character vector with more than one column name, +in which case \code{names_sep} or \code{names_pattern} must be provided in order to +identify which parts of the column names go into newly created columns. +See also 'Examples'.} \item{names_prefix}{A regular expression used to remove matching text from the start of each variable name.} \item{names_sep, names_pattern}{If \code{names_to} contains multiple values, this -argument controls how the column name is broken up. -\code{names_pattern} takes a regular expression containing matching groups, i.e. "()".} +argument controls how the column name is broken up. \code{names_pattern} takes a +regular expression containing matching groups, i.e. 
"()".} -\item{values_to}{The name of the new column that will contain the values of -the pivoted variables.} +\item{values_to}{The name of the new column that will contain the \emph{values} of +the columns in \code{select}.} \item{values_drop_na}{If \code{TRUE}, will drop rows that contain only \code{NA} in the \code{values_to} column. This effectively converts explicit missing values to @@ -121,19 +126,73 @@ This function "lengthens" data, increasing the number of rows and decreasing the number of columns. This is a dependency-free base-R equivalent of \code{tidyr::pivot_longer()}. } +\details{ +Reshaping data into long format usually means that the input data frame is +in \emph{wide} format, where multiple measurements taken on the same subject are +stored in multiple columns (variables). The long format stores the same +information in a single column, with each measurement per subject stored in +a separate row. The values of all variables that are not in \code{select} will +be repeated. + +The necessary information for \code{data_to_long()} is: +\itemize{ +\item The columns that contain the repeated measurements (\code{select}). +\item The name of the newly created column that will contain the names of the +columns in \code{select} (\code{names_to}), to identify the source of the values. +\code{names_to} can also be a character vector with more than one column name, +in which case \code{names_sep} or \code{names_pattern} must be provided to specify +which parts of the column names go into the newly created columns. +\item The name of the newly created column that contains the values of the +columns in \code{select} (\code{values_to}). +} + +In other words: repeated measurements that are spread across several columns +will be gathered into a single column (\code{values_to}), with the original column +names, that identify the source of the gathered values, stored in one or more +new columns (\code{names_to}). 
+} \examples{ \dontshow{if (requireNamespace("psych") && requireNamespace("tidyr")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} -wide_data <- data.frame(replicate(5, rnorm(10))) +wide_data <- setNames( + data.frame(replicate(2, rnorm(8))), + c("Time1", "Time2") +) +wide_data$ID <- 1:8 +wide_data -# Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:5)) +# Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:3)) +# probably doesn't make much sense to mix "time" and "id" data_to_long(wide_data) # Customizing the names -data_to_long(wide_data, - select = c(1, 2), - names_to = "Column", - values_to = "Numbers", - rows_to = "Row" +data_to_long( + wide_data, + select = c("Time1", "Time2"), + names_to = "Timepoint", + values_to = "Score" +) + +# Reshape multiple columns into long format. +mydat <- data.frame( + age = c(20, 30, 40), + sex = c("Female", "Male", "Male"), + score_t1 = c(30, 35, 32), + score_t2 = c(33, 34, 37), + score_t3 = c(36, 35, 38), + speed_t1 = c(2, 3, 1), + speed_t2 = c(3, 4, 5), + speed_t3 = c(1, 8, 6) +) +# The column names are split into two columns: "type" and "time". The +# pattern for splitting column names is provided in `names_pattern`. Values +# of all "score_*" and "speed_*" columns are gathered into a single column +# named "count". 
+data_to_long( + mydat, + select = 3:8, + names_to = c("type", "time"), + names_pattern = "(score|speed)_t(\\\\d+)", + values_to = "count" ) # Full example @@ -141,20 +200,22 @@ data_to_long(wide_data, data <- psych::bfi # Wide format with one row per participant's personality test # Pivot long format -data_to_long(data, +very_long_data <- data_to_long(data, select = regex("\\\\d"), # Select all columns that contain a digit names_to = "Item", values_to = "Score", rows_to = "Participant" ) +head(very_long_data) -data_to_long( +even_longer_data <- data_to_long( tidyr::who, select = new_sp_m014:newrel_f65, names_to = c("diagnosis", "gender", "age"), names_pattern = "new_?(.*)_(.)(.*)", values_to = "count" ) +head(even_longer_data) \dontshow{\}) # examplesIf} } \seealso{ diff --git a/man/data_to_wide.Rd b/man/data_to_wide.Rd index e0f36b7e6..8b781fc76 100644 --- a/man/data_to_wide.Rd +++ b/man/data_to_wide.Rd @@ -32,16 +32,25 @@ reshape_wider( ) } \arguments{ -\item{data}{A data frame to pivot.} - -\item{id_cols}{The name of the column that identifies the rows. If \code{NULL}, -it will use all the unique rows.} - -\item{values_from}{The name of the column that contains the values to be used -as future variable values.} - -\item{names_from}{The name of the column that contains the levels to be -used as future column names.} +\item{data}{A data frame to convert to wide format, so that it has more +columns and fewer rows post-widening than pre-widening.} + +\item{id_cols}{The name of the column that identifies the rows in the data +by which observations are grouped and the gathered data is spread into new +columns. Usually, this is a variable containing an ID for observations that +have been repeatedly measured. If \code{NULL}, it will use all remaining columns +that are not in \code{names_from} or \code{values_from} as ID columns. \code{id_cols} can +also be a character vector with more than one name of identifier columns. 
See +also 'Details' and 'Examples'.} + +\item{values_from}{The name of the column in the original data that contains +the values used to fill the new columns created in the widened data.} + +\item{names_from}{The name of the column in the original data whose values +will be used for naming the new columns created in the widened data. Each +unique value in this column will become the name of one of these new columns. +In case \code{names_prefix} is provided, column names will be concatenated with +the string given in \code{names_prefix}.} \item{names_sep}{If \code{names_from} or \code{values_from} contains multiple variables, this will be used to join their values together into a single string to use @@ -64,7 +73,7 @@ missing values in the new columns created.} \item{...}{Not used for now.} } \value{ -If a tibble was provided as input, \code{reshape_wider()} also returns a +If a tibble was provided as input, \code{data_to_wide()} also returns a tibble. Otherwise, it returns a data frame. } \description{ @@ -72,7 +81,29 @@ This function "widens" data, increasing the number of columns and decreasing the number of rows. This is a dependency-free base-R equivalent of \code{tidyr::pivot_wider()}. } +\details{ +Reshaping data into wide format usually means that the input data frame is +in \emph{long} format, where multiple measurements taken on the same subject are +stored in multiple rows. The wide format stores the same information in a +single row, with each measurement stored in a separate column. Thus, the +necessary information for \code{data_to_wide()} is: +\itemize{ +\item The name of the column(s) that identify the groups or repeated measurements +(\code{id_cols}). +\item The name of the column whose \emph{values} will become the new column names +(\code{names_from}). Since these values may not necessarily reflect appropriate +column names, you can use \code{names_prefix} to add a prefix to each newly +created column name. 
+\item The name of the column that contains the values (\code{values_from}) for the +new columns that are created by \code{names_from}. +} + +In other words: repeated measurements, as indicated by \code{id_cols}, that are +saved into the column \code{values_from} will be spread into new columns, which +will be named after the values in \code{names_from}. See also 'Examples'. +} \examples{ +\dontshow{if (requireNamespace("lme4", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data_long <- read.table(header = TRUE, text = " subject sex condition measurement 1 M control 7.9 @@ -88,7 +119,7 @@ data_long <- read.table(header = TRUE, text = " 4 M cond1 13.4 4 M cond2 12.9") - +# converting long data into wide format data_to_wide( data_long, id_cols = "subject", @@ -96,6 +127,7 @@ data_to_wide( values_from = "measurement" ) +# converting long data into wide format with custom column names data_to_wide( data_long, id_cols = "subject", @@ -105,13 +137,13 @@ data_to_wide( names_sep = "." ) +# converting long data into wide format, combining multiple columns production <- expand.grid( product = c("A", "B"), country = c("AI", "EI"), year = 2000:2014 ) production <- data_filter(production, (product == "A" & country == "AI") | product == "B") - production$production <- rnorm(nrow(production)) data_to_wide( @@ -121,6 +153,58 @@ data_to_wide( names_glue = "prod_{product}_{country}" ) +# using the "sleepstudy" dataset +data(sleepstudy, package = "lme4") + +# the sleepstudy data contains repeated measurements of average reaction +# times for each subject over multiple days, in a sleep deprivation study. +# It is in long-format, i.e. each row corresponds to a single measurement. +# The variable "Days" contains the timepoint of the measurement, and +# "Reaction" contains the measurement itself. Converting this data to wide +# format will create a new column for each day, with the reaction time as the +# value. 
+head(sleepstudy) + +data_to_wide( + sleepstudy, + id_cols = "Subject", + names_from = "Days", + values_from = "Reaction" +) + +# clearer column names +data_to_wide( + sleepstudy, + id_cols = "Subject", + names_from = "Days", + values_from = "Reaction", + names_prefix = "Reaction_Day_" +) + +# For unequal group sizes, missing information is filled with NA +d <- subset(sleepstudy, Days \%in\% c(0, 1, 2, 3, 4))[c(1:9, 11:13, 16:17, 21), ] + +# long format, different number of "Subjects" +d + +data_to_wide( + d, + id_cols = "Subject", + names_from = "Days", + values_from = "Reaction", + names_prefix = "Reaction_Day_" +) + +# filling missing values with 0 +data_to_wide( + d, + id_cols = "Subject", + names_from = "Days", + values_from = "Reaction", + names_prefix = "Reaction_Day_", + values_fill = 0 +) +\dontshow{\}) # examplesIf} } \seealso{ \itemize{ diff --git a/tests/testthat/_snaps/data_to_long.md b/tests/testthat/_snaps/data_to_long.md index cd7748df7..c863ccc02 100644 --- a/tests/testthat/_snaps/data_to_long.md +++ b/tests/testthat/_snaps/data_to_long.md @@ -11,3 +11,46 @@ $ Item : chr "A1" "A2" "A3" "A4" ... $ Score : int 2 4 3 4 4 2 3 3 4 4 ... 
+# don't convert factors to integer + + Code + print(mtcars_long) + Output + cyl hp drat wt vs am gear carb am_f cyl_f id g value + 1 4 93 3.85 2.320 1 1 4 1 1 4 3 mpg 22.80 + 2 4 93 3.85 2.320 1 1 4 1 1 4 3 qsec 18.61 + 3 4 93 3.85 2.320 1 1 4 1 1 4 3 disp 108.00 + 4 8 245 3.21 3.570 0 0 3 4 0 8 7 mpg 14.30 + 5 8 245 3.21 3.570 0 0 3 4 0 8 7 qsec 15.84 + 6 8 245 3.21 3.570 0 0 3 4 0 8 7 disp 360.00 + 7 4 66 4.08 2.200 1 1 4 1 1 4 10 mpg 32.40 + 8 4 66 4.08 2.200 1 1 4 1 1 4 10 qsec 19.47 + 9 4 66 4.08 2.200 1 1 4 1 1 4 10 disp 78.70 + 10 8 264 4.22 3.170 0 1 5 4 1 8 11 mpg 15.80 + 11 8 264 4.22 3.170 0 1 5 4 1 8 11 qsec 14.50 + 12 8 264 4.22 3.170 0 1 5 4 1 8 11 disp 351.00 + 13 6 110 3.08 3.215 1 0 3 1 0 6 4 mpg 21.40 + 14 6 110 3.08 3.215 1 0 3 1 0 6 4 qsec 19.44 + 15 6 110 3.08 3.215 1 0 3 1 0 6 4 disp 258.00 + 16 8 175 3.15 3.440 0 0 3 2 0 8 5 mpg 18.70 + 17 8 175 3.15 3.440 0 0 3 2 0 8 5 qsec 17.02 + 18 8 175 3.15 3.440 0 0 3 2 0 8 5 disp 360.00 + 19 8 335 3.54 3.570 0 1 5 8 1 8 12 mpg 15.00 + 20 8 335 3.54 3.570 0 1 5 8 1 8 12 qsec 14.60 + 21 8 335 3.54 3.570 0 1 5 8 1 8 12 disp 301.00 + 22 6 110 3.90 2.620 0 1 4 4 1 6 1 mpg 21.00 + 23 6 110 3.90 2.620 0 1 4 4 1 6 1 qsec 16.46 + 24 6 110 3.90 2.620 0 1 4 4 1 6 1 disp 160.00 + 25 6 110 3.90 2.875 0 1 4 4 1 6 2 mpg 21.00 + 26 6 110 3.90 2.875 0 1 4 4 1 6 2 qsec 17.02 + 27 6 110 3.90 2.875 0 1 4 4 1 6 2 disp 160.00 + 28 4 95 3.92 3.150 1 0 4 2 0 4 9 mpg 22.80 + 29 4 95 3.92 3.150 1 0 4 2 0 4 9 qsec 22.90 + 30 4 95 3.92 3.150 1 0 4 2 0 4 9 disp 140.80 + 31 4 62 3.69 3.190 1 0 4 2 0 4 8 mpg 24.40 + 32 4 62 3.69 3.190 1 0 4 2 0 4 8 qsec 20.00 + 33 4 62 3.69 3.190 1 0 4 2 0 4 8 disp 146.70 + 34 6 105 2.76 3.460 1 0 3 1 0 6 6 mpg 18.10 + 35 6 105 2.76 3.460 1 0 3 1 0 6 6 qsec 20.22 + 36 6 105 2.76 3.460 1 0 3 1 0 6 6 disp 225.00 + diff --git a/tests/testthat/test-data_to_long.R b/tests/testthat/test-data_to_long.R index 37d926b11..ab8bf1ba2 100644 --- a/tests/testthat/test-data_to_long.R +++ 
b/tests/testthat/test-data_to_long.R @@ -488,3 +488,17 @@ test_that("works with labelled data", { expect_identical(nrow(out), 200L) expect_identical(attributes(out$e42dep)$label, "elder's dependency") }) + + +test_that("don't convert factors to integer", { + data("mtcars") + mtcars <- mtcars[c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 18L, 29L, 31L), ] + mtcars$am_f <- factor(mtcars$am) + mtcars$cyl_f <- factor(mtcars$cyl) + + mtcars$id <- factor(seq_len(nrow(mtcars))) + mtcars_long <- data_to_long(mtcars, + select = c("mpg", "qsec", "disp"), names_to = "g" + ) + expect_snapshot(print(mtcars_long)) +}) diff --git a/tests/testthat/test-data_to_wide.R b/tests/testthat/test-data_to_wide.R index 88fbb699e..d716ff222 100644 --- a/tests/testthat/test-data_to_wide.R +++ b/tests/testthat/test-data_to_wide.R @@ -35,7 +35,7 @@ test_that("data_to_wide works", { values_from = "value", id_cols = "Row_ID" ), - regexp = "Some values of the columns specified in 'names_from'" + regexp = "Some values of the columns specified in `names_from`" ) }) @@ -44,12 +44,12 @@ test_that("data_to_wide works", { test_that("data_to_wide, names_prefix works", { skip_if_not_installed("tidyr") - out <- tidyr::fish_encounters %>% - data_to_wide( - names_from = "station", - values_from = "seen", - names_prefix = "foo_" - ) + out <- data_to_wide( + tidyr::fish_encounters, + names_from = "station", + values_from = "seen", + names_prefix = "foo_" + ) expect_named( out, @@ -67,12 +67,12 @@ test_that("data_to_wide, values_fill works", { ### Should be numeric expect_identical( - data %>% - data_to_wide( - names_from = "station", - values_from = "seen", - values_fill = 1 - ), + data_to_wide( + data, + names_from = "station", + values_from = "seen", + values_fill = 1 + ), tidyr::tibble( fish = factor( c("4842", "4843", "4844"), @@ -91,21 +91,21 @@ test_that("data_to_wide, values_fill works", { ) ) expect_error( - data %>% - data_to_wide( - names_from = "station", - values_from = "seen", - values_fill = "a" - ), + 
data_to_wide( + data, + names_from = "station", + values_from = "seen", + values_fill = "a" + ), regexp = "must be of type numeric" ) expect_error( - data %>% - data_to_wide( - names_from = "station", - values_from = "seen", - values_fill = factor("a") - ), + data_to_wide( + data, + names_from = "station", + values_from = "seen", + values_fill = factor("a") + ), regexp = "must be of type numeric" ) @@ -120,12 +120,12 @@ test_that("data_to_wide, values_fill works", { contacts$person_id <- cumsum(contacts$field == "name") expect_identical( - contacts %>% - data_to_wide( - names_from = "field", - values_from = "value", - values_fill = "foo" - ), + data_to_wide( + contacts, + names_from = "field", + values_from = "value", + values_fill = "foo" + ), tidyr::tibble( person_id = 1:3, name = c("Jiena McLellan", "John Smith", "Huxley Ratcliffe"), @@ -133,42 +133,42 @@ test_that("data_to_wide, values_fill works", { ) ) expect_error( - contacts %>% - data_to_wide( - names_from = "field", - values_from = "value", - values_fill = 1 - ), + data_to_wide( + contacts, + names_from = "field", + values_from = "value", + values_fill = 1 + ), regexp = "must be of type character" ) expect_error( - contacts %>% - data_to_wide( - names_from = "field", - values_from = "value", - values_fill = factor("a") - ), + data_to_wide( + contacts, + names_from = "field", + values_from = "value", + values_fill = factor("a") + ), regexp = "must be of type character" ) ### Should be factor contacts$value <- as.factor(contacts$value) expect_error( - contacts %>% - data_to_wide( - names_from = "field", - values_from = "value", - values_fill = "a" - ), + data_to_wide( + contacts, + names_from = "field", + values_from = "value", + values_fill = "a" + ), regexp = "must be of type factor" ) expect_error( - contacts %>% - data_to_wide( - names_from = "field", - values_from = "value", - values_fill = 1 - ), + data_to_wide( + contacts, + names_from = "field", + values_from = "value", + values_fill = 1 + ), regexp 
= "must be of type factor" ) }) @@ -177,12 +177,12 @@ test_that("data_to_wide, values_fill errors when length > 1", { skip_if_not_installed("tidyr") expect_error( - tidyr::fish_encounters %>% - data_to_wide( - names_from = "station", - values_from = "seen", - values_fill = c(1, 2) - ), + data_to_wide( + tidyr::fish_encounters, + names_from = "station", + values_from = "seen", + values_fill = c(1, 2) + ), regexp = "must be of length 1" ) }) @@ -302,15 +302,19 @@ test_that("data_to_wide, id_cols works correctly, #293", { test_that("data_to_wide equivalent to pivot_wider: ex 1", { skip_if_not_installed("tidyr") - x <- tidyr::fish_encounters %>% - tidyr::pivot_wider(names_from = "station", values_from = "seen", values_fill = 0) + x <- tidyr::pivot_wider( + tidyr::fish_encounters, + names_from = "station", + values_from = "seen", + values_fill = 0 + ) - y <- tidyr::fish_encounters %>% - data_to_wide( - names_from = "station", - values_from = "seen", - values_fill = 0 - ) + y <- data_to_wide( + tidyr::fish_encounters, + names_from = "station", + values_from = "seen", + values_fill = 0 + ) expect_equal(x, y, ignore_attr = TRUE) }) @@ -394,11 +398,8 @@ test_that("data_to_wide equivalent to pivot_wider: ex 5", { ) contacts$person_id <- cumsum(contacts$field == "name") - x <- contacts %>% - tidyr::pivot_wider(names_from = field, values_from = value) - - y <- contacts %>% - data_to_wide(names_from = "field", values_from = "value") + x <- tidyr::pivot_wider(contacts, names_from = field, values_from = value) + y <- data_to_wide(contacts, names_from = "field", values_from = "value") expect_identical(x, y) })