diff --git a/.Rbuildignore b/.Rbuildignore index 7791a8d..65fea1a 100755 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,4 +2,6 @@ ^\.Rproj\.user$ ^\.travis ^README\.Rmd$ -^\.git$ \ No newline at end of file +^\.git$ +^\.github$ +^cran-comments\.md$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml new file mode 100644 index 0000000..ce702bd --- /dev/null +++ b/.github/workflows/R-CMD-check.yml @@ -0,0 +1,83 @@ +on: + push: + branches: + - master + - devel + pull_request: + branches: + - master + schedule: + - cron: "0 0 * * 1" + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + if: "!contains(github.event.head_commit.message, 'ci skip')" + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + #- {os: windows-latest, r: 'release'} + #- {os: macOS-latest, r: 'release'} + - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} + - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + RSPM: ${{ matrix.config.rspm }} + + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@master + with: + r-version: ${{ matrix.config.r }} + + - uses: r-lib/actions/setup-pandoc@master + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Restore R package cache + if: runner.os != 'Windows' + uses: actions/cache@v2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: Install system dependencies + if: runner.os == 'Linux' + run: | + while read -r cmd + do + eval sudo $cmd + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') + + - name: Install dependencies + run: | + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("rcmdcheck") + shell: Rscript {0} + + - name: Check + env: + _R_CHECK_CRAN_INCOMING_REMOTE_: false + run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") + shell: Rscript {0} + + - name: Upload check results + if: failure() + uses: actions/upload-artifact@master + with: + name: ${{ runner.os }}-r${{ matrix.config.r }}-results + path: check + diff --git a/.github/workflows/test_coverage.yml b/.github/workflows/test_coverage.yml new file mode 100644 index 0000000..6ee47ce --- /dev/null +++ b/.github/workflows/test_coverage.yml @@ -0,0 +1,54 @@ +on: + push: + branches: + - master + - devel + pull_request: + branches: + - master + +name: test-coverage + +jobs: + test-coverage: + runs-on: ubuntu-latest + if: "!contains(github.event.head_commit.message, 'ci skip')" + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@master + + - uses: r-lib/actions/setup-pandoc@master + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), "depends.Rds", version = 2) + shell: Rscript {0} + + - name: Cache R packages + uses: actions/cache@v1 + with: + path: ${{ env.R_LIBS_USER }} + key: ubuntu-r-release-${{ hashFiles('depends.Rds') }} + restore-keys: ubuntu-r-release- + + - name: Install system dependencies + if: runner.os == 'Linux' + run: | + while read -r cmd + do + eval sudo $cmd + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') + + - name: Install dependencies + run: | + install.packages(c("remotes")) + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("covr") + shell: Rscript {0} + + - name: Test coverage + run: covr::codecov() + shell: Rscript {0} + diff --git a/DESCRIPTION b/DESCRIPTION index d89ff55..d554431 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,14 +1,22 @@ Package: dynfeature Type: Package -Title: Dynamic feature importance -Version: 1.0.0.9000 -Authors@R: c( - person("Robrecht", "Cannoodt", email = "rcannood@gmail.com", role = c("aut")), - person("Wouter", "Saelens", email = "wouter.saelens@ugent.be", role = c("aut", "cre"))) -Description: Calculating feature importance scores from trajectories +Title: Feature Importance for Dynamic Processes +Version: 1.0.0 +Authors@R: + c(person(given = "Robrecht", + family = "Cannoodt", + role = c("aut", "cre"), + email = "rcannood@gmail.com", + comment = c(ORCID = "0000-0003-3641-729X", github = "rcannood")), + person(given = "Wouter", + family = "Saelens", + role = "aut", + email = "wouter.saelens@gmail.com", + comment = c(ORCID = "0000-0002-7114-6248", github = "zouter"))) +Description: Calculating feature importance scores from trajectories using the random forests algorithm and more. Saelens and Cannoodt et + al. (2019) . License: GPL-3 Encoding: UTF-8 -LazyData: true Imports: dplyr, dynutils (>= 1.0.2), @@ -22,19 +30,7 @@ Imports: tidyr, tibble Suggests: - caret -RoxygenNote: 6.1.1 + caret, + covr +RoxygenNote: 7.1.1 Roxygen: list(markdown = TRUE) -Remotes: - dynverse/dynutils@devel, - dynverse/dynwrap@devel -Collate: - 'calculate_feature_importances.R' - 'calculate_branch_feature_importance.R' - 'calculate_branching_point_feature_importance.R' - 'calculate_cell_feature_importance.R' - 'calculate_milestone_feature_importance.R' - 'calculate_overall_feature_importance.R' - 'calculate_waypoint_feature_importance.R' - 'fi_methods.R' - 'package.R' diff --git a/NAMESPACE b/NAMESPACE index dbba772..8d78b31 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,11 +9,11 @@ export(calculate_waypoint_feature_importance) export(fi_caret) export(fi_ranger_rf) export(fi_ranger_rf_lite) +export(fi_ranger_rf_tiny) import(dplyr) import(dynutils) import(dynwrap) import(methods) -import(tibble) import(tidyr) importFrom(purrr,"%>%") importFrom(purrr,invoke) @@ -28,3 +28,6 @@ importFrom(purrr,map_lgl) importFrom(purrr,set_names) importFrom(ranger,ranger) importFrom(reshape2,acast) +importFrom(tibble,as_tibble) +importFrom(tibble,tibble) +importFrom(tibble,tribble) diff --git a/inst/NEWS.md b/NEWS.md similarity index 89% rename from inst/NEWS.md rename to NEWS.md index 5c5858a..f7f58e8 100644 --- a/inst/NEWS.md +++ b/NEWS.md @@ -1,8 +1,12 @@ -# dynfeature 1.0.0 (28-03-2019) +# dynfeature 1.0.0 + +Initial release of dynfeature on CRAN. * MINOR CHANGE: Use only one core by default. -* MINOR CHANGE: Support sparse matrices +* MINOR CHANGE: Support sparse matrices. + +* DOCUMENTATION: Add examples and returns. # dynfeature 0.2.0 (25-10-2018) diff --git a/R/calculate_branch_feature_importance.R b/R/calculate_branch_feature_importance.R index 0a07c6f..008e21d 100644 --- a/R/calculate_branch_feature_importance.R +++ b/R/calculate_branch_feature_importance.R @@ -1,52 +1,46 @@ #' @rdname calculate_overall_feature_importance #' #' @export -#' -#' @include calculate_feature_importances.R -calculate_branch_feature_importance <- inherit_default_params( - super_functions = list(calculate_feature_importances), - fun = function( - trajectory, - expression_source = "expression", - fi_method, - verbose - ) { - # assign name to each edge - milestone_network <- - trajectory$milestone_network %>% - mutate(edge_id = as.character(row_number())) %>% - select(from, to, edge_id) +calculate_branch_feature_importance <- function( + trajectory, + expression_source = "expression", + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) { + # assign name to each edge + milestone_network <- + trajectory$milestone_network %>% + mutate(edge_id = as.character(row_number())) %>% + select(.data$from, .data$to, .data$edge_id) - # determine which cell is part of which edge - edge_membership <- - trajectory$progressions %>% - group_by(cell_id) %>% - top_n(1, percentage) %>% - ungroup() %>% - left_join(milestone_network, c("from", "to")) %>% - reshape2::acast(cell_id ~ edge_id, value.var = "percentage") %>% - {!is.na(.)} + # determine which cell is part of which edge + edge_membership <- + trajectory$progressions %>% + group_by(.data$cell_id) %>% + top_n(1, .data$percentage) %>% + ungroup() %>% + left_join(milestone_network, c("from", "to")) %>% + mutate(contains = TRUE) %>% + reshape2::acast(cell_id ~ edge_id, value.var = "contains", fill = FALSE) - expression <- get_expression(trajectory, expression_source) + expression <- get_expression(trajectory, expression_source) + out <- calculate_feature_importances( + X = expression, + Y = edge_membership, + fi_method = fi_method, + verbose = verbose + ) + suppressWarnings({ + out <- out %>% + left_join(milestone_network, c("predictor_id" = "edge_id")) + }) - out <- calculate_feature_importances( - X = expression, - Y = edge_membership, - fi_method = fi_method, - verbose = verbose + out %>% + transmute( + .data$feature_id, + from = factor(.data$from, trajectory$milestone_ids), + to = factor(.data$to, trajectory$milestone_ids), + .data$importance ) - suppressWarnings({ - out <- out %>% - left_join(milestone_network, c("predictor_id" = "edge_id")) - }) - - out %>% - transmute( - feature_id, - from = factor(from, trajectory$milestone_ids), - to = factor(to, trajectory$milestone_ids), - importance - ) - } -) \ No newline at end of file +} \ No newline at end of file diff --git a/R/calculate_branching_point_feature_importance.R b/R/calculate_branching_point_feature_importance.R index 06833cc..b55b152 100644 --- a/R/calculate_branching_point_feature_importance.R +++ b/R/calculate_branching_point_feature_importance.R @@ -1,70 +1,65 @@ #' @rdname calculate_overall_feature_importance #' #' @export -#' -#' @include calculate_feature_importances.R -calculate_branching_point_feature_importance <- inherit_default_params( - super_functions = list(calculate_feature_importances), - fun = function( - trajectory, - expression_source = "expression", - milestones_oi = trajectory$milestone_ids, - fi_method, - verbose - ) { - # assign name to each edge - milestone_network <- - trajectory$milestone_network %>% - mutate(edge_id = as.character(row_number())) %>% - select(from, to, edge_id) +calculate_branching_point_feature_importance <- function( + trajectory, + expression_source = "expression", + milestones_oi = trajectory$milestone_ids, + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) { + # assign name to each edge + milestone_network <- + trajectory$milestone_network %>% + mutate(edge_id = as.character(row_number())) %>% + select(.data$from, .data$to, .data$edge_id) - # determine which cell is part of which edge - edge_membership <- trajectory$progressions %>% - group_by(cell_id) %>% - top_n(1, percentage) %>% - ungroup() %>% - left_join(milestone_network, c("from", "to")) %>% - reshape2::acast(cell_id~edge_id, value.var="percentage") %>% - {!is.na(.)} + # determine which cell is part of which edge + edge_membership <- trajectory$progressions %>% + group_by(.data$cell_id) %>% + top_n(1, .data$percentage) %>% + ungroup() %>% + left_join(milestone_network, c("from", "to")) %>% + mutate(contains = TRUE) %>% + reshape2::acast(cell_id~edge_id, value.var = "contains", fill = FALSE) - expression <- get_expression(trajectory, expression_source) + expression <- get_expression(trajectory, expression_source) - map_df( - seq_along(milestones_oi), - function(i) { - if (verbose) cat("Processing milestone ", i , "/", length(milestones_oi), "\n", sep = "") + map_df( + seq_along(milestones_oi), + function(i) { + if (verbose) cat("Processing milestone ", i , "/", length(milestones_oi), "\n", sep = "") - milestone_oi <- milestones_oi[[i]] + milestone_oi <- milestones_oi[[i]] - # select the cells which are close to the milestone - prog <- - trajectory$progressions %>% - filter(from == milestone_oi | to == milestone_oi) %>% - mutate(milestone_other = ifelse(from == milestone_oi, to, from)) %>% - group_by(cell_id) %>% - top_n(1, percentage) %>% - ungroup() + # select the cells which are close to the milestone + prog <- + trajectory$progressions %>% + filter(.data$from == milestone_oi | .data$to == milestone_oi) %>% + mutate(milestone_other = ifelse(.data$from == milestone_oi, .data$to, .data$from)) %>% + group_by(.data$cell_id) %>% + top_n(1, .data$percentage) %>% + ungroup() - expression_oi <- expression[prog$cell_id, ] - outcome <- as.character(prog$milestone_other) + expression_oi <- expression[prog$cell_id, ] + outcome <- as.character(prog$milestone_other) - if(length(unique(prog$milestone_other)) < 2) { - warning("Could not find features specific for milestone ", milestone_oi) - tibble() - } else { - calculate_feature_importances( - X = expression_oi, - Y = outcome, - fi_method = fi_method, - verbose = verbose - ) %>% - transmute( - milestone_id = factor(milestone_oi, levels = trajectory$milestone_ids), - feature_id, - importance - ) - } + if(length(unique(prog$milestone_other)) < 2) { + warning("Could not find features specific for milestone ", milestone_oi) + tibble() + } else { + calculate_feature_importances( + X = expression_oi, + Y = outcome, + fi_method = fi_method, + verbose = verbose + ) %>% + transmute( + milestone_id = factor(milestone_oi, levels = trajectory$milestone_ids), + .data$feature_id, + .data$importance + ) } - ) - } -) \ No newline at end of file + } + ) +} \ No newline at end of file diff --git a/R/calculate_cell_feature_importance.R b/R/calculate_cell_feature_importance.R index 10ceefc..3679029 100644 --- a/R/calculate_cell_feature_importance.R +++ b/R/calculate_cell_feature_importance.R @@ -1,44 +1,38 @@ #' @rdname calculate_overall_feature_importance #' #' @export -#' -#' @include calculate_feature_importances.R -calculate_cell_feature_importance <- inherit_default_params( - super_functions = list(calculate_feature_importances), - fun = function( - trajectory, - expression_source = "expression", - fi_method, - verbose - ) { - if (!is_wrapper_with_waypoints(trajectory)) { - trajectory <- trajectory %>% dynwrap::add_waypoints() - } +calculate_cell_feature_importance <- function( + trajectory, + expression_source = "expression", + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) { + if (!is_wrapper_with_waypoints(trajectory)) { + trajectory <- trajectory %>% dynwrap::add_waypoints() + } - waypoints <- trajectory$waypoints + waypoints <- trajectory$waypoints - waypoint_feature_importances <- - calculate_waypoint_feature_importance( - trajectory = trajectory, - expression_source = expression_source, - waypoints = waypoints, - fi_method = fi_method, - verbose = verbose - ) + waypoint_feature_importances <- + calculate_waypoint_feature_importance( + trajectory = trajectory, + expression_source = expression_source, + waypoints = waypoints, + fi_method = fi_method, + verbose = verbose + ) - closest_waypoints <- - trajectory$waypoints$geodesic_distances %>% { - tibble( - cell_id = factor(colnames(.), levels = colnames(.)), - waypoint_id = factor(rownames(.)[apply(., 2, which.min)], levels = waypoints$waypoints$waypoint_id %>% sort) - ) - } + geo <- trajectory$waypoints$geodesic_distances - full_join( - closest_waypoints, - waypoint_feature_importances, - by = "waypoint_id" - ) %>% - select(-waypoint_id) - } -) \ No newline at end of file + closest_waypoints <- tibble( + cell_id = factor(colnames(geo), levels = colnames(geo)), + waypoint_id = factor(rownames(geo)[apply(geo, 2, which.min)], levels = waypoints$waypoints$waypoint_id %>% sort) + ) + + full_join( + closest_waypoints, + waypoint_feature_importances, + by = "waypoint_id" + ) %>% + select(-.data$waypoint_id) +} \ No newline at end of file diff --git a/R/calculate_feature_importances.R b/R/calculate_feature_importances.R index 9836c56..5123a7c 100644 --- a/R/calculate_feature_importances.R +++ b/R/calculate_feature_importances.R @@ -4,14 +4,28 @@ #' @param Y A data frame of predictor variables, with `nrow(Y) == nrow(X)`. #' @param fi_method A feature importance method. Default: `fi_ranger_rf_lite()`. Check `?fi_methods` for a full list of available feature importance methods. #' @param verbose Whether to print out extra information. -calculate_feature_importances <- function(X, Y, fi_method = fi_ranger_rf_lite(), verbose = FALSE) { +#' +#' @returns A data frame with three columns, `predictor_id`, `feature_id`, and `importance`. `predictor_id` is a column in `Y`, while `feature_id` is a column in `X`. +#' +#' @examples +#' X <- data.frame(matrix(runif(25*10), ncol = 10)) +#' Y <- data.frame(matrix(runif(25*2), ncol = 2)) +#' +#' # don't run since this function is not exported +#' # calculate_feature_importances(X, Y) +calculate_feature_importances <- function( + X, + Y, + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) { # if Y is a vector or a matrix, turn it into a data frame if (!is.data.frame(Y)) { # convert to regular matrix if sparse if (dynutils::is_sparse(Y)) { Y <- as.matrix(Y) } - Y <- as_data_frame(Y) + Y <- as_tibble(Y, .name_repair = "minimal") } # convert expression to regular matrix if sparse @@ -36,7 +50,7 @@ calculate_feature_importances <- function(X, Y, fi_method = fi_ranger_rf_lite(), importance <- fi_method$fun(X, y, verbose = verbose) } - data_frame( + tibble( predictor_id = factor(colnames(Y)[[i]], levels = colnames(Y)), feature_id = factor(names(importance), levels = names(importance)), importance @@ -45,5 +59,5 @@ calculate_feature_importances <- function(X, Y, fi_method = fi_ranger_rf_lite(), # return importances ordered by value importances %>% - arrange(desc(importance)) + arrange(desc(.data$importance)) } \ No newline at end of file diff --git a/R/calculate_milestone_feature_importance.R b/R/calculate_milestone_feature_importance.R index d362bdb..8c2f178 100644 --- a/R/calculate_milestone_feature_importance.R +++ b/R/calculate_milestone_feature_importance.R @@ -1,50 +1,45 @@ #' @rdname calculate_overall_feature_importance #' #' @export -#' -#' @include calculate_feature_importances.R -calculate_milestone_feature_importance <- inherit_default_params( - super_functions = list(calculate_feature_importances), - fun = function( - trajectory, - expression_source = "expression", - milestones_oi = NULL, - fi_method, - verbose - ) { - # get expression from trajectory source or expression source - expression <- get_expression(trajectory, expression_source) +calculate_milestone_feature_importance <- function( + trajectory, + expression_source = "expression", + milestones_oi = NULL, + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) { + # get expression from trajectory source or expression source + expression <- get_expression(trajectory, expression_source) - # check trajectory and extract some variables - testthat::expect_true(dynwrap::is_wrapper_with_trajectory(trajectory)) - milestone_percentages <- trajectory$milestone_percentages - cell_ids <- trajectory$cell_ids + # check trajectory and extract some variables + testthat::expect_true(dynwrap::is_wrapper_with_trajectory(trajectory)) + milestone_percentages <- trajectory$milestone_percentages + cell_ids <- trajectory$cell_ids - testthat::expect_true(all(cell_ids %in% rownames(expression))) - testthat::expect_true(length(cell_ids) >= 3, info = "Need 3 or more cells in a trajectory to determine important features") + testthat::expect_true(all(cell_ids %in% rownames(expression))) + testthat::expect_true(length(cell_ids) >= 3, info = "Need 3 or more cells in a trajectory to determine important features") - # process milestones - if (is.null(milestones_oi)) { - milestones_oi <- trajectory$milestone_ids - } + # process milestones + if (is.null(milestones_oi)) { + milestones_oi <- trajectory$milestone_ids + } - # construct milestone percentages matrix: - # * only retain relevant milestones - # * transform from long format to matrix - # * if cells were filtered out, add them back to the matrix - milenet_m <- - milestone_percentages %>% - filter(milestone_id %in% milestones_oi) %>% - reshape2::acast(cell_id ~ milestone_id, value.var = "percentage", fill = 0) %>% - expand_matrix(rownames = cell_ids) + # construct milestone percentages matrix: + # * only retain relevant milestones + # * transform from long format to matrix + # * if cells were filtered out, add them back to the matrix + milenet_m <- + milestone_percentages %>% + filter(.data$milestone_id %in% milestones_oi) %>% + reshape2::acast(cell_id ~ milestone_id, value.var = "percentage", fill = 0) %>% + expand_matrix(rownames = cell_ids) - # calculate feature importance scores - calculate_feature_importances( - X = expression, - Y = milenet_m, - fi_method = fi_method, - verbose = verbose - ) %>% - rename(milestone_id = predictor_id) - } -) \ No newline at end of file + # calculate feature importance scores + calculate_feature_importances( + X = expression, + Y = milenet_m, + fi_method = fi_method, + verbose = verbose + ) %>% + rename(milestone_id = .data$predictor_id) +} \ No newline at end of file diff --git a/R/calculate_overall_feature_importance.R b/R/calculate_overall_feature_importance.R index baa0e90..77f2fb4 100644 --- a/R/calculate_overall_feature_importance.R +++ b/R/calculate_overall_feature_importance.R @@ -12,6 +12,8 @@ #' @param milestones_oi The milestone(s) for which to calculate feature importance #' @param waypoints The waypoints, optional #' +#' @returns A data frame with two or more columns, `feature_id`, and `importance`. `feature_id` is a column in the trajectory expression matrix. Additional columns may be available depending on the function called. +#' #' @inheritParams calculate_feature_importances #' #' @importFrom reshape2 acast @@ -19,23 +21,24 @@ #' #' @export #' -#' @include calculate_feature_importances.R -calculate_overall_feature_importance <- inherit_default_params( - super_functions = list(calculate_feature_importances), - fun = function( - trajectory, - expression_source = "expression", - fi_method, - verbose - ) { - calculate_milestone_feature_importance( - trajectory = trajectory, - expression_source = expression_source, - fi_method = fi_method, - verbose = verbose - ) %>% - group_by(feature_id) %>% - summarise(importance = mean(importance)) %>% - arrange(desc(importance)) - } -) \ No newline at end of file +#' @examples +#' library(dynwrap) +#' data(example_trajectory) +#' +#' calculate_overall_feature_importance(example_trajectory) +calculate_overall_feature_importance <- function( + trajectory, + expression_source = "expression", + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) { + calculate_milestone_feature_importance( + trajectory = trajectory, + expression_source = expression_source, + fi_method = fi_method, + verbose = verbose + ) %>% + group_by(.data$feature_id) %>% + summarise(importance = mean(.data$importance)) %>% + arrange(desc(.data$importance)) +} \ No newline at end of file diff --git a/R/calculate_waypoint_feature_importance.R b/R/calculate_waypoint_feature_importance.R index 02d321f..a5463a5 100644 --- a/R/calculate_waypoint_feature_importance.R +++ b/R/calculate_waypoint_feature_importance.R @@ -1,34 +1,29 @@ #' @rdname calculate_overall_feature_importance #' #' @export -#' -#' @include calculate_feature_importances.R -calculate_waypoint_feature_importance <- inherit_default_params( - super_functions = list(calculate_feature_importances), - fun = function( - trajectory, - expression_source = "expression", - waypoints = NULL, - fi_method, - verbose - ) { - if (is.null(waypoints)) { - if (!is_wrapper_with_waypoints(trajectory)) { - message("Adding waypoints to prediction") - trajectory <- trajectory %>% dynwrap::add_waypoints() - } - - waypoints <- trajectory$waypoints +calculate_waypoint_feature_importance <- function( + trajectory, + expression_source = "expression", + waypoints = NULL, + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) { + if (is.null(waypoints)) { + if (!is_wrapper_with_waypoints(trajectory)) { + message("Adding waypoints to prediction") + trajectory <- trajectory %>% dynwrap::add_waypoints() } - expression <- get_expression(trajectory, expression_source) - - calculate_feature_importances( - X = expression, - Y = t(waypoints$geodesic_distances)[rownames(expression),], - fi_method = fi_method, - verbose = verbose - ) %>% - rename(waypoint_id = predictor_id) + waypoints <- trajectory$waypoints } -) \ No newline at end of file + + expression <- get_expression(trajectory, expression_source) + + calculate_feature_importances( + X = expression, + Y = t(waypoints$geodesic_distances)[rownames(expression),unique(rownames(waypoints$geodesic_distances))], + fi_method = fi_method, + verbose = verbose + ) %>% + rename(waypoint_id = .data$predictor_id) +} \ No newline at end of file diff --git a/R/fi_methods.R b/R/fi_methods.R index ad56084..c3a6701 100644 --- a/R/fi_methods.R +++ b/R/fi_methods.R @@ -14,11 +14,19 @@ apply_function_params <- function(params, nrow, ncol) { #' @param num_variables_per_split (fi_ranger_rf_lite) The number of variables to sample per split #' @param num_samples_per_tree (fi_ranger_rf_lite) The number of samples to bootstrap per split #' @param min_node_size (fi_ranger_rf_lite) The minimum node size, no split will be made if the node size is less than this value. -#' @param ... Extra parameters to pass onto the underlying feature importnce function. +#' @param ... Extra parameters to pass onto the underlying feature importance function. +#' +#' @returns A list containing a helper function for calling a feature importance function. #' #' @rdname fi_methods #' #' @export +#' +#' @examples +#' library(dynwrap) +#' data(example_trajectory) +#' +#' calculate_overall_feature_importance(example_trajectory, fi_method = fi_ranger_rf()) fi_ranger_rf_lite <- function( num_trees = 2000, num_variables_per_split = 50, @@ -116,7 +124,9 @@ fi_caret <- function( model <- do.call(caret::train, method_params) - caret::varImp(model)[[1]] %>% {set_names(.[, 1], rownames(.))} + vi <- caret::varImp(model)[[1]] + + set_names(vi[, 1], rownames(vi)) } ) } diff --git a/R/package.R b/R/package.R index be0c517..2958831 100755 --- a/R/package.R +++ b/R/package.R @@ -1,9 +1,11 @@ -#' Dynfeature feature importance +#' Feature Importance for Dynamic Processes +#' +#' Calculating feature importance scores from trajectories using the random forests algorithm. #' #' @import dplyr #' @import tidyr #' @import methods -#' @import tibble +#' @importFrom tibble tibble tribble as_tibble #' @import dynutils #' @import dynwrap #' @importFrom purrr %>% map map_df map_chr map_lgl map_int map_dbl keep set_names list_modify invoke diff --git a/README.Rmd b/README.Rmd index d5d9f5b..c302767 100755 --- a/README.Rmd +++ b/README.Rmd @@ -23,12 +23,11 @@ The plotting of the top features is nicely intergrated into [dynplot](https://gi ## Latest changes -Check out `news(package = "dynwrap")` or [NEWS.md](inst/NEWS.md) for a full list of changes. +Check out `news(package = "dynwrap")` or [NEWS.md](NEWS.md) for a full list of changes. - + ```{r news, echo=FALSE, results="asis"} -dynutils::update_news() cat(dynutils::recent_news()) ``` diff --git a/README.md b/README.md index a6b2a52..f21ae7e 100755 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ heatmap](https://raw.githubusercontent.com/dynverse/dynplot/devel/.readme_files/ ## Latest changes -Check out `news(package = "dynwrap")` or [NEWS.md](inst/NEWS.md) for a +Check out `news(package = "dynwrap")` or [NEWS.md](NEWS.md) for a full list of changes. diff --git a/cran-comments.md b/cran-comments.md new file mode 100644 index 0000000..093ddf6 --- /dev/null +++ b/cran-comments.md @@ -0,0 +1,26 @@ +# dynfeature 1.0.0 + +Initial release of dynfeature on CRAN. + +* MINOR CHANGE: Use only one core by default. + +* MINOR CHANGE: Support sparse matrices. + +* DOCUMENTATION: Add examples and returns. + +## Test environments +* local Fedora install, R 4.0 +* ubuntu 20.04, mac os x, windows (on github actions), R 4.0 +* win-builder (devel and release) + +## R CMD check results + +``` +── R CMD check results ─────────────────────────────────── dynfeature 1.0.0 ──── +Duration: 3m 0.1s + +0 errors ✓ | 0 warnings ✓ | 0 notes ✓ + +R CMD check succeeded +``` + diff --git a/inst/NEWS b/inst/NEWS deleted file mode 100644 index 5cd73e6..0000000 --- a/inst/NEWS +++ /dev/null @@ -1,31 +0,0 @@ -dynfeature 1.0.0 (28-03-2019) - -* MINOR CHANGE: Use only one core by default. - -* MINOR CHANGE: Support sparse matrices - -dynfeature 0.2.0 (25-10-2018) - -* SPEED UP: Added `fi_ranger_rf_lite()`, which scales much better w.r.t. the number of samples and features, - at the cost of increasing loss of accuracy at higher dimension sizes. - -* MAJOR CHANGES: Large cleanup of the code. Most notably, - - The format of feature importance method specification and its parameters, - with format `fi_method = fi_example_method(param1 = 10, param2 = 4)`. - Before, it had to be specified as `method = "example_method", method_params = list(param1 = 10, param2 = 4)`. - -* MINOR CHANGE: Whenever possible, output columns are now factors instead of characters. - -* MINOR CHANGE: Add NEWS, and add news section to README. - -* DOCUMENTATION: Turned on markdown for Roxygen. - -* DOCUMENTATION: Improved documentation on expression_source. - -* TESTING: Improved testing with a larger dataset, and will check whether the overall feature importance produces decent results. - -* MINOR CHANGE: Feature importance functions will always return factors instead of characters. - -dynfeature 0.1.0 (26-04-2018) - -* INITIAL RELEASE: dynfeature, calculating feature importance scores from trajectories. diff --git a/man/calculate_feature_importances.Rd b/man/calculate_feature_importances.Rd index a6f984d..ad201ed 100644 --- a/man/calculate_feature_importances.Rd +++ b/man/calculate_feature_importances.Rd @@ -4,8 +4,12 @@ \alias{calculate_feature_importances} \title{Calculate feature importance scores} \usage{ -calculate_feature_importances(X, Y, fi_method = fi_ranger_rf_lite(), - verbose = FALSE) +calculate_feature_importances( + X, + Y, + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) } \arguments{ \item{X}{A data frame containing the features as columns.} @@ -16,6 +20,16 @@ calculate_feature_importances(X, Y, fi_method = fi_ranger_rf_lite(), \item{verbose}{Whether to print out extra information.} } +\value{ +A data frame with three columns, \code{predictor_id}, \code{feature_id}, and \code{importance}. \code{predictor_id} is a column in \code{Y}, while \code{feature_id} is a column in \code{X}. +} \description{ Calculate feature importance scores } +\examples{ +X <- data.frame(matrix(runif(25*10), ncol = 10)) +Y <- data.frame(matrix(runif(25*2), ncol = 2)) + +# don't run since this function is not exported +# calculate_feature_importances(X, Y) +} diff --git a/man/calculate_overall_feature_importance.Rd b/man/calculate_overall_feature_importance.Rd index 42fc9d6..f5c548b 100755 --- a/man/calculate_overall_feature_importance.Rd +++ b/man/calculate_overall_feature_importance.Rd @@ -14,30 +14,50 @@ \alias{calculate_waypoint_feature_importance} \title{Calculating feature importances across trajectories} \usage{ -calculate_branch_feature_importance(trajectory, - expression_source = "expression", fi_method = fi_ranger_rf_lite(), - verbose = FALSE) +calculate_branch_feature_importance( + trajectory, + expression_source = "expression", + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) -calculate_branching_point_feature_importance(trajectory, +calculate_branching_point_feature_importance( + trajectory, expression_source = "expression", milestones_oi = trajectory$milestone_ids, - fi_method = fi_ranger_rf_lite(), verbose = FALSE) + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) -calculate_cell_feature_importance(trajectory, - expression_source = "expression", fi_method = fi_ranger_rf_lite(), - verbose = FALSE) +calculate_cell_feature_importance( + trajectory, + expression_source = "expression", + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) -calculate_milestone_feature_importance(trajectory, - expression_source = "expression", milestones_oi = NULL, - fi_method = fi_ranger_rf_lite(), verbose = FALSE) +calculate_milestone_feature_importance( + trajectory, + expression_source = "expression", + milestones_oi = NULL, + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) -calculate_overall_feature_importance(trajectory, - expression_source = "expression", fi_method = fi_ranger_rf_lite(), - verbose = FALSE) +calculate_overall_feature_importance( + trajectory, + expression_source = "expression", + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) -calculate_waypoint_feature_importance(trajectory, - expression_source = "expression", waypoints = NULL, - fi_method = fi_ranger_rf_lite(), verbose = FALSE) +calculate_waypoint_feature_importance( + trajectory, + expression_source = "expression", + waypoints = NULL, + fi_method = fi_ranger_rf_lite(), + verbose = FALSE +) } \arguments{ \item{trajectory}{A trajectory object containing expression values and a trajectory.} @@ -57,8 +77,17 @@ calculate_waypoint_feature_importance(trajectory, \item{waypoints}{The waypoints, optional} } +\value{ +A data frame with two or more columns, \code{feature_id}, and \code{importance}. \code{feature_id} is a column in the trajectory expression matrix. Additional columns may be available depending on the function called. +} \description{ Uses the feature importance measures of \code{\link[ranger]{ranger}} or \code{caret}. \code{calculate_overall_feature_importance} calculates the importance for the whole trajectory, \code{calculate_milestone_feature_importance} calculates it for individual milestones (e.g. branching points) } +\examples{ +library(dynwrap) +data(example_trajectory) + +calculate_overall_feature_importance(example_trajectory) +} diff --git a/man/dynfeature.Rd b/man/dynfeature.Rd index 6c23920..74d7cec 100755 --- a/man/dynfeature.Rd +++ b/man/dynfeature.Rd @@ -3,8 +3,7 @@ \docType{package} \name{dynfeature} \alias{dynfeature} -\alias{dynfeature-package} -\title{Dynfeature feature importance} +\title{Feature Importance for Dynamic Processes} \description{ -Dynfeature feature importance +Calculating feature importance scores from trajectories using the random forests algorithm. } diff --git a/man/fi_methods.Rd b/man/fi_methods.Rd index aafac26..ce17fe7 100644 --- a/man/fi_methods.Rd +++ b/man/fi_methods.Rd @@ -4,14 +4,28 @@ \alias{fi_ranger_rf_lite} \alias{fi_ranger_rf} \alias{fi_caret} +\alias{fi_ranger_rf_tiny} \title{Feature Importance methods} \usage{ -fi_ranger_rf_lite(num_trees = 2000, num_variables_per_split = 50, - num_samples_per_tree = 250, min_node_size = 20, ...) +fi_ranger_rf_lite( + num_trees = 2000, + num_variables_per_split = 50, + num_samples_per_tree = 250, + min_node_size = 20, + ... +) fi_ranger_rf(...) fi_caret(caret_method, ...) + +fi_ranger_rf_tiny( + num_trees = 100, + num_variables_per_split = 50, + num_samples_per_tree = 250, + min_node_size = 20, + ... +) } \arguments{ \item{num_trees}{(fi_ranger_rf_lite) The number of trees to use} @@ -22,10 +36,19 @@ fi_caret(caret_method, ...) \item{min_node_size}{(fi_ranger_rf_lite) The minimum node size, no split will be made if the node size is less than this value.} -\item{...}{Extra parameters to pass onto the underlying feature importnce function.} +\item{...}{Extra parameters to pass onto the underlying feature importance function.} \item{caret_method}{(fi_caret) Which caret method to use for feature importance.} } +\value{ +A list containing a helper function for calling a feature importance function. +} \description{ Feature Importance methods } +\examples{ +library(dynwrap) +data(example_trajectory) + +calculate_overall_feature_importance(example_trajectory, fi_method = fi_ranger_rf()) +} diff --git a/tests/testthat/test-calculate_feature_importance.R b/tests/testthat/test-calculate_feature_importance.R index 4b7c898..94ef3e4 100755 --- a/tests/testthat/test-calculate_feature_importance.R +++ b/tests/testthat/test-calculate_feature_importance.R @@ -11,8 +11,8 @@ milestone_network <- tibble::tribble( "A", "D", 1, TRUE ) -num_cells <- 200 -num_features <- 300 +num_cells <- 100 +num_features <- 20 cell_ids <- paste0("cell_", seq_len(num_cells)) feature_ids <- paste0("feature_", seq_len(num_features)) @@ -39,7 +39,7 @@ trajectory <- ) # generate module positions -num_modules <- 20 +num_modules <- 10 module_ids <- paste0("module_", seq_len(num_modules)) module_progressions <- milestone_network %>% @@ -73,7 +73,7 @@ suppressWarnings({ module_expression <- distance_from_module %>% reshape2::melt(varnames = c("module_id", "cell_id"), value.name = "distance") %>% - as_data_frame() %>% + as_tibble(.name_repair = "minimal") %>% left_join(module_progressions %>% select(-from, -to, -percentage), by = "module_id") %>% mutate( expr = dnorm(distance, mean = 0, sd = sd) * mult + basal @@ -83,7 +83,7 @@ suppressWarnings({ module_expression <- module_expression[module_ids, cell_ids] # generate gene expression -feature_info <- data_frame( +feature_info <- tibble( feature_id = feature_ids, module = sample.int(num_modules, num_features, replace = TRUE), mean = rnorm(num_features, 10, 3) %>% pmax(0), @@ -95,7 +95,7 @@ expression <- feature_info %>% group_by(feature_id) %>% purrr::pmap_df(function(feature_id, module, mean, sd, dropout) { - data_frame( + tibble( cell_id = colnames(module_expression), feature_id, expression = ifelse(runif(num_cells) < dropout, 0, module_expression[module, , drop = FALSE] + rnorm(num_cells, mean, sd))