feat: functional principal component analysis (#45)

* feat: added functional principal component analysis * feat: implement train_dt and predict_dt methods for FPCA * test: use expect_pipeop test for all implemented pipeops * docs(readme): rebuild readme * docs: add missing tf packge for fpca pipeop * fix: remove todo_comment_linter from defaults in lintr confg * fix: missing stats import and fix tests * docs: param docs for pca * docs: more description * refactor: styling * refactor: replace pcr var name with pc * docs(fpca): correct title and name * docs(readme): build readme * docs(readme): build readme * docs: fix docs * fix(fpca): make .args in invoke a list * docs: remove format section for fpca * docs for fpca * readme [skip ci] --------- Co-authored-by: Sebastian Fischer <[email protected]>
mlr-org · Mar 28, 2024 · 27dfc44 · 27dfc44
1 parent 84fa4a4
commit 27dfc44
Show file tree

Hide file tree

Showing 21 changed files with 333 additions and 42 deletions.
diff --git a/.lintr b/.lintr
@@ -1,11 +1,10 @@
 linters: linters_with_defaults(
-    # lintr defaults: https://github.com/jimhester/lintr#available-linters
+    # lintr defaults: https://lintr.r-lib.org/reference/default_linters.html
     # the following setup changes/removes certain linters
     assignment_linter = NULL, # do not force using <- for assignments
     object_name_linter = object_name_linter(c("snake_case", "CamelCase")), # only allow snake case and camel case object names
     cyclocomp_linter = NULL, # do not check function complexity
     commented_code_linter = NULL, # allow code in comments
-    todo_comment_linter = NULL, # allow todo in comments
-    line_length_linter = line_length_linter(120),
-    object_length_linter = object_length_linter(40)
+    line_length_linter = line_length_linter(120L),
+    object_length_linter = object_length_linter(40L)
     )
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -39,6 +39,7 @@ Collate:
     'PipeOpFDAFlatten.R'
     'PipeOpFDAInterpol.R'
     'PipeOpFDASmooth.R'
+    'PipeOpFPCA.R'
     'TaskClassif_phoneme.R'
     'TaskRegr_dti.R'
     'TaskRegr_fuel.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,7 @@ export(PipeOpFDAExtract)
 export(PipeOpFDAFlatten)
 export(PipeOpFDAInterpol)
 export(PipeOpFDASmooth)
+export(PipeOpFPCA)
 import(R6)
 import(checkmate)
 import(data.table)

diff --git a/R/PipeOpFDAExtract.R b/R/PipeOpFDAExtract.R
@@ -36,6 +36,7 @@
 #' @export
 #' @examples
 #' library(mlr3pipelines)
+#'
 #' task = tsk("fuel")
 #' po_fmean = po("fda.extract", features = "mean")
 #' task_fmean = po_fmean$train(list(task))[[1L]]
@@ -57,6 +58,7 @@ PipeOpFDAExtract = R6Class("PipeOpFDAExtract",
     #'   Identifier of resulting object, default is `"fda.extract"`.
     #' @param param_vals (named `list`)\cr
     #'   List of hyperparameter settings, overwriting the hyperparameter settings that would
+    #'   otherwise be set during construction. Default `list()`.
     initialize = function(id = "fda.extract", param_vals = list()) {
       param_set = ps(
         drop = p_lgl(tags = c("train", "predict", "required")),
@@ -156,14 +158,7 @@ PipeOpFDAExtract = R6Class("PipeOpFDAExtract",
       })
       fextractor = make_fextractor(features)
 
-      features = map(
-        cols,
-        function(col) {
-          x = dt[[col]]
-          invoke(fextractor, x = x, left = left, right = right)
-        }
-      )
-
+      features = map(cols, function(col) invoke(fextractor, x = dt[[col]], left = left, right = right))
       features = unlist(features, recursive = FALSE)
       features = set_names(features, feature_names)
       features = as.data.table(features)
@@ -188,19 +183,15 @@ make_fextractor = function(features) {
       upper = interval[[2L]]
 
       if (is.na(lower) || is.na(upper)) {
-        res = map(features, function(f) {
-          rep(NA_real_, length(x)) # no observation in the given interval [left, right]
-        })
+        res = map(features, function(f) rep(NA_real_, length(x))) # no observation in the given interval [left, right]
         return(res)
       }
 
       values = tf::tf_evaluations(x)
       arg = args[lower:upper]
       res = map(seq_along(x), function(i) {
         value = values[[i]]
-        map(features, function(f) {
-          f(arg = arg, value = value[lower:upper])
-        })
+        map(features, function(f) f(arg = arg, value = value[lower:upper]))
       })
       return(transform_list(res))
     }
@@ -217,9 +208,7 @@ make_fextractor = function(features) {
       if (is.na(lower) || is.na(upper)) {
         rep(NA_real_, length(features)) # no observation in the given interval [left, right]
       } else {
-        map(features, function(f) {
-          f(arg = arg[lower:upper], value = value[lower:upper])
-        })
+        map(features, function(f) f(arg = arg[lower:upper], value = value[lower:upper]))
       }
     })
     transform_list(res)

diff --git a/R/PipeOpFDAFlatten.R b/R/PipeOpFDAFlatten.R
@@ -19,6 +19,7 @@
 #' @export
 #' @examples
 #' library(mlr3pipelines)
+#'
 #' task = tsk("fuel")
 #' pop = po("fda.flatten")
 #' task_flat = pop$train(list(task))

diff --git a/R/PipeOpFDAInterpol.R b/R/PipeOpFDAInterpol.R
@@ -42,9 +42,10 @@
 #' @export
 #' @examples
 #' library(mlr3pipelines)
+#'
 #' task = tsk("fuel")
 #' pop = po("fda.interpol")
-#' task_interpol = pop$train(list(task))[[1]]
+#' task_interpol = pop$train(list(task))[[1L]]
 #' task_interpol$data()
 PipeOpFDAInterpol = R6Class("PipeOpFDAInterpol",
   inherit = mlr3pipelines::PipeOpTaskPreprocSimple,

diff --git a/R/PipeOpFDASmooth.R b/R/PipeOpFDASmooth.R
@@ -27,6 +27,7 @@
 #' @export
 #' @examples
 #' library(mlr3pipelines)
+#'
 #' task = tsk("fuel")
 #' po_smooth = po("fda.smooth", method = "rollmean", args = list(k = 5))
 #' task_smooth = po_smooth$train(list(task))[[1L]]

diff --git a/R/PipeOpFPCA.R b/R/PipeOpFPCA.R
@@ -0,0 +1,91 @@
+#' @title Functional Principal Component Analysis
+#' @name mlr_pipeops_fda.fpca
+#'
+#' @description
+#' This `PipeOp` applies a functional principal component analysis (FPCA) to functional columns and then
+#' extracts the principal components as features. This is done using a (truncated) weighted SVD.
+#'
+#' To apply this `PipeOp` to irregular data, convert it to a regular grid first using [`PipeOpFDAInterpol`].
+#'
+#' For more details, see [`tfb_fpc()`][tf::tfb_fpc], which is called internally.
+#'
+#'
+#' @section Parameters:
+#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as the following parameters:
+#' * `pve` :: `numeric(1)` \cr
+#'   The proportion of variance explained (between `0` and `1`) that should be retained. Default is `0.995`.
+#' * `n_components` :: `integer(1)` \cr
+#'   The number of principal components to extract. This parameter is initialized to `Inf`.
+#'
+#' @section Naming:
+#' The new names generally append a `_pc_{number}` to the corresponding column name.
+#' If a column was called `"x"` and there are three principal components, the corresponding
+#' new columns will be called `"x_pc_1", "x_pc_2", "x_pc_3"`.
+#'
+#' @export
+#' @examples
+#' library(mlr3pipelines)
+#'
+#' task = tsk("fuel")
+#' po_fpca = po("fda.fpca")
+#' task_fpca = po_fpca$train(list(task))[[1L]]
+#' task_fpca$data()
+PipeOpFPCA = R6Class("PipeOpFPCA",
+  inherit = mlr3pipelines::PipeOpTaskPreproc,
+  public = list(
+    #' @description Creates a new FPCA `PipeOp`.
+    #' @param id (`character(1)`)\cr
+    #'   Identifier of the resulting object; defaults to `"fda.fpca"`.
+    #' @param param_vals (named `list`)\cr
+    #'   Hyperparameter values that override the ones that would otherwise be set
+    #'   during construction. Defaults to `list()`.
+    initialize = function(id = "fda.fpca", param_vals = list()) {
+      # `pve` only declares a default, whereas the required `n_components` gets an explicit value (`Inf`) below
+      fpca_params = ps(
+        pve = p_dbl(default = 0.995, lower = 0, upper = 1, tags = "train"),
+        n_components = p_int(1L, special_vals = list(Inf), tags = c("train", "required"))
+      )
+      fpca_params$set_values(n_components = Inf)
+
+      super$initialize(
+        id = id,
+        param_set = fpca_params,
+        param_vals = param_vals,
+        packages = c("mlr3fda", "mlr3pipelines", "tf"),
+        feature_types = "tfd_reg",
+        tags = "fda"
+      )
+    }
+  ),
+  private = list(
+    # Calls `tf::tfb_fpc()` on every column, stores the results in `self$state$fpc` and returns the
+    # retained coefficients as one column per component, named `<column>_pc_<i>`.
+    .train_dt = function(dt, levels, target) {
+      train_vals = self$param_set$get_values(tags = "train")
+      # `n_components` is only used for the truncation below and is not forwarded to `tf::tfb_fpc()`
+      fpc_args = remove_named(train_vals, "n_components")
+
+      fitted = map_dtc(dt, function(fun_col, nm) invoke(tf::tfb_fpc, data = fun_col, .args = fpc_args))
+      self$state = list(fpc = fitted)
+
+      scores = imap_dtc(fitted, function(fpc_col, col_name) {
+        map(fpc_col, function(coefs) {
+          # the first entry is skipped - presumably the coefficient of the mean function in the
+          # FPC basis; NOTE(review): confirm against tf::tfb_fpc
+          last = min(train_vals$n_components + 1L, length(coefs))
+          kept = as.list(coefs[2:last])
+          set_names(kept, sprintf("%s_pc_%d", col_name, seq_along(kept)))
+        })
+      })
+      unnest(scores, colnames(scores))
+    },
+
+    # Rebases every column onto the matching entry of `self$state$fpc` stored during training and
+    # returns the retained coefficients under the same `<column>_pc_<i>` naming scheme.
+    .predict_dt = function(dt, levels) {
+      all_vals = self$param_set$get_values()
+
+      scores = imap_dtc(dt, function(fun_col, col_name) {
+        trained = self$state$fpc[[col_name]]
+        rebased = tf::tf_rebase(fun_col, trained, arg = tf::tf_arg(fun_col))
+        map(rebased, function(coefs) {
+          # same truncation as in training: drop the first entry, keep at most `n_components` more
+          last = min(all_vals$n_components + 1L, length(coefs))
+          kept = as.list(coefs[2:last])
+          set_names(kept, sprintf("%s_pc_%d", col_name, seq_along(kept)))
+        })
+      })
+      unnest(scores, colnames(scores))
+    }
+  )
+)
+
+#' @include zzz.R
+register_po("fda.fpca", PipeOpFPCA)
diff --git a/R/TaskRegr_dti.R b/R/TaskRegr_dti.R
@@ -36,7 +36,7 @@ load_task_dti = function(id = "dti") {
     rcst = tf::tfd(dti$rcst, arg = seq(0L, 1L, length.out = 55L)),
     sex = dti$sex
   )
-  dti = na.omit(dti)
+  dti = stats::na.omit(dti)
   b = as_data_backend(dti)
 
   task = TaskRegr$new(

diff --git a/README.Rmd b/README.Rmd
@@ -25,7 +25,6 @@ Package Website: [dev](https://mlr3fda.mlr-org.com/)
 Extending mlr3 to functional data.
 
 <!-- badges: start -->
-[![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental)
 [![RCMD Check](https://github.com/mlr-org/mlr3fda/actions/workflows/rcmdcheck.yaml/badge.svg)](https://github.com/mlr-org/mlr3fda/actions/workflows/rcmdcheck.yaml)
 [![CRAN status](https://www.r-pkg.org/badges/version/mlr3fda)](https://CRAN.R-project.org/package=mlr3fda)
 [![StackOverflow](https://img.shields.io/badge/stackoverflow-mlr3-orange.svg)](https://stackoverflow.com/questions/tagged/mlr3)

diff --git a/README.md b/README.md
@@ -7,8 +7,6 @@ Extending mlr3 to functional data.
 
 <!-- badges: start -->
 
-[![Lifecycle:
-experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental)
 [![RCMD
 Check](https://github.com/mlr-org/mlr3fda/actions/workflows/rcmdcheck.yaml/badge.svg)](https://github.com/mlr-org/mlr3fda/actions/workflows/rcmdcheck.yaml)
 [![CRAN
@@ -127,6 +125,7 @@ glrn$predict(task, row_ids = ids$test)
 |:-------------------------------------------------------------------------------|:-------------------------------------------------|:---------------------------------------------------|:--------------------|
 | [fda.extract](https://mlr3fda.mlr-org.com/reference/mlr_pipeops_fda.extract)   | Extracts Simple Features from Functional Columns | [tf](https://cran.r-project.org/package=tf)        | fda, data transform |
 | [fda.flatten](https://mlr3fda.mlr-org.com/reference/mlr_pipeops_fda.flatten)   | Flattens Functional Columns                      | [tf](https://cran.r-project.org/package=tf)        | fda, data transform |
+| [fda.fpca](https://mlr3fda.mlr-org.com/reference/mlr_pipeops_fda.fpca)         | Functional Principal Component Analysis          | [tf](https://cran.r-project.org/package=tf)        | fda, data transform |
 | [fda.interpol](https://mlr3fda.mlr-org.com/reference/mlr_pipeops_fda.interpol) | Interpolate Functional Columns                   | [tf](https://cran.r-project.org/package=tf)        | fda, data transform |
 | [fda.smooth](https://mlr3fda.mlr-org.com/reference/mlr_pipeops_fda.smooth)     | Smoothing Functional Columns                     | [tf](https://cran.r-project.org/package=tf), stats | fda, data transform |
 

diff --git a/man/mlr_pipeops_fda.extract.Rd b/man/mlr_pipeops_fda.extract.Rd
diff --git a/man/mlr_pipeops_fda.flatten.Rd b/man/mlr_pipeops_fda.flatten.Rd
diff --git a/man/mlr_pipeops_fda.fpca.Rd b/man/mlr_pipeops_fda.fpca.Rd
diff --git a/man/mlr_pipeops_fda.interpol.Rd b/man/mlr_pipeops_fda.interpol.Rd
diff --git a/man/mlr_pipeops_fda.smooth.Rd b/man/mlr_pipeops_fda.smooth.Rd