From 5219d540c403605944893808e7f2e8a1df0682b4 Mon Sep 17 00:00:00 2001 From: Cesaire Joris Kuete Fouodo Date: Wed, 17 Jul 2024 16:19:51 +0200 Subject: [PATCH] Variable selection --- README.Rmd | 53 +++++++++++++++++++++++++- README.md | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 2 deletions(-) diff --git a/README.Rmd b/README.Rmd index 5baa665..ca6e48e 100644 --- a/README.Rmd +++ b/README.Rmd @@ -38,7 +38,7 @@ devtools::install_github("imbs-hl/fuseMLR") ### Usage example -The following example is based on simulated data available in `fuseMLR`. Data have been simulated using the R package `InterSIM`, version 2.2.0. +The following example is based on simulated data available in `fuseMLR`. Data have been simulated using the R package `InterSIM`, version 2.2.0. - Let us inspect our simulated data. @@ -50,6 +50,12 @@ data("entities") str(object = entities, max.level = 2) ``` +Variable selection, training and prediction are the main functionalities of `fuseMLR`. While variable selection and training are performed for a training study, predictions are made for a new study. + +#### A) Preparation of a training study + +We need to set up a study, its layers and the training data entities. + - Instantiate a training study: A study is the fundamental component of a `fuseMLR` object. ```{r training_study, include=TRUE, eval=TRUE} @@ -92,6 +98,49 @@ print(train_study) - An upset plot of the training study: Visualize patient overlap across layers. -```{r upsetplot, include=TRUE, eval=TRUE} +```{r upsetplot, include=TRUE, eval=TRUE, } train_study$upset(order.by = "freq") ``` + +#### B) Variable selection + +We need to set up variable selection methods for our training study. Note that this can be the same method or different layer-specific methods. For simplicity, we will set up the same method on all layers. + +- Prepare the parameters of the variable selection method. 
+ +```{r varsel_param, include=TRUE, eval=TRUE} +same_param_varsel <- ParamVarSel$new(id = "ParamVarSel", + param_list = list(num.trees = 1000, mtry = 3)) +print(same_param_varsel) +``` + +- Instantiate the variable selection method and assign training layers. + +```{r varsel_object, include=TRUE, eval=TRUE} +varsel_ge <- VarSel$new(id = "varsel_geneexpr", + package = "Boruta", + varsel_fct = "Boruta", + param = same_param_varsel, + train_layer = tl_ge) + +varsel_pr <- VarSel$new(id = "varsel_proteinexpr", + package = "Boruta", + varsel_fct = "Boruta", + param = same_param_varsel, + train_layer = tl_pr) + +varsel_me <- VarSel$new(id = "varsel_methylation", + package = "Boruta", + varsel_fct = "Boruta", + param = same_param_varsel, + train_layer = tl_me) +``` + +- Perform variable selection on our training study. + +```{r varsel, include=TRUE, eval=TRUE} +var_sel_res <- train_study$varSelection() +print(var_sel_res) +``` + +For each layer the variable selection results show which variables have been selected. diff --git a/README.md b/README.md index 05bde29..99b421f 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,14 @@ str(object = entities, max.level = 2) ## ..$ proteinexpr:'data.frame': 23 obs. of 162 variables: ## ..$ methylation:'data.frame': 23 obs. of 369 variables: +Variable selection, training and prediction are the main functionalities +of `fuseMLR`. While variable selection and training are performed for a +training study, predictions are made for a new study. + +#### A) Preparation of a training study + +We need to set up a study, its layers and the training data entities. + - Instantiate a training study: A study is the fundamental component of a `fuseMLR` object. @@ -163,3 +171,105 @@ train_study$upset(order.by = "freq") ``` ![](README_files/figure-gfm/upsetplot-1.png) + +#### B) Variable selection + +We need to set up variable selection methods for our training study. Note +that this can be the same method or different layer-specific methods. 
+For simplicity, we will set up the same method on all layers. + +- Prepare the parameters of the variable selection method. + +``` r +same_param_varsel <- ParamVarSel$new(id = "ParamVarSel", + param_list = list(num.trees = 1000, mtry = 3)) +print(same_param_varsel) +``` + + ## Class: ParamVarSel + ## id : ParamVarSel + ## Parameter combination + ## $num.trees + ## [1] 1000 + ## + ## $mtry + ## [1] 3 + +- Instantiate the variable selection method and assign training layers. + +``` r +varsel_ge <- VarSel$new(id = "varsel_geneexpr", + package = "Boruta", + varsel_fct = "Boruta", + param = same_param_varsel, + train_layer = tl_ge) + +varsel_pr <- VarSel$new(id = "varsel_proteinexpr", + package = "Boruta", + varsel_fct = "Boruta", + param = same_param_varsel, + train_layer = tl_pr) + +varsel_me <- VarSel$new(id = "varsel_methylation", + package = "Boruta", + varsel_fct = "Boruta", + param = same_param_varsel, + train_layer = tl_me) +``` + +- Perform variable selection on our training study. + +``` r +var_sel_res <- train_study$varSelection() +print(var_sel_res) +``` + + ## Layer variable + ## 1 geneexpr ACACA + ## 2 geneexpr BAP1 + ## 3 geneexpr CDH3 + ## 4 geneexpr CHEK2 + ## 5 geneexpr EIF4E + ## 6 geneexpr MAP2K1 + ## 7 geneexpr MAPK14 + ## 8 geneexpr PCNA + ## 9 geneexpr YWHAE + ## 10 geneexpr YWHAZ + ## 11 proteinexpr Bap1.c.4 + ## 12 proteinexpr Bid + ## 13 proteinexpr Cyclin_E2 + ## 14 proteinexpr P.Cadherin + ## 15 proteinexpr Chk1 + ## 16 proteinexpr Chk1_pS345 + ## 17 proteinexpr EGFR + ## 18 proteinexpr EGFR_pY1173 + ## 19 proteinexpr HER3_pY1289 + ## 20 proteinexpr MIG.6 + ## 21 proteinexpr ETS.1 + ## 22 proteinexpr MEK1_pS217_S221 + ## 23 proteinexpr p38_MAPK + ## 24 proteinexpr c.Met_pY1235 + ## 25 proteinexpr N.Ras + ## 26 proteinexpr PCNA + ## 27 proteinexpr PEA15_pS116 + ## 28 proteinexpr PKC.delta_pS664 + ## 29 proteinexpr Rad50 + ## 30 proteinexpr C.Raf_pS338 + ## 31 proteinexpr p70S6K + ## 32 proteinexpr p70S6K_pT389 + ## 33 proteinexpr Smad4 + ## 34 
proteinexpr STAT3_pY705 + ## 35 proteinexpr 14.3.3_epsilon + ## 36 methylation cg20139214 + ## 37 methylation cg18457775 + ## 38 methylation cg01306510 + ## 39 methylation cg02412050 + ## 40 methylation cg07566050 + ## 41 methylation cg02630105 + ## 42 methylation cg20849549 + ## 43 methylation cg00547829 + ## 44 methylation cg25539131 + ## 45 methylation cg07064406 + +For each layer the variable selection results show which variables have +been selected.