From 62a40d3e8bc4c547b7d2dfed87fa5e797429bb18 Mon Sep 17 00:00:00 2001 From: Jim Porzak Date: Sat, 20 Jun 2015 10:30:41 -0700 Subject: [PATCH] prefix packge:: --- .Rbuildignore | 2 +- .gitignore | 1 + DESCRIPTION | 5 +- ...ustomerPreferenceSegmentsWithFlexclust.Rmd | 124 ------- ...ustomerPreferenceSegmentsWithFlexclust.Rmd | 337 ------------------ R/fc_rclust.R | 41 +-- R/fc_reorder.R | 27 ++ R/fc_stable.R | 44 +-- R/multiplotTest.R | 52 --- R/stable1.R | 76 ---- vignettes/Cluster-Volunteers-Data-Set.Rmd | 58 +++ 11 files changed, 134 insertions(+), 633 deletions(-) delete mode 100644 R/AutoCustomerPreferenceSegmentsWithFlexclust.Rmd delete mode 100644 R/VolunteersCustomerPreferenceSegmentsWithFlexclust.Rmd create mode 100644 R/fc_reorder.R delete mode 100644 R/multiplotTest.R delete mode 100644 R/stable1.R create mode 100644 vignettes/Cluster-Volunteers-Data-Set.Rmd diff --git a/.Rbuildignore b/.Rbuildignore index d85e0be..9a33ad8 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,6 +8,6 @@ ^Results$ Flexclust\_cache$ Flexclust\_files$ -^.*\.Rmd$ ^.*\.docx$ ^.*\.html$ + diff --git a/.gitignore b/.gitignore index f1a9e4f..e054fae 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ Notes/ *_cache/ *_files/ +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index 02bb4af..a31cfef 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,9 +8,12 @@ Maintainer: Jim Porzak Description: A set of tools for doing customer segmentatio with R. Initially focused on using flexclust package. License: GPL-2 LazyData: TRUE -Imports: +Imports: dplyr, flexclust, + grid, ggplot2, MASS, tidyr +Suggests: knitr +VignetteBuilder: knitr diff --git a/R/AutoCustomerPreferenceSegmentsWithFlexclust.Rmd b/R/AutoCustomerPreferenceSegmentsWithFlexclust.Rmd deleted file mode 100644 index 04adaba..0000000 --- a/R/AutoCustomerPreferenceSegmentsWithFlexclust.Rmd +++ /dev/null @@ -1,124 +0,0 @@ ---- -title: "Customer Preference Segments with flexclust - auto" -author: "Jim Porzak" -date: "June 6, 2015" -output: word_document ---- - -```{r echo=FALSE, message=FALSE, warning=FALSE} -# overall set-up -library(flexclust) -library(stringr) -library(dplyr) -library(tidyr) -library(ggplot2) -data("auto") -auto_ch <- auto[str_detect(colnames(auto), fixed("ch_"))] -colnames(auto_ch) <- str_replace(colnames(auto_ch), "^ch_", "") -``` - -### Introduction - -Customer preference surveys ask a question like "Please check off of the following [things] which are [important] -for you." This example uses the _auto_ data set from package _flexclust_. From the package documentation: - -> A German manufacturer of premium cars asked customers approximately 3 months after a car purchase which characteristics of the car were most important for the decision to buy the car. The survey was done in 1983 and the data set contains all 793 responses without missing values. - -One section of the survey has `r ncol(auto_ch)` preference check boxes for characteristics of the car which were important -in their decision. These are: `r paste(colnames(auto_ch), collapse = ", ")`. - -We will show how the _flexclust_ package ... - -**NTS: include references to Fritz's package & papers & Sara & Fritz's marketing application papers.** - -### An example _flexclust_ segmentation run - -First, make a data frame with just the preference choice columns in _auto_. -``` {r} -library(flexclust) -data("auto") -auto_ch <- auto[str_detect(colnames(auto), fixed("ch_"))] -colnames(auto_ch) <- str_replace(colnames(auto_ch), "^ch_", "") -auto_ch[1:5, 1:7] -## and get integer matrix version for flexclust stuff -auto.mat <- as.matrix(sapply(auto_ch, as.integer)) -auto.mat[1:5, 1:7] -``` - -Plot the percent of each preference that is checked. - -``` {r echo = FALSE} -auto_ch.pcts <- auto_ch %>% - gather("Question", "Response") %>% - group_by(Question) %>% - summarize(Pct_Checked = 100.0 * sum(Response) / n()) - -ggplot(auto_ch.pcts, aes(x = Pct_Checked, y = reorder(Question, -as.integer(Question)))) + - geom_point(size = 3) + coord_cartesian(xlim = c(0, 100)) + - ylab("Preference Question") + xlab("% Checked") + ggtitle("Response to Each Preference") -``` - -Set the parameters for the cluster algorithm. -``` {r} -fc_cont <- new("flexclustControl") ## flexclustControl object holds "hyperparameters" -fc_cont@tolerance <- 0.001 -fc_cont@iter.max <- 30 -fc_cont@verbose <- 1 - -fc_family <- "ejaccard" ## distance metric -``` -Now we can invoke kcca to do clustering. Start with three clusters to show the process. -``` {r} -fc_seed <- 123 -num_clusters <- 3 - -set.seed(fc_seed) - -## verbose > 0 will show iterations -auto.cl <- kcca(auto.mat, k = num_clusters, save.data = TRUE, - control = fc_cont, family = kccaFamily(fc_family)) - -summary(auto.cl) -``` - -``` {r echo = FALSE} -# set up plot titles for this run -main_text1 <- "Auto Stated Preferences Survey" -sub_text <- paste0("kcca ", auto.cl@family@name, " - ", num_clusters, " clusters (seed = ", fc_seed, ")") - -``` -#### Segment separation plot (aka neighborhood plot) - -``` {r fig.width = 7, fig.height = 7.7} -auto.pca <- prcomp(auto.mat) ## plot on first two principal components -plot(auto.cl, data = auto.mat, project = auto.pca, - main = paste0(main_text1, " - Segment Seperation Plot"), - sub = sub_text) -``` - -This plot shows each respondent's preferences plotted on the surface defined by the first two principal components. -Centroids of each cluster (segment) are the numbered circles. The color indicates the respondent's cluster membership. The solid line (convex hull) encloses 50% of the points in the cluster while the dashed line encloses 95% of the points. The separation between each cluster is indicated by the thinness of the line between any two centroids. Of course, the physical distance between centroids on the PC2 x PC1 plane does not correspond to the actual separation in any non-trivial problem. - -#### Segment profile plot - the primary tool for interpreting the solution as customer segments or persona. - -``` {r fig.width = 7, fig.height = 5} -barchart(auto.cl, strip.prefix = "#", shade = TRUE, - main = paste0(main_text1, " - Segment Profile Plot")) -``` - - -This plot has a facet for each cluster plotted above. The colors are consistent between the two plots. The header in each facet gives the cluster number, the number of respondents assigned to that cluster and the percent of the total respondents. For each of the preferences, the bar width shows the proportion checked for that cluster. For reference, the overall population proportion is shown by the red line & dot. (which corresponds to the dot plot above). - -A bar is grayed out when the preference is not important is distinguishing a cluster from the others. This makes it easy to focus on the preferences which lead to a customer segmentation (or persona) story. - -> Exercise for the reader: How would you describe each of these three segments? - -These are the basic ideas behind using _flexclust_ to segment customers based on results of a stated preference survey. -Next we will look at two important practical issues: -1. Is the cluster solution stable for a given number of clusters (k)? -2. How many clusters best describe the respondents? - -### Stability of the solutions for any k - -> At first glance, the _auto_ data set does not yeald stable solutions at various k's. Switching efforts over to the -volunteers_ data set. - diff --git a/R/VolunteersCustomerPreferenceSegmentsWithFlexclust.Rmd b/R/VolunteersCustomerPreferenceSegmentsWithFlexclust.Rmd deleted file mode 100644 index e56beaf..0000000 --- a/R/VolunteersCustomerPreferenceSegmentsWithFlexclust.Rmd +++ /dev/null @@ -1,337 +0,0 @@ ---- -title: "Customer Preference Segments with flexclust - volunteers" -author: "Jim Porzak" -date: "June 9, 2015" -output: html_document ---- - -```{r echo=FALSE, message=FALSE, warning=FALSE} -# overall set-up -library(MASS) -library(flexclust) -library(stringr) -library(dplyr) -library(tidyr) -library(ggplot2) -data("volunteers") -vol_ch <- volunteers[-(1:2)] - -# internal functions -fc_rclust <- function(x, k, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plotme=TRUE){ - fc_seed = seed - cli_trys <- NULL - for (itry in 1:nrep) { - fc_seed <- fc_seed + 1 - set.seed(fc_seed) - cli <- flexclust::kcca(x, k, save.data = TRUE, - control = fc_cont, family = kccaFamily(fc_family)) - cli_info <- cli@clusinfo %>% - mutate(clust_num = row_number(), - clust_rank = rank(desc(size), ties.method = "first")) %>% - arrange(clust_rank) %>% - dplyr::select(c(6, 5, 1:4)) - cli_try <- cbind(data.frame(k = num_clusters, seed = fc_seed), - cli_info) - cli_trys <- rbind(cli_trys, cli_try) - } - cli_trys <- as.tbl(cli_trys) - - cli_sizes <- cli_trys %>% - dplyr::select(k, seed, clust_num, clust_rank, size) %>% - filter(clust_rank <= 2) %>% - mutate(clust_label = paste0("Size_", clust_rank), - in_order = clust_num == clust_rank) %>% - dplyr::select(-clust_rank, -clust_num) %>% - spread(key = clust_label, value = size) %>% - group_by(k, seed) %>% - summarize(in_order = all(in_order), - Size_1 = min(Size_1, na.rm = TRUE), - Size_2 = min(Size_2, na.rm = TRUE)) - - # get location of peak numerically with MASS:kde2d - s2d <- with(cli_sizes, MASS::kde2d(Size_1, Size_2, n = 100)) - s2d_peak <- which(s2d$z == max(s2d$z)) - Size_1_peak_at <- round(s2d$x[s2d_peak %% 100], 1) - Size_2_peak_at <- round(s2d$y[s2d_peak %/% 100], 1) - - if(plotme) { - xend <- Size_1_peak_at + 100 - yend <- Size_2_peak_at + 100 - p <- ggplot2::ggplot(cli_sizes, aes(Size_1, Size_2)) + - geom_point(alpha = 0.5, size = 2) + - stat_density2d() + - annotate("segment", x = Size_1_peak_at, y = Size_2_peak_at, - xend = xend, yend = yend, color = "red", size = 1) + - annotate("text", xend, yend, - label = paste0("(", Size_1_peak_at, ", ", Size_2_peak_at, ")"), vjust = 0) + - ggtitle(paste0("Size of Cluster 2 by Size of Cluster 1 for k=", k, ", # tries=", nrep)) - print(p) - } - - cli_best <- cli_sizes %>% - filter(in_order) %>% ## just look at solutions with top 2 clusters in decending sizes - ## until we figure out how to re-arrange clusters in kcca object - mutate(distance = sqrt((Size_1 - Size_1_peak_at)^2 + (Size_2 - Size_2_peak_at)^2)) %>% - arrange(distance) - - return(list(best = cli_best, - sizes = cli_sizes, - peak_at = c(Size_1_peak_at, Size_2_peak_at), - tries = cli_trys)) -} -``` - -### Introduction - -Customer preference surveys ask a question like "Please check off of the following [things] which are [important] -for you." This example uses the _volunteers_ data set from package _flexclust_. From the package documentation: - -> Part of an Australian survey on motivation of 1415 volunteers to work for non-profit organisations like Red Cross, State Emergency Service, Rural Fire Service, Surf Life Saving, Rotary, Parents and Citizens Associations, etc.. - -The survey has `r ncol(vol_ch)` preference check boxes for various motivations which were important motivators in their decision to volunteer. These are: `r paste(colnames(vol_ch), collapse = ", ")`. - -We will show how the _flexclust_ package ... - -**NTS: include references to Fritz's package & papers & Sara & Fritz's marketing application papers.** - -### An example _flexclust_ segmentation run - -First, make a data frame with just the preference choice columns in _vol_. -``` {r} -library(flexclust) -data("volunteers") -vol_ch <- volunteers[-(1:2)] -vol_ch[1:5, 1:7] -vol.mat <- as.matrix(vol_ch) -``` - -Plot the percent of each preference that is checked. - -``` {r echo = FALSE} -vol_ch.pcts <- vol_ch %>% - gather("Question", "Response") %>% - group_by(Question) %>% - summarize(Pct_Checked = 100.0 * sum(Response) / n()) - -ggplot(vol_ch.pcts, aes(x = Pct_Checked, y = reorder(Question, -as.integer(Question)))) + - geom_point(size = 3) + coord_cartesian(xlim = c(0, 100)) + - ylab("Preference Question") + xlab("% Checked") + ggtitle("Response to Each Preference") -``` - -Set the parameters for the cluster algorithm. -``` {r} -fc_cont <- new("flexclustControl") ## flexclustControl object holds "hyperparameters" -fc_cont@tolerance <- 0.1 ## kcca only uses if classify == "weighted" -fc_cont@iter.max <- 30 -fc_cont@verbose <- 1 - -fc_family <- "ejaccard" ## distance metric -``` -Now we can invoke kcca to do clustering. Start with three clusters to show the process. -``` {r} -seed1 <- 577 #243 ## Why we use this seed will become clear below -fc_seed <- seed1 - -num_clusters <- 3 - -set.seed(fc_seed) - -## verbose > 0 will show iterations -vol.cl <- kcca(vol.mat, k = num_clusters, save.data = TRUE, - control = fc_cont, family = kccaFamily(fc_family)) - -summary(vol.cl) -``` - -``` {r echo = FALSE} -# set up plot titles for this run -main_text1 <- "Volunteers Stated Preferences Survey" -sub_text <- paste0("kcca ", vol.cl@family@name, " - ", num_clusters, " clusters (seed = ", fc_seed, ")") - -``` -#### Segment separation plot (aka neighborhood plot) - -``` {r fig.width = 7, fig.height = 7.7} -vol.pca <- prcomp(vol.mat) ## plot on first two principal components -plot(vol.cl, data = vol.mat, project = vol.pca, - main = paste0(main_text1, " - Segment Seperation Plot"), - sub = sub_text) -``` - -This plot shows each respondent's preferences plotted on the surface defined by the first two principal components. -Centroids of each cluster (segment) are the numbered circles. The color indicates the respondent's cluster membership. The solid line (convex hull) encloses 50% of the points in the cluster while the dashed line encloses 95% of the points. The separation between each cluster is indicated by the thinness of the line between any two centroids. Of course, the physical distance between centroids on the PC2 x PC1 plane does not correspond to the actual separation in any non-trivial problem. - -#### Segment profile plot - the primary tool for interpreting the solution as customer segments or persona. - -``` {r fig.width = 8, fig.height = 6} -barchart(vol.cl, strip.prefix = "#", shade = TRUE, layout = c(vol.cl@k, 1), - main = paste0(main_text1, " - Segment Profile Plot")) -``` - - -This plot has a facet for each cluster plotted above. The colors are consistent between the two plots. The header in each facet gives the cluster number, the number of respondents assigned to that cluster and the percent of the total respondents. For each of the preferences, the bar width shows the proportion checked for that cluster. For reference, the overall population proportion is shown by the red line & dot. (which corresponds to the dot plot above). - -A bar is grayed out when the preference is not important is distinguishing a cluster from the others. But note that grayed bar(s) may well be important when coming up with the customer segmentation (or persona) story. - -> Exercise for the reader: How would you describe each of these three segments? - -These are the basic ideas behind using _flexclust_ to segment customers based on results of a stated preference survey. -Next we will look at two important practical issues: -1. Is the cluster solution stable for a given number of clusters (k)? -2. How many clusters best describe the respondents? - -### Stability of the solutions for any k -#### The Stability Problem -If we run kcca() on the same data but with different starting seeds for any real customer dataset (in which clusters are typically somewhat fuzzy), we expect to get different solutions. For example comparing the above solution with two other solutions, we see the results are somewhat different: - -```{r echo = FALSE, fig.width = 5, fig.height = 5} -fc_cont@verbose <- 0 -set.seed(fc_seed) -vol.cl <- kcca(vol.mat, k = num_clusters, save.data = TRUE, - control = fc_cont, family = kccaFamily(fc_family)) -vol.pca <- prcomp(vol.mat) ## plot on first two principal components -plot(vol.cl, data = vol.mat, project = vol.pca, - main = paste0("Segment Seperation Plot, k=", num_clusters, ", seed=", fc_seed)) -barchart(vol.cl, strip.prefix = "#", shade = TRUE, layout = c(3, 1), - main = paste0("Segment Profile Plot, k=", num_clusters, ", seed=", fc_seed)) - -fc_seed <- 215 -set.seed(fc_seed) -vol.cl <- kcca(vol.mat, k = num_clusters, save.data = TRUE, - control = fc_cont, family = kccaFamily(fc_family)) -vol.pca <- prcomp(vol.mat) ## plot on first two principal components -plot(vol.cl, data = vol.mat, project = vol.pca, - main = paste0("Segment Seperation Plot, k=", num_clusters, ", seed=", fc_seed)) -barchart(vol.cl, strip.prefix = "#", shade = TRUE, layout = c(3, 1), - main = paste0("Segment Profile Plot, k=", num_clusters, ", seed=", fc_seed)) - -fc_seed <- 129 -set.seed(fc_seed) -vol.cl <- kcca(vol.mat, k = num_clusters, save.data = TRUE, - control = fc_cont, family = kccaFamily(fc_family)) -vol.pca <- prcomp(vol.mat) ## plot on first two principal components -plot(vol.cl, data = vol.mat, project = vol.pca, - main = paste0("Segment Seperation Plot, k=", num_clusters, ", seed=", fc_seed)) -barchart(vol.cl, strip.prefix = "#", shade = TRUE, layout = c(3, 1), - main = paste0("Segment Profile Plot, k=", num_clusters, ", seed=", fc_seed)) - -``` - -These three examples are similar in that there a single predominate cluster but looking closer there are -important differences in how each solution is interpreted. - -#### One Simple Solution - -We need a method to explore many solutions and look for a frequent stable solution. The challenge is coming up with a measure of solution similarty. kcca() returns an object with 17 slots, so there are a lot of metrics we could devise. - -Keeping it very simple, let's just look at the scatter plot of the number of members in the two largest clusters. This is easy to get from slot @clusinfo: - -`r str(vol.cl@clusinfo)` - -The plan is to will run kcca() a few hundred times, incrementing the seed with each run to get data for the plot. First we build up a data.frame capturing @clusinfo for each run, where the run is identified by the values or k and the seed. - -```{r echo=TRUE, cache = TRUE} -fc_seed <- 123 -num_clusters <- 3 -num_trys <- 500 -fc_cont@verbose <- 0 -cli_trys <- NULL - -# build df with cluster info for each seed -for (itry in 1:num_trys) { - fc_seed <- fc_seed + 1 - set.seed(fc_seed) - cli <- kcca(vol.mat, k = num_clusters, save.data = TRUE, - control = fc_cont, family = kccaFamily(fc_family)) - cli_info <- cli@clusinfo %>% - mutate(clust_num = row_number(), - clust_rank = rank(desc(size), ties.method = "first")) %>% - arrange(clust_rank) %>% - dplyr::select(c(6, 5, 1:4)) - cli_try <- cbind(data.frame(k = num_clusters, seed = fc_seed), - cli_info) - cli_trys <- rbind(cli_trys, cli_try) -} -cli_trys <- as.tbl(cli_trys) -cli_trys -``` - -It is ordered by seed (aka runID) and the cluster rank. "clust_num" is the original cluster sequence number coming out of kcca(). Notice that cluster #1 is not necessarly the largest cluster. (This is an anoyance with the "random walk" nature of kcca(). The same cluster, based on its properties, will not necessarly be in the sequence between runs.) - -We need to massage cli_trys so it is suitable for plotting. At the same time we pick up the location of the peak. - -``` {r} -# set up plot of size of rank 2 x rank 1 -cli_sizes <- cli_trys %>% - dplyr::select(k, seed, clust_num, clust_rank, size) %>% - filter(clust_rank <= 2) %>% - mutate(clust_label = paste0("Size_", clust_rank), - in_order = clust_num == clust_rank) %>% - dplyr::select(-clust_rank, -clust_num) %>% - spread(key = clust_label, value = size) %>% - group_by(k, seed) %>% - summarize(in_order = all(in_order), - Size_1 = min(Size_1, na.rm = TRUE), - Size_2 = min(Size_2, na.rm = TRUE)) - -# get location of peak numerically with MASS:kde2d -s2d <- with(cli_sizes, kde2d(Size_1, Size_2, n = 100)) -s2d_peak <- which(s2d$z == max(s2d$z)) -Size_1_peak_at <- round(s2d$x[s2d_peak %% 100], 1) -Size_2_peak_at <- round(s2d$y[s2d_peak %/% 100], 1) -``` - -From which we can plot: - -```{r echo=FALSE, fig.width = 8, fig.height = 6} -xend <- Size_1_peak_at + 100 -yend <- Size_2_peak_at + 100 -ggplot(cli_sizes, aes(Size_1, Size_2)) + - geom_point(alpha = 0.5, size = 2) + - stat_density2d() + - annotate("segment", x = Size_1_peak_at, y = Size_2_peak_at, - xend = xend, yend = yend, color = "red", size = 1) + - annotate("text", xend, yend, label = paste0("(", Size_1_peak_at, ", ", Size_2_peak_at, ")"), vjust = 0) + - ggtitle(paste0("Size of Cluster 2 by Size of Cluster 1 for k=", num_clusters, ", # tries=", num_trys)) -``` - -Now we just need the distance of each solution's first & second cluster counts to the peak we found above: - -``` {r} -cli_best <- cli_sizes %>% - filter(in_order) %>% ## just look at solutions with clusters in decending sizes - mutate(distance = sqrt((Size_1 - Size_1_peak_at)^2 + (Size_2 - Size_2_peak_at)^2)) %>% - arrange(distance) -cli_best -``` - -Taking the value pair (k = `r cli_best[1, 1]`, seed = `r cli_best[1, 2]`) from solution closest to the peak (the top row), gives us the parameters we need to re-generate the plots for the "stable clustering" for a given k. We apply this method in "production" mode next. - - -### Stable Clusters for k = 2, 3, 4, ... - -``` {r echo=FALSE, fig.width = 8, fig.height = 6, cache=TRUE} -for(k in 2:15) { - cat("k =", k) - num_clusters <- k - cli <- fc_rclust(vol.mat, k=num_clusters, nrep=200) - cli - head(cli$best, 3) - fc_seed <- as.integer(cli$best[1, 2]) - x <- try(set.seed(fc_seed)) - if(!is.null(x)) { - cat("set.seed error, invalid cli$best. Skipping k = ", k) - next() - } - vol.cl <- kcca(vol.mat, k = num_clusters, save.data = TRUE, - control = fc_cont, family = kccaFamily(fc_family)) - vol.pca <- prcomp(vol.mat) ## plot on first two principal components - plot(vol.cl, data = vol.mat, project = vol.pca, - main = paste0("Segment Seperation Plot, k=", num_clusters, ", seed=", fc_seed)) - bc <- barchart(vol.cl, strip.prefix = "#", shade = TRUE, layout = c(num_clusters, 1), - main = paste0("Segment Profile Plot, k=", num_clusters, ", seed=", fc_seed)) - print(bc) -} -``` - - diff --git a/R/fc_rclust.R b/R/fc_rclust.R index 228b112..2dbe4ce 100644 --- a/R/fc_rclust.R +++ b/R/fc_rclust.R @@ -1,4 +1,5 @@ -fc_rclust <- function(x, k, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plotme=TRUE){ + +fc_rclust <- function(x, k, fc_cont, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plotme=TRUE){ fc_seed = seed fc_tries <- NULL for (itry in 1:nrep) { @@ -7,9 +8,9 @@ fc_rclust <- function(x, k, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plot cli <- flexclust::kcca(x, k, save.data = TRUE, control = fc_cont, family = kccaFamily(fc_family)) cli_info <- cli@clusinfo %>% - mutate(clust_num = row_number(), - clust_rank = min_rank(desc(size))) %>% - arrange(clust_rank) %>% + dplyr::mutate(clust_num = row_number(), + clust_rank = min_rank(desc(size))) %>% + dplyr::arrange(clust_rank) %>% dplyr::select(c(6, 5, 1:4)) cli_try <- cbind(data.frame(k = num_clusters, seed = fc_seed), cli_info) @@ -19,15 +20,15 @@ fc_rclust <- function(x, k, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plot cli_sizes <- cli_trys %>% dplyr::select(k, seed, clust_num, clust_rank, size) %>% - filter(clust_rank <= 2) %>% - mutate(clust_label = paste0("Size_", clust_rank), - in_order = clust_num == clust_rank) %>% + dplyr::filter(clust_rank <= 2) %>% + dplyr::mutate(clust_label = paste0("Size_", clust_rank), + in_order = clust_num == clust_rank) %>% dplyr::select(-clust_rank, -clust_num) %>% - spread(key = clust_label, value = size) %>% - group_by(k, seed) %>% - summarize(in_order = all(in_order), - Size_1 = min(Size_1, na.rm = TRUE), - Size_2 = min(Size_2, na.rm = TRUE)) + tidyr::spread(key = clust_label, value = size) %>% + dplyr::group_by(k, seed) %>% + dplyr::summarize(in_order = all(in_order), + Size_1 = min(Size_1, na.rm = TRUE), + Size_2 = min(Size_2, na.rm = TRUE)) # get location of peak numerically with MASS:kde2d s2d <- with(cli_sizes, MASS::kde2d(Size_1, Size_2, n = 100)) @@ -39,20 +40,20 @@ fc_rclust <- function(x, k, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plot xend <- Size_1_peak_at + 100 yend <- Size_2_peak_at + 100 p <- ggplot2::ggplot(cli_sizes, aes(Size_1, Size_2)) + - geom_point(alpha = 0.5, size = 2) + - stat_density2d() + - annotate("segment", x = Size_1_peak_at, y = Size_2_peak_at, + ggplot2::geom_point(alpha = 0.5, size = 2) + + ggplot2::stat_density2d() + + ggplot2::annotate("segment", x = Size_1_peak_at, y = Size_2_peak_at, xend = xend, yend = yend, color = "red", size = 1) + - annotate("text", xend, yend, + ggplot2::annotate("text", xend, yend, label = paste0("(", Size_1_peak_at, ", ", Size_2_peak_at, ")"), vjust = 0) + - ggtitle(paste0("Size of Cluster 2 by Size of Cluster 1 for k=", k, ", # tries=", nrep)) + ggplot2::ggtitle(paste0("Size of Cluster 2 by Size of Cluster 1 for k=", k, ", # tries=", nrep)) print(p) } cli_best <- cli_sizes %>% - filter(in_order) %>% ## just look at solutions with clusters in decending sizes - mutate(distance = sqrt((Size_1 - Size_1_peak_at)^2 + (Size_2 - Size_2_peak_at)^2)) %>% - arrange(distance) + dplyr::filter(in_order) %>% ## just look at solutions with clusters in decending sizes + dplyr::mutate(distance = sqrt((Size_1 - Size_1_peak_at)^2 + (Size_2 - Size_2_peak_at)^2)) %>% + dplyr::arrange(distance) return(list(best = cli_best, sizes = cli_sizes, diff --git a/R/fc_reorder.R b/R/fc_reorder.R new file mode 100644 index 0000000..0f8a23c --- /dev/null +++ b/R/fc_reorder.R @@ -0,0 +1,27 @@ +#' Reorder clusters in a kcca object. +#' +#' Since running kcca with different seeds will result, at least, in equivalent +#' clusters having different cluster sequence numbers which makes interpretation +#' of repeated runs difficult. +#' +#' \code{fc_reorder} simply rearranges the clusters within the kcca object according +#' to the requested method. +#' +#' @param x A kcca object. +#' @param orderby A string. Specifying the method to order by. Currently only "decending size". +#' @return The kcca object with clusters reordered. +#' @examples +#' \dontrun{ +#' fc_reorder(kcca(x, k, save.data = TRUE, control = fc_cont, family = kccaFamily(fc_family))) +#' } +fc_reorder <- function(x, orderby = "decending size") { + ko <- x + cl_map <- order(ko@clusinfo$size, decreasing = TRUE) + ko@second <- cl_map[ko@second] + ko@clsim <- ko@clsim[cl_map, cl_map] + ko@centers <- ko@centers[cl_map, ] + ko@cluster <- cl_map[ko@cluster] + ko@clusinfo <- ko@clusinfo[cl_map, ] + # ko@reorder <- cl_map add slot with reorder mapping + return(ko) +} diff --git a/R/fc_stable.R b/R/fc_stable.R index 2f04f63..4d10822 100644 --- a/R/fc_stable.R +++ b/R/fc_stable.R @@ -5,36 +5,36 @@ #' #' Repeat flexclust runs #' @return tbl_df of k * nrep rows with cluster summary for k, seed, cluster # -fc_rclust <- function(x, k, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plotme=TRUE){ +fc_rclust <- function(x, k, fc_cont, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plotme=TRUE){ fc_seed = seed fc_tries <- NULL for (itry in 1:nrep) { fc_seed <- fc_seed + 1 set.seed(fc_seed) cli <- flexclust::kcca(x, k, save.data = TRUE, - control = fc_cont, family = kccaFamily(fc_family)) + control = fc_cont, family = kccaFamily(fc_family)) cli_info <- cli@clusinfo %>% - mutate(clust_num = row_number(), - clust_rank = min_rank(desc(size))) %>% - arrange(clust_rank) %>% + dplyr::mutate(clust_num = row_number(), + clust_rank = min_rank(desc(size))) %>% + dplyr::arrange(clust_rank) %>% dplyr::select(c(6, 5, 1:4)) cli_try <- cbind(data.frame(k = num_clusters, seed = fc_seed), cli_info) cli_trys <- rbind(cli_trys, cli_try) } - cli_trys <- as.tbl(cli_trys) + cli_trys <- dplyr::as.tbl(cli_trys) cli_sizes <- cli_trys %>% dplyr::select(k, seed, clust_num, clust_rank, size) %>% - filter(clust_rank <= 2) %>% - mutate(clust_label = paste0("Size_", clust_rank), - in_order = clust_num == clust_rank) %>% + dplyr::filter(clust_rank <= 2) %>% + dplyr::mutate(clust_label = paste0("Size_", clust_rank), + in_order = clust_num == clust_rank) %>% dplyr::select(-clust_rank, -clust_num) %>% - spread(key = clust_label, value = size) %>% - group_by(k, seed) %>% - summarize(in_order = all(in_order), - Size_1 = min(Size_1, na.rm = TRUE), - Size_2 = min(Size_2, na.rm = TRUE)) + tidyr::spread(key = clust_label, value = size) %>% + dplyr::group_by(k, seed) %>% + dplyr::summarize(in_order = all(in_order), + Size_1 = min(Size_1, na.rm = TRUE), + Size_2 = min(Size_2, na.rm = TRUE)) # get location of peak numerically with MASS:kde2d s2d <- with(cli_sizes, MASS::kde2d(Size_1, Size_2, n = 100)) @@ -46,14 +46,14 @@ fc_rclust <- function(x, k, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plot xend <- Size_1_peak_at + 100 yend <- Size_2_peak_at + 100 p <- ggplot2::ggplot(cli_sizes, aes(Size_1, Size_2)) + - geom_point(alpha = 0.5, size = 2) + - stat_density2d() + - annotate("segment", x = Size_1_peak_at, y = Size_2_peak_at, - xend = xend, yend = yend, color = "red", size = 1) + - annotate("text", xend, yend, - label = paste0("(", Size_1_peak_at, ", ", Size_2_peak_at, ")"), vjust = 0) + - ggtitle(paste0("Size of Cluster 2 by Size of Cluster 1 for k=", k, ", - # tries=", nrep)) + ggplot2::geom_point(alpha = 0.5, size = 2) + + ggplot2::stat_density2d() + + ggplot2::annotate("segment", x = Size_1_peak_at, y = Size_2_peak_at, + xend = xend, yend = yend, color = "red", size = 1) + + ggplot2::annotate("text", xend, yend, + label = paste0("(", Size_1_peak_at, ", ", Size_2_peak_at, ")"), vjust = 0) + + ggplot2::ggtitle(paste0("Size of Cluster 2 by Size of Cluster 1 for k=", k, ", + # tries=", nrep)) print(p) } diff --git a/R/multiplotTest.R b/R/multiplotTest.R deleted file mode 100644 index 63e7e79..0000000 --- a/R/multiplotTest.R +++ /dev/null @@ -1,52 +0,0 @@ -# multiplotTest.R - -# Multiple plot function -# -# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects) -# - cols: Number of columns in layout -# - layout: A matrix specifying the layout. If present, 'cols' is ignored. -# -# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), -# then plot 1 will go in the upper left, 2 will go in the upper right, and -# 3 will go all the way across the bottom. -# -multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { - library(grid) - - # Make a list from the ... arguments and plotlist - plots <- c(list(...), plotlist) - - numPlots = length(plots) - - # If layout is NULL, then use 'cols' to determine layout - if (is.null(layout)) { - # Make the panel - # ncol: Number of columns of plots - # nrow: Number of rows needed, calculated from # of cols - layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), - ncol = cols, nrow = ceiling(numPlots/cols)) - } - - if (numPlots==1) { - print(plots[[1]]) - - } else { - # Set up the page - grid.newpage() - pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) - - # Make each plot, in the correct location - for (i in 1:numPlots) { - # Get the i,j matrix positions of the regions that contain this subplot - matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) - - print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, - layout.pos.col = matchidx$col)) - } - } -} - -data(mtcars) -p1 <- ggplot(mtcars, aes(factor(cyl))) + geom_bar() -p2 <- ggplot(mtcars, aes(factor(cyl), mpg)) + geom_boxplot() -multiplot(p1, p2, cols = 2) diff --git a/R/stable1.R b/R/stable1.R deleted file mode 100644 index d9d4981..0000000 --- a/R/stable1.R +++ /dev/null @@ -1,76 +0,0 @@ -###### -# Check stability of 3-cluster solution (prototype stability functions) -## - -library(tidyr) -library(dplyr) -library(ggplot2) - -fc_seed <- 123 -num_clusters <- 3 -num_trys <- 500 -fc_cont@verbose <- 0 -cli_trys <- NULL - -# build df with cluster info for each seed -for (itry in 1:num_trys) { - fc_seed <- fc_seed + 1 - set.seed(fc_seed) - cli <- kcca(vol.mat, k = num_clusters, save.data = TRUE, - control = fc_cont, family = kccaFamily(fc_family)) - cli_info <- cli@clusinfo %>% - mutate(clust_num = row_number(), - clust_rank = min_rank(desc(size))) %>% - arrange(clust_rank) %>% - select(c(6, 5, 1:4)) - - cli_try <- cbind(data.frame(k = num_clusters, seed = fc_seed), - cli_info) - cli_trys <- rbind(cli_trys, cli_try) -} - -# set up plot of size of rank 2 x rank 1 -cli_sizes <- cli_trys %>% - dplyr::select(k, seed, clust_num, clust_rank, size) %>% - filter(clust_rank <= 2) %>% - mutate(clust_label = paste0("Size_", clust_rank), - in_order = clust_num == clust_rank) %>% - dplyr::select(-clust_rank, -clust_num) %>% - spread(key = clust_label, value = size) - - - -# get location of peak -# s2d <- with(cli_sizes, kde2d(Size_1, Size_2, n = 100)) -s2d_peak <- which(s2d$z == max(s2d$z)) -Size_1_peak_at <- round(s2d$x[s2d_peak %% 100], 1) -xend <- Size_1_peak_at + 100 -Size_2_peak_at <- round(s2d$y[s2d_peak %/% 100], 1) -yend <- Size_2_peak_at + 100 - -ggplot(cli_sizes, aes(Size_1, Size_2)) + - geom_point(alpha = 0.5, size = 2) + - stat_density2d() + - annotate("segment", x = Size_1_peak_at, y = Size_2_peak_at, - xend = xend, yend = yend, color = "red", size = 1) + - annotate("text", xend, yend, label = paste0("(", Size_1_peak_at, ", ", Size_2_peak_at, ")"), vjust = 0) + - ggtitle(paste0("Size of Cluster 2 by Size of Cluster 1 for k=", num_clusters, ", # tries=", num_trys)) - -## reading off of plot, pick seeds near one of three maxima with cluster rank = order -# Top: @(1075, 255): 169 -# 2nd: @(940, 385): 215 -# 3rd: @(1250, 90): 129 - -# find closest row in cli_trys (with rank == num) - -cli_best <- cli_sizes %>% - filter(c1 == 1 & c2 == 2) %>% ## just look at solutions with clusters in decending sizes - mutate(distance = sqrt((Size_1 - Size_1_peak_at)^2 + (Size_2 - Size_2_peak_at)^2)) %>% - dplyr::select(-starts_with("c")) %>% - arrange(distance) - -cli_best - - - - diff --git a/vignettes/Cluster-Volunteers-Data-Set.Rmd b/vignettes/Cluster-Volunteers-Data-Set.Rmd new file mode 100644 index 0000000..f99c02b --- /dev/null +++ b/vignettes/Cluster-Volunteers-Data-Set.Rmd @@ -0,0 +1,58 @@ +--- +title: "Vignette Title" +author: "Vignette Author" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Vignette Title} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +Vignettes are long form documentation commonly included in packages. Because they are part of the distribution of the package, they need to be as compact as possible. The `html_vignette` output type provides a custom style sheet (and tweaks some options) to ensure that the resulting html is as small as possible. The `html_vignette` format: + +- Never uses retina figures +- Has a smaller default figure size +- Uses a custom CSS stylesheet instead of the default Twitter Bootstrap style + +## Vignette Info + +Note the various macros within the `vignette` setion of the metadata block above. These are required in order to instruct R how to build the vignette. Note that you should change the `title` field and the `\VignetteIndexEntry` to match the title of your vignette. + +## Styles + +The `html_vignette` template includes a basic CSS theme. To override this theme you can specify your own CSS in the document metadata as follows: + + output: + rmarkdown::html_vignette: + css: mystyles.css + +## Figures + +The figure sizes have been customised so that you can easily put two images side-by-side. + +```{r, fig.show='hold'} +plot(1:10) +plot(10:1) +``` + +You can enable figure captions by `fig_caption: yes` in YAML: + + output: + rmarkdown::html_vignette: + fig_caption: yes + +Then you can use the chunk option `fig.cap = "Your figure caption."` in **knitr**. + +## More Examples + +You can write math expressions, e.g. $Y = X\beta + \epsilon$, footnotes^[A footnote here.], and tables, e.g. using `knitr::kable()`. + +```{r, echo=FALSE, results='asis'} +knitr::kable(head(mtcars, 10)) +``` + +Also a quote using `>`: + +> "He who gives up [code] safety for [code] speed deserves neither." +([via](https://twitter.com/hadleywickham/status/504368538874703872))