after full Volunteer run

ds4ci · Jun 16, 2015 · fdd2c26 · fdd2c26
1 parent 62f594c
commit fdd2c26
Show file tree

Hide file tree

Showing 2 changed files with 158 additions and 8 deletions.
diff --git a/R/VolunteersCustomerPreferenceSegmentsWithFlexclust.Rmd b/R/VolunteersCustomerPreferenceSegmentsWithFlexclust.Rmd
@@ -15,6 +15,70 @@ library(tidyr)
 library(ggplot2)
 data("volunteers")
 vol_ch <- volunteers[-(1:2)]
+
+# internal functions
+fc_rclust <- function(x, k, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plotme=TRUE){
+  fc_seed = seed
+  cli_trys <- NULL
+  for (itry in 1:nrep) {
+    fc_seed <- fc_seed + 1
+    set.seed(fc_seed)
+    cli <- flexclust::kcca(x, k, save.data = TRUE,
+                control = fc_cont, family = kccaFamily(fc_family))
+    cli_info <- cli@clusinfo %>%
+      mutate(clust_num = row_number(),
+             clust_rank = rank(desc(size), ties.method = "first")) %>%
+      arrange(clust_rank) %>%
+      dplyr::select(c(6, 5, 1:4))
+    cli_try <- cbind(data.frame(k = num_clusters, seed = fc_seed),
+                     cli_info)
+    cli_trys <- rbind(cli_trys, cli_try)
+  }
+  cli_trys <- as.tbl(cli_trys)
+
+  cli_sizes <- cli_trys %>%
+    dplyr::select(k, seed, clust_num, clust_rank, size) %>%
+    filter(clust_rank <= 2) %>%
+    mutate(clust_label = paste0("Size_", clust_rank),
+           in_order = clust_num == clust_rank) %>%
+    dplyr::select(-clust_rank, -clust_num) %>%
+    spread(key = clust_label, value = size) %>%
+    group_by(k, seed) %>%
+    summarize(in_order = all(in_order),
+              Size_1 = min(Size_1, na.rm = TRUE),
+              Size_2 = min(Size_2, na.rm = TRUE))
+
+  # get location of peak numerically with MASS:kde2d
+  s2d <- with(cli_sizes, MASS::kde2d(Size_1, Size_2, n = 100))
+  s2d_peak <- which(s2d$z == max(s2d$z))
+  Size_1_peak_at <- round(s2d$x[s2d_peak %% 100], 1)
+  Size_2_peak_at <- round(s2d$y[s2d_peak %/% 100], 1)
+
+  if(plotme) {
+    xend <- Size_1_peak_at + 100
+    yend <- Size_2_peak_at + 100
+    p <- ggplot2::ggplot(cli_sizes, aes(Size_1, Size_2)) +
+           geom_point(alpha = 0.5, size = 2) +
+           stat_density2d() +
+           annotate("segment", x = Size_1_peak_at, y = Size_2_peak_at,
+                    xend = xend, yend = yend, color = "red", size = 1) +
+           annotate("text", xend, yend,
+                    label = paste0("(", Size_1_peak_at, ", ", Size_2_peak_at, ")"), vjust = 0) +
+           ggtitle(paste0("Size of Cluster 2 by Size of Cluster 1 for k=", k, ", # tries=", nrep))
+    print(p)
+  }
+  
+  cli_best <- cli_sizes %>%
+    filter(in_order) %>%    ## just look at solutions with top 2 clusters in decending sizes
+                            ## until we figure out how to re-arrange clusters in kcca object
+    mutate(distance = sqrt((Size_1 - Size_1_peak_at)^2 + (Size_2 - Size_2_peak_at)^2)) %>%
+    arrange(distance)
+  
+  return(list(best = cli_best,
+              sizes = cli_sizes, 
+              peak_at = c(Size_1_peak_at, Size_2_peak_at),
+              tries = cli_trys))
+}
 ```
 
 ### Introduction 
@@ -117,7 +181,7 @@ Next we will look at two important practical issues:
 2. How many clusters best describe the respondents?
 
 ### Stability of the solutions for any k
-
+#### The Stability Problem
 If we run kcca() on the same data but with different starting seeds for any real customer dataset (in which clusters are typically somewhat fuzzy), we expect to get different solutions. For example comparing the above solution with two other solutions, we see the results are somewhat different:
 
 ```{r echo = FALSE, fig.width = 5, fig.height = 5}
@@ -156,14 +220,15 @@ barchart(vol.cl, strip.prefix = "#", shade = TRUE,  layout = c(3, 1),
 These three examples are similar in that there a single predominate cluster but looking closer there are
 important differences in how each solution is interpreted.
 
+#### One Simple Solution
+
 We need a method to explore many solutions and look for a frequent stable solution. The challenge is coming up with a measure of solution similarty. kcca() returns an object with 17 slots, so there are a lot of metrics we could devise.
 
 Keeping it very simple, let's just look at the scatter plot of the number of members in the two largest clusters. This is easy to get from slot @clusinfo:
 
 `r str(vol.cl@clusinfo)`
 
-The plan is to will run kcca() 500 times, incrementing the seed with each run to get data for the plot. 
-First we build up a data.frame capturing @clusinfo for each run, where the run is identified by the values or k and the seed.
+The plan is to will run kcca() a few hundred times, incrementing the seed with each run to get data for the plot. First we build up a data.frame capturing @clusinfo for each run, where the run is identified by the values or k and the seed.
 
 ```{r echo=TRUE, cache = TRUE}
 fc_seed <- 123
@@ -180,7 +245,7 @@ for (itry in 1:num_trys) {
               control = fc_cont, family = kccaFamily(fc_family))
   cli_info <- cli@clusinfo %>%
     mutate(clust_num = row_number(),
-           clust_rank = min_rank(desc(size))) %>%
+           clust_rank = rank(desc(size), ties.method = "first")) %>%
     arrange(clust_rank) %>%
     dplyr::select(c(6, 5, 1:4))
   cli_try <- cbind(data.frame(k = num_clusters, seed = fc_seed),
@@ -240,9 +305,33 @@ cli_best <- cli_sizes %>%
 cli_best
 ```
 
-### Clusters for each k = 2, 3, 4, ...
-
-``` {r echo=FALSE, fig.width = 8, fig.height = 6}
-for(k in 2:10) fc_rclust(vol.mat, k=k)
+Taking the value pair (k = `r cli_best[1, 1]`, seed = `r cli_best[1, 2]`) from solution closest to the peak (the top row), gives us the parameters we need to re-generate the plots for the "stable clustering" for a given k. We apply this method in "production" mode next.
+
+
+### Stable Clusters for k = 2, 3, 4, ...
+
+``` {r echo=FALSE, fig.width = 8, fig.height = 6, cache=TRUE}
+for(k in 2:15) {
+  cat("k =", k)
+  num_clusters <- k
+  cli <- fc_rclust(vol.mat, k=num_clusters, nrep=200)
+  cli
+  head(cli$best, 3)
+  fc_seed <- as.integer(cli$best[1, 2])
+  x <- try(set.seed(fc_seed))
+  if(!is.null(x)) {
+    cat("set.seed error, invalid cli$best. Skipping k = ", k)
+    next()
+  }
+  vol.cl <- kcca(vol.mat, k = num_clusters, save.data = TRUE,
+                 control = fc_cont, family = kccaFamily(fc_family))
+  vol.pca <- prcomp(vol.mat)      ## plot on first two principal components
+  plot(vol.cl, data = vol.mat, project = vol.pca,
+             main = paste0("Segment Seperation Plot, k=", num_clusters, ", seed=", fc_seed))
+  bc <- barchart(vol.cl, strip.prefix = "#", shade = TRUE,  layout = c(num_clusters, 1),
+                 main = paste0("Segment Profile Plot, k=", num_clusters, ", seed=", fc_seed))
+  print(bc)
+}
 ```
 
+
diff --git a/R/fc_rclust.R b/R/fc_rclust.R
@@ -0,0 +1,61 @@
+fc_rclust <- function(x, k, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plotme=TRUE){
+  fc_seed = seed
+  fc_tries <- NULL
+  for (itry in 1:nrep) {
+    fc_seed <- fc_seed + 1
+    set.seed(fc_seed)
+    cli <- flexclust::kcca(x, k, save.data = TRUE,
+                           control = fc_cont, family = kccaFamily(fc_family))
+    cli_info <- cli@clusinfo %>%
+      mutate(clust_num = row_number(),
+             clust_rank = min_rank(desc(size))) %>%
+      arrange(clust_rank) %>%
+      dplyr::select(c(6, 5, 1:4))
+    cli_try <- cbind(data.frame(k = num_clusters, seed = fc_seed),
+                     cli_info)
+    cli_trys <- rbind(cli_trys, cli_try)
+  }
+  cli_trys <- as.tbl(cli_trys)
+
+  cli_sizes <- cli_trys %>%
+    dplyr::select(k, seed, clust_num, clust_rank, size) %>%
+    filter(clust_rank <= 2) %>%
+    mutate(clust_label = paste0("Size_", clust_rank),
+           in_order = clust_num == clust_rank) %>%
+    dplyr::select(-clust_rank, -clust_num) %>%
+    spread(key = clust_label, value = size) %>%
+    group_by(k, seed) %>%
+    summarize(in_order = all(in_order),
+              Size_1 = min(Size_1, na.rm = TRUE),
+              Size_2 = min(Size_2, na.rm = TRUE))
+
+  # get location of peak numerically with MASS:kde2d
+  s2d <- with(cli_sizes, MASS::kde2d(Size_1, Size_2, n = 100))
+  s2d_peak <- which(s2d$z == max(s2d$z))
+  Size_1_peak_at <- round(s2d$x[s2d_peak %% 100], 1)
+  Size_2_peak_at <- round(s2d$y[s2d_peak %/% 100], 1)
+
+  if(plotme) {
+    xend <- Size_1_peak_at + 100
+    yend <- Size_2_peak_at + 100
+    p <- ggplot2::ggplot(cli_sizes, aes(Size_1, Size_2)) +
+      geom_point(alpha = 0.5, size = 2) +
+      stat_density2d() +
+      annotate("segment", x = Size_1_peak_at, y = Size_2_peak_at,
+               xend = xend, yend = yend, color = "red", size = 1) +
+      annotate("text", xend, yend,
+               label = paste0("(", Size_1_peak_at, ", ", Size_2_peak_at, ")"), vjust = 0) +
+      ggtitle(paste0("Size of Cluster 2 by Size of Cluster 1 for k=", k, ", # tries=", nrep))
+    print(p)
+  }
+
+  cli_best <- cli_sizes %>%
+    filter(in_order) %>%    ## just look at solutions with clusters in decending sizes
+    mutate(distance = sqrt((Size_1 - Size_1_peak_at)^2 + (Size_2 - Size_2_peak_at)^2)) %>%
+    arrange(distance)
+
+  return(list(best = cli_best,
+              sizes = cli_sizes,
+              peak_at = c(Size_1_peak_at, Size_2_peak_at),
+              tries = cli_trys))
+}