set up package build

ds4ci · Jun 15, 2015 · 62f594c · 62f594c
1 parent 6fd2d2c
commit 62f594c
Show file tree

Hide file tree

Showing 5 changed files with 104 additions and 25 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -2,3 +2,12 @@
 ^\.Rproj\.user$
 ^README\.Rmd$
 ^README-.*\.png$
+^Notes$
+^DataIn$
+^DataOut$
+^Results$
+Flexclust\_cache$
+Flexclust\_files$
+^.*\.Rmd$
+^.*\.docx$
+^.*\.html$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,10 +1,16 @@
 Package: CustSegs
 Type: Package
-Title: What the Package Does (Title Case)
+Title: Tools for Customer Segmentation
 Version: 0.1
 Date: 2015-06-13
-Author: Who wrote it
-Maintainer: Who to complain to <[email protected]>
+Author: Jim Porzak
+Maintainer: Jim Porzak <[email protected]>
 Description: More about what it does (maybe more than one line)
-License: What license is it under?
-LazyData: TRUE
+License: GPL-2
+LazyData: TRUE
+Imports:
+  dplyr,
+  flexclust,
+  ggplot2,
+  MASS,
+  tidyr
diff --git a/R/VolunteersCustomerPreferenceSegmentsWithFlexclust.Rmd b/R/VolunteersCustomerPreferenceSegmentsWithFlexclust.Rmd
@@ -100,7 +100,7 @@ Centroids of each cluster (segment) are the numbered circles. The color indicate
 #### Segment profile plot - the primary tool for interpreting the solution as customer segments or persona.
 
 ``` {r fig.width = 8, fig.height = 6}
-barchart(vol.cl, strip.prefix = "#", shade = TRUE, layout = c(3, 1),
+barchart(vol.cl, strip.prefix = "#", shade = TRUE, layout = c(vol.cl@k, 1),
          main = paste0(main_text1, " - Segment Profile Plot"))
 ```
 
@@ -165,7 +165,7 @@ Keeping it very simple, let's just look at the scatter plot of the number of mem
 The plan is to run kcca() 500 times, incrementing the seed with each run to get data for the plot.
 First we build up a data.frame capturing @clusinfo for each run, where the run is identified by the values of k and the seed.
 
-```{r echo=TRUE, cache = FALSE}
+```{r echo=TRUE, cache = TRUE}
 fc_seed <- 123
 num_clusters <- 3
 num_trys <- 500
@@ -200,12 +200,12 @@ We need to massage cli_trys so it is suitable for plotting. At the same time we
 cli_sizes <- cli_trys %>%
   dplyr::select(k, seed, clust_num, clust_rank, size) %>%
   filter(clust_rank <= 2) %>%
-  mutate(clust_label = paste0("Size_", clust_rank)) %>%
-  dplyr::select(-clust_rank) %>%
-  spread(key = clust_label, value = size) %>%
-  group_by(k, seed) %>%
-  summarize(c1 = first(clust_num),
-            c2 = last(clust_num),
+  mutate(clust_label = paste0("Size_", clust_rank),
+         in_order = clust_num == clust_rank) %>%
+  dplyr::select(-clust_rank, -clust_num) %>%
+  spread(key = clust_label, value = size) %>% 
+  group_by(k, seed) %>% 
+  summarize(in_order = all(in_order),
             Size_1 = min(Size_1, na.rm = TRUE),
             Size_2 = min(Size_2, na.rm = TRUE))
 
@@ -234,12 +234,15 @@ Now we just need the distance of each solution's first & second cluster counts t
 
 ``` {r}
 cli_best <- cli_sizes %>%
-  filter(c1 == 1 & c2 == 2) %>%    ## just look at solutions with clusters in decending sizes
+  filter(in_order) %>%    ## just look at solutions with clusters in decending sizes
   mutate(distance = sqrt((Size_1 - Size_1_peak_at)^2 + (Size_2 - Size_2_peak_at)^2)) %>%
-  dplyr::select(-starts_with("c")) %>% 
   arrange(distance)
 cli_best
 ```
 
+### Clusters for each k = 2, 3, 4, ...
 
+``` {r echo=FALSE, fig.width = 8, fig.height = 6}
+for(k in 2:10) fc_rclust(vol.mat, k=k)
+```
 
diff --git a/R/fc_stable.R b/R/fc_stable.R
@@ -0,0 +1,63 @@
+#' Helpers for Checking Stability of Clusters for a Given k
+
+
+#' Random Clusters from flexclust
+#'
+#' Repeats a flexclust k-centroids run \code{nrep} times, each with a
+#' different seed, and summarises the sizes of the two largest clusters of
+#' every run. The point of highest density of the (Size_1, Size_2) pairs is
+#' located with \code{MASS::kde2d()} and optionally marked on a scatter plot.
+#'
+#' NOTE(review): \code{fc_cont} and \code{fc_family} are not arguments of this
+#' function; they are looked up in the enclosing environment and must exist
+#' before the call. Confirm where they are defined.
+#'
+#' @param x data handed to \code{flexclust::kcca()} as its first argument.
+#' @param k number of clusters requested from \code{flexclust::kcca()}.
+#' @param nrep number of runs; run \code{i} uses the seed \code{seed + i}.
+#' @param verbose currently unused.
+#' @param FUN currently unused; \code{flexclust::kcca()} is always called.
+#' @param seed base value for the per-run seeds.
+#' @param plotme if \code{TRUE}, print the scatter plot of Size_2 by Size_1.
+#' @return A list with elements \code{cl_sizes} (one row per k and seed with
+#'   \code{in_order}, \code{Size_1}, \code{Size_2}), \code{Size_1_peak_at},
+#'   \code{Size_2_peak_at} and \code{cl_tries} (one row per cluster per run).
+fc_rclust <- function(x, k, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plotme=TRUE){
+  stopifnot(is.numeric(nrep), length(nrep) == 1, nrep >= 1)
+
+  # Bind the pipe locally so the function does not depend on dplyr being
+  # attached by the caller.
+  `%>%` <- dplyr::`%>%`
+
+  # Collect one data.frame per run and bind once after the loop instead of
+  # growing a data.frame with rbind() on every iteration.
+  tries <- vector("list", nrep)
+  for (itry in seq_len(nrep)) {
+    fc_seed <- seed + itry
+    set.seed(fc_seed)
+    cli <- flexclust::kcca(x, k, save.data = TRUE,
+                           control = fc_cont,
+                           family = flexclust::kccaFamily(fc_family))
+    cli_info <- cli@clusinfo %>%
+      dplyr::mutate(clust_num = dplyr::row_number(),
+                    clust_rank = dplyr::min_rank(dplyr::desc(size))) %>%
+      dplyr::arrange(clust_rank) %>%
+      dplyr::select(c(6, 5, 1:4))
+    # Record the k that was actually requested for this run.
+    tries[[itry]] <- cbind(data.frame(k = k, seed = fc_seed), cli_info)
+  }
+  cli_trys <- dplyr::as.tbl(do.call(rbind, tries))
+
+  # Keep only the two largest clusters of each run and put their sizes side
+  # by side; in_order is TRUE when cluster numbers match the size ranks.
+  cli_sizes <- cli_trys %>%
+    dplyr::select(k, seed, clust_num, clust_rank, size) %>%
+    dplyr::filter(clust_rank <= 2) %>%
+    dplyr::mutate(clust_label = paste0("Size_", clust_rank),
+                  in_order = clust_num == clust_rank) %>%
+    dplyr::select(-clust_rank, -clust_num) %>%
+    tidyr::spread(key = clust_label, value = size) %>%
+    dplyr::group_by(k, seed) %>%
+    dplyr::summarize(in_order = all(in_order),
+                     Size_1 = min(Size_1, na.rm = TRUE),
+                     Size_2 = min(Size_2, na.rm = TRUE))
+
+  # get location of peak numerically with MASS:kde2d
+  # z is a matrix with rows indexed by x and columns by y, so convert the
+  # position of the maximum into (row, col) rather than using %% / %/% on the
+  # linear index, which is off by one and yields index 0 on the last row.
+  s2d <- with(cli_sizes, MASS::kde2d(Size_1, Size_2, n = 100))
+  s2d_peak <- arrayInd(which.max(s2d$z), dim(s2d$z))
+  Size_1_peak_at <- round(s2d$x[s2d_peak[1, 1]], 1)
+  Size_2_peak_at <- round(s2d$y[s2d_peak[1, 2]], 1)
+
+  if (plotme) {
+    xend <- Size_1_peak_at + 100
+    yend <- Size_2_peak_at + 100
+    peak_label <- paste0("(", Size_1_peak_at, ", ", Size_2_peak_at, ")")
+    p <- ggplot2::ggplot(cli_sizes, ggplot2::aes(Size_1, Size_2)) +
+      ggplot2::geom_point(alpha = 0.5, size = 2) +
+      ggplot2::stat_density2d() +
+      ggplot2::annotate("segment", x = Size_1_peak_at, y = Size_2_peak_at,
+                        xend = xend, yend = yend, color = "red", size = 1) +
+      ggplot2::annotate("text", x = xend, y = yend,
+                        label = peak_label, vjust = 0) +
+      ggplot2::ggtitle(paste0("Size of Cluster 2 by Size of Cluster 1 for k=",
+                              k, ", # tries=", nrep))
+    print(p)
+  }
+
+  # All elements are named; positional access ([[2]], [[3]]) still works.
+  list(cl_sizes = cli_sizes,
+       Size_1_peak_at = Size_1_peak_at,
+       Size_2_peak_at = Size_2_peak_at,
+       cl_tries = cli_trys)
+}
+
diff --git a/R/stable1.R b/R/stable1.R
@@ -1,6 +1,7 @@
 ######
-# Check stability of 3-cluster solution
+# Check stability of 3-cluster solution (prototype stability functions)
 ##
+
 library(tidyr)
 library(dplyr)
 library(ggplot2)
@@ -32,18 +33,15 @@ for (itry in 1:num_trys) {
 cli_sizes <- cli_trys %>%
   dplyr::select(k, seed, clust_num, clust_rank, size) %>%
   filter(clust_rank <= 2) %>%
-  mutate(clust_label = paste0("Size_", clust_rank)) %>%
-  dplyr::select(-clust_rank) %>%
-  spread(key = clust_label, value = size) %>%
-  group_by(k, seed) %>%
-  summarize(c1 = first(clust_num),
-            c2 = last(clust_num),
-            Size_1 = min(Size_1, na.rm = TRUE),
-            Size_2 = min(Size_2, na.rm = TRUE))
+  mutate(clust_label = paste0("Size_", clust_rank),
+         in_order = clust_num == clust_rank) %>%
+  dplyr::select(-clust_rank, -clust_num) %>%
+  spread(key = clust_label, value = size)
+
 
 
 # get location of peak
-s2d <- with(cli_sizes, kde2d(Size_1, Size_2, n = 100))
+# s2d <- with(cli_sizes, kde2d(Size_1, Size_2, n = 100))
 s2d_peak <- which(s2d$z == max(s2d$z))
 Size_1_peak_at <- round(s2d$x[s2d_peak %% 100], 1)
 xend <- Size_1_peak_at + 100