Skip to content

Commit

Permalink
after full Volunteer run
Browse files Browse the repository at this point in the history
  • Loading branch information
ds4ci committed Jun 16, 2015
1 parent 62f594c commit fdd2c26
Show file tree
Hide file tree
Showing 2 changed files with 158 additions and 8 deletions.
105 changes: 97 additions & 8 deletions R/VolunteersCustomerPreferenceSegmentsWithFlexclust.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,70 @@ library(tidyr)
library(ggplot2)
data("volunteers")
vol_ch <- volunteers[-(1:2)]
# internal functions
fc_rclust <- function(x, k, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plotme=TRUE){
fc_seed = seed
cli_trys <- NULL
for (itry in 1:nrep) {
fc_seed <- fc_seed + 1
set.seed(fc_seed)
cli <- flexclust::kcca(x, k, save.data = TRUE,
control = fc_cont, family = kccaFamily(fc_family))
cli_info <- cli@clusinfo %>%
mutate(clust_num = row_number(),
clust_rank = rank(desc(size), ties.method = "first")) %>%
arrange(clust_rank) %>%
dplyr::select(c(6, 5, 1:4))
cli_try <- cbind(data.frame(k = num_clusters, seed = fc_seed),
cli_info)
cli_trys <- rbind(cli_trys, cli_try)
}
cli_trys <- as.tbl(cli_trys)
cli_sizes <- cli_trys %>%
dplyr::select(k, seed, clust_num, clust_rank, size) %>%
filter(clust_rank <= 2) %>%
mutate(clust_label = paste0("Size_", clust_rank),
in_order = clust_num == clust_rank) %>%
dplyr::select(-clust_rank, -clust_num) %>%
spread(key = clust_label, value = size) %>%
group_by(k, seed) %>%
summarize(in_order = all(in_order),
Size_1 = min(Size_1, na.rm = TRUE),
Size_2 = min(Size_2, na.rm = TRUE))
# get location of peak numerically with MASS:kde2d
s2d <- with(cli_sizes, MASS::kde2d(Size_1, Size_2, n = 100))
s2d_peak <- which(s2d$z == max(s2d$z))
Size_1_peak_at <- round(s2d$x[s2d_peak %% 100], 1)
Size_2_peak_at <- round(s2d$y[s2d_peak %/% 100], 1)
if(plotme) {
xend <- Size_1_peak_at + 100
yend <- Size_2_peak_at + 100
p <- ggplot2::ggplot(cli_sizes, aes(Size_1, Size_2)) +
geom_point(alpha = 0.5, size = 2) +
stat_density2d() +
annotate("segment", x = Size_1_peak_at, y = Size_2_peak_at,
xend = xend, yend = yend, color = "red", size = 1) +
annotate("text", xend, yend,
label = paste0("(", Size_1_peak_at, ", ", Size_2_peak_at, ")"), vjust = 0) +
ggtitle(paste0("Size of Cluster 2 by Size of Cluster 1 for k=", k, ", # tries=", nrep))
print(p)
}
cli_best <- cli_sizes %>%
filter(in_order) %>% ## just look at solutions with top 2 clusters in decending sizes
## until we figure out how to re-arrange clusters in kcca object
mutate(distance = sqrt((Size_1 - Size_1_peak_at)^2 + (Size_2 - Size_2_peak_at)^2)) %>%
arrange(distance)
return(list(best = cli_best,
sizes = cli_sizes,
peak_at = c(Size_1_peak_at, Size_2_peak_at),
tries = cli_trys))
}
```

### Introduction
Expand Down Expand Up @@ -117,7 +181,7 @@ Next we will look at two important practical issues:
2. How many clusters best describe the respondents?

### Stability of the solutions for any k

#### The Stability Problem
If we run kcca() on the same data but with different starting seeds for any real customer dataset (in which clusters are typically somewhat fuzzy), we expect to get different solutions. For example comparing the above solution with two other solutions, we see the results are somewhat different:

```{r echo = FALSE, fig.width = 5, fig.height = 5}
Expand Down Expand Up @@ -156,14 +220,15 @@ barchart(vol.cl, strip.prefix = "#", shade = TRUE, layout = c(3, 1),
These three examples are similar in that there a single predominate cluster but looking closer there are
important differences in how each solution is interpreted.

#### One Simple Solution

We need a method to explore many solutions and look for a frequent stable solution. The challenge is coming up with a measure of solution similarty. kcca() returns an object with 17 slots, so there are a lot of metrics we could devise.

Keeping it very simple, let's just look at the scatter plot of the number of members in the two largest clusters. This is easy to get from slot @clusinfo:

`r str(vol.cl@clusinfo)`

The plan is to will run kcca() 500 times, incrementing the seed with each run to get data for the plot.
First we build up a data.frame capturing @clusinfo for each run, where the run is identified by the values or k and the seed.
The plan is to will run kcca() a few hundred times, incrementing the seed with each run to get data for the plot. First we build up a data.frame capturing @clusinfo for each run, where the run is identified by the values or k and the seed.

```{r echo=TRUE, cache = TRUE}
fc_seed <- 123
Expand All @@ -180,7 +245,7 @@ for (itry in 1:num_trys) {
control = fc_cont, family = kccaFamily(fc_family))
cli_info <- cli@clusinfo %>%
mutate(clust_num = row_number(),
clust_rank = min_rank(desc(size))) %>%
clust_rank = rank(desc(size), ties.method = "first")) %>%
arrange(clust_rank) %>%
dplyr::select(c(6, 5, 1:4))
cli_try <- cbind(data.frame(k = num_clusters, seed = fc_seed),
Expand Down Expand Up @@ -240,9 +305,33 @@ cli_best <- cli_sizes %>%
cli_best
```

### Clusters for each k = 2, 3, 4, ...

``` {r echo=FALSE, fig.width = 8, fig.height = 6}
for(k in 2:10) fc_rclust(vol.mat, k=k)
Taking the value pair (k = `r cli_best[1, 1]`, seed = `r cli_best[1, 2]`) from solution closest to the peak (the top row), gives us the parameters we need to re-generate the plots for the "stable clustering" for a given k. We apply this method in "production" mode next.


### Stable Clusters for k = 2, 3, 4, ...

``` {r echo=FALSE, fig.width = 8, fig.height = 6, cache=TRUE}
for(k in 2:15) {
cat("k =", k)
num_clusters <- k
cli <- fc_rclust(vol.mat, k=num_clusters, nrep=200)
cli
head(cli$best, 3)
fc_seed <- as.integer(cli$best[1, 2])
x <- try(set.seed(fc_seed))
if(!is.null(x)) {
cat("set.seed error, invalid cli$best. Skipping k = ", k)
next()
}
vol.cl <- kcca(vol.mat, k = num_clusters, save.data = TRUE,
control = fc_cont, family = kccaFamily(fc_family))
vol.pca <- prcomp(vol.mat) ## plot on first two principal components
plot(vol.cl, data = vol.mat, project = vol.pca,
main = paste0("Segment Seperation Plot, k=", num_clusters, ", seed=", fc_seed))
bc <- barchart(vol.cl, strip.prefix = "#", shade = TRUE, layout = c(num_clusters, 1),
main = paste0("Segment Profile Plot, k=", num_clusters, ", seed=", fc_seed))
print(bc)
}
```


61 changes: 61 additions & 0 deletions R/fc_rclust.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
fc_rclust <- function(x, k, nrep=100, verbose=FALSE, FUN = kcca, seed=1234, plotme=TRUE){
fc_seed = seed
fc_tries <- NULL
for (itry in 1:nrep) {
fc_seed <- fc_seed + 1
set.seed(fc_seed)
cli <- flexclust::kcca(x, k, save.data = TRUE,
control = fc_cont, family = kccaFamily(fc_family))
cli_info <- cli@clusinfo %>%
mutate(clust_num = row_number(),
clust_rank = min_rank(desc(size))) %>%
arrange(clust_rank) %>%
dplyr::select(c(6, 5, 1:4))
cli_try <- cbind(data.frame(k = num_clusters, seed = fc_seed),
cli_info)
cli_trys <- rbind(cli_trys, cli_try)
}
cli_trys <- as.tbl(cli_trys)

cli_sizes <- cli_trys %>%
dplyr::select(k, seed, clust_num, clust_rank, size) %>%
filter(clust_rank <= 2) %>%
mutate(clust_label = paste0("Size_", clust_rank),
in_order = clust_num == clust_rank) %>%
dplyr::select(-clust_rank, -clust_num) %>%
spread(key = clust_label, value = size) %>%
group_by(k, seed) %>%
summarize(in_order = all(in_order),
Size_1 = min(Size_1, na.rm = TRUE),
Size_2 = min(Size_2, na.rm = TRUE))

# get location of peak numerically with MASS:kde2d
s2d <- with(cli_sizes, MASS::kde2d(Size_1, Size_2, n = 100))
s2d_peak <- which(s2d$z == max(s2d$z))
Size_1_peak_at <- round(s2d$x[s2d_peak %% 100], 1)
Size_2_peak_at <- round(s2d$y[s2d_peak %/% 100], 1)

if(plotme) {
xend <- Size_1_peak_at + 100
yend <- Size_2_peak_at + 100
p <- ggplot2::ggplot(cli_sizes, aes(Size_1, Size_2)) +
geom_point(alpha = 0.5, size = 2) +
stat_density2d() +
annotate("segment", x = Size_1_peak_at, y = Size_2_peak_at,
xend = xend, yend = yend, color = "red", size = 1) +
annotate("text", xend, yend,
label = paste0("(", Size_1_peak_at, ", ", Size_2_peak_at, ")"), vjust = 0) +
ggtitle(paste0("Size of Cluster 2 by Size of Cluster 1 for k=", k, ", # tries=", nrep))
print(p)
}

cli_best <- cli_sizes %>%
filter(in_order) %>% ## just look at solutions with clusters in decending sizes
mutate(distance = sqrt((Size_1 - Size_1_peak_at)^2 + (Size_2 - Size_2_peak_at)^2)) %>%
arrange(distance)

return(list(best = cli_best,
sizes = cli_sizes,
peak_at = c(Size_1_peak_at, Size_2_peak_at),
tries = cli_trys))
}

0 comments on commit fdd2c26

Please sign in to comment.