Skip to content

Commit

Permalink
set up packabe build
Browse files Browse the repository at this point in the history
  • Loading branch information
ds4ci committed Jun 15, 2015
1 parent 6fd2d2c commit 62f594c
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 25 deletions.
9 changes: 9 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,12 @@
^\.Rproj\.user$
^README\.Rmd$
^README-.*\.png$
^Notes$
^DataIn$
^DataOut$
^Results$
Flexclust\_cache$
Flexclust\_files$
^.*\.Rmd$
^.*\.docx$
^.*\.html$
16 changes: 11 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
Package: CustSegs
Type: Package
Title: What the Package Does (Title Case)
Title: Tools for Customer Segmentation
Version: 0.1
Date: 2015-06-13
Author: Who wrote it
Maintainer: Who to complain to <[email protected]>
Author: Jim Porzak
Maintainer: Jim Porzak <[email protected]>
Description: More about what it does (maybe more than one line)
License: What license is it under?
LazyData: TRUE
License: GPL-2
LazyData: TRUE
Imports:
dplyr,
flexclust,
ggplot2,
MASS,
tidyr
23 changes: 13 additions & 10 deletions R/VolunteersCustomerPreferenceSegmentsWithFlexclust.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ Centroids of each cluster (segment) are the numbered circles. The color indicate
#### Segment profile plot - the primary tool for interpreting the solution as customer segments or persona.

``` {r fig.width = 8, fig.height = 6}
barchart(vol.cl, strip.prefix = "#", shade = TRUE, layout = c(3, 1),
barchart(vol.cl, strip.prefix = "#", shade = TRUE, layout = c(vol.cl@k, 1),
main = paste0(main_text1, " - Segment Profile Plot"))
```

Expand Down Expand Up @@ -165,7 +165,7 @@ Keeping it very simple, let's just look at the scatter plot of the number of mem
The plan is to run kcca() 500 times, incrementing the seed with each run to get data for the plot.
First we build up a data.frame capturing @clusinfo for each run, where the run is identified by the values of k and the seed.

```{r echo=TRUE, cache = FALSE}
```{r echo=TRUE, cache = TRUE}
fc_seed <- 123
num_clusters <- 3
num_trys <- 500
Expand Down Expand Up @@ -200,12 +200,12 @@ We need to massage cli_trys so it is suitable for plotting. At the same time we
cli_sizes <- cli_trys %>%
dplyr::select(k, seed, clust_num, clust_rank, size) %>%
filter(clust_rank <= 2) %>%
mutate(clust_label = paste0("Size_", clust_rank)) %>%
dplyr::select(-clust_rank) %>%
spread(key = clust_label, value = size) %>%
group_by(k, seed) %>%
summarize(c1 = first(clust_num),
c2 = last(clust_num),
mutate(clust_label = paste0("Size_", clust_rank),
in_order = clust_num == clust_rank) %>%
dplyr::select(-clust_rank, -clust_num) %>%
spread(key = clust_label, value = size) %>%
group_by(k, seed) %>%
summarize(in_order = all(in_order),
Size_1 = min(Size_1, na.rm = TRUE),
Size_2 = min(Size_2, na.rm = TRUE))
Expand Down Expand Up @@ -234,12 +234,15 @@ Now we just need the distance of each solution's first & second cluster counts t

``` {r}
cli_best <- cli_sizes %>%
filter(c1 == 1 & c2 == 2) %>% ## just look at solutions with clusters in decending sizes
filter(in_order) %>% ## just look at solutions with clusters in decending sizes
mutate(distance = sqrt((Size_1 - Size_1_peak_at)^2 + (Size_2 - Size_2_peak_at)^2)) %>%
dplyr::select(-starts_with("c")) %>%
arrange(distance)
cli_best
```

### Clusters for each k = 2, 3, 4, ...

``` {r echo=FALSE, fig.width = 8, fig.height = 6}
for(k in 2:10) fc_rclust(vol.mat, k=k)
```

63 changes: 63 additions & 0 deletions R/fc_stable.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#' Helpers for Checking Stability of Clusters for a Given k


#' Random Clusters from flexclust
#'
#' Runs `flexclust::kcca()` `nrep` times on the same data, each time with a
#' different seed, records the `@clusinfo` table of every run, and summarises
#' the sizes of the largest and second-largest cluster per run. The location
#' of the densest region of those (Size_1, Size_2) pairs is estimated with
#' `MASS::kde2d()` and optionally plotted.
#'
#' NOTE(review): the clustering control object `fc_cont` and the family name
#' `fc_family` are not arguments; they are looked up from the enclosing
#' environments (e.g. the global environment) and must exist before calling.
#'
#' @param x Data passed unchanged to `flexclust::kcca()`.
#' @param k Number of clusters requested in every run.
#' @param nrep Number of runs (at least 1). Run `i` uses seed `seed + i`.
#' @param verbose If `TRUE`, emit one progress message per run.
#' @param FUN Currently unused; `flexclust::kcca()` is always called.
#' @param seed Base seed; it is incremented by one before each run.
#' @param plotme If `TRUE`, print a scatter plot of Size_2 against Size_1 with
#'   density contours and the estimated peak annotated.
#' @return A list with elements `cl_sizes` (one row per run: `k`, `seed`,
#'   `in_order`, `Size_1`, `Size_2`), `Size_1_peak_at`, `Size_2_peak_at`
#'   (coordinates of the density peak, rounded to one decimal) and `cl_tries`
#'   (one row per cluster per run). The element order matches earlier
#'   versions, so positional access keeps working.
fc_rclust <- function(x, k, nrep = 100, verbose = FALSE, FUN = kcca,
                      seed = 1234, plotme = TRUE) {
  stopifnot(is.numeric(nrep), length(nrep) == 1, nrep >= 1)

  # Bind the pipe locally so the function does not depend on dplyr/magrittr
  # being attached by the caller.
  `%>%` <- dplyr::`%>%`
  n_grid <- 100  # resolution of the kde2d grid in both directions

  # One clustering run: returns the clusinfo rows tagged with k and the seed.
  run_once <- function(run_seed) {
    if (verbose) {
      message("fc_rclust: k = ", k, ", seed = ", run_seed)
    }
    set.seed(run_seed)
    cli <- flexclust::kcca(x, k, save.data = TRUE,
                           control = fc_cont,
                           family = flexclust::kccaFamily(fc_family))
    cli_info <- cli@clusinfo %>%
      dplyr::mutate(clust_num = seq_along(size),
                    clust_rank = dplyr::min_rank(dplyr::desc(size))) %>%
      dplyr::arrange(clust_rank) %>%
      # put clust_rank and clust_num in front of the four clusinfo columns
      dplyr::select(c(6, 5, 1:4))
    # record the k actually used for this run (not a global)
    cbind(data.frame(k = k, seed = run_seed), cli_info)
  }

  # Seeds are seed + 1, ..., seed + nrep; collect runs in a list and bind once
  # instead of growing a data.frame inside a loop.
  cli_trys <- dplyr::as_tibble(
    dplyr::bind_rows(lapply(seed + seq_len(nrep), run_once))
  )

  # Sizes of the two largest clusters per run. in_order is TRUE when the
  # cluster numbers of those two clusters equal their size ranks.
  cli_sizes <- cli_trys %>%
    dplyr::select(k, seed, clust_num, clust_rank, size) %>%
    dplyr::filter(clust_rank <= 2) %>%
    dplyr::mutate(clust_label = paste0("Size_", clust_rank),
                  in_order = clust_num == clust_rank) %>%
    dplyr::select(-clust_rank, -clust_num) %>%
    tidyr::spread(key = clust_label, value = size) %>%
    dplyr::group_by(k, seed) %>%
    # spread() can leave two partially-NA rows per run when in_order differs
    # between the two clusters; collapse them back into one row.
    dplyr::summarize(in_order = all(in_order),
                     Size_1 = min(Size_1, na.rm = TRUE),
                     Size_2 = min(Size_2, na.rm = TRUE)) %>%
    dplyr::ungroup()

  # Locate the density peak numerically with MASS::kde2d. Rows of z index x
  # and columns index y, so ask which() for array indices rather than
  # decoding the linear index by hand; take the first hit if there are ties.
  s2d <- with(cli_sizes, MASS::kde2d(Size_1, Size_2, n = n_grid))
  peak <- which(s2d$z == max(s2d$z), arr.ind = TRUE)[1, ]
  Size_1_peak_at <- round(s2d$x[peak[["row"]]], 1)
  Size_2_peak_at <- round(s2d$y[peak[["col"]]], 1)

  if (plotme) {
    # end point of the red pointer segment and anchor of the text label
    xend <- Size_1_peak_at + 100
    yend <- Size_2_peak_at + 100
    p <- ggplot2::ggplot(cli_sizes, ggplot2::aes(Size_1, Size_2)) +
      ggplot2::geom_point(alpha = 0.5, size = 2) +
      ggplot2::stat_density2d() +
      ggplot2::annotate("segment", x = Size_1_peak_at, y = Size_2_peak_at,
                        xend = xend, yend = yend, color = "red", size = 1) +
      ggplot2::annotate("text", x = xend, y = yend,
                        label = paste0("(", Size_1_peak_at, ", ",
                                       Size_2_peak_at, ")"),
                        vjust = 0) +
      ggplot2::ggtitle(paste0("Size of Cluster 2 by Size of Cluster 1 for k=",
                              k, ",\n# tries=", nrep))
    print(p)
  }

  list(cl_sizes = cli_sizes,
       Size_1_peak_at = Size_1_peak_at,
       Size_2_peak_at = Size_2_peak_at,
       cl_tries = cli_trys)
}

18 changes: 8 additions & 10 deletions R/stable1.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
######
# Check stability of 3-cluster solution
# Check stability of 3-cluster solution (prototype stability functions)
##

library(tidyr)
library(dplyr)
library(ggplot2)
Expand Down Expand Up @@ -32,18 +33,15 @@ for (itry in 1:num_trys) {
cli_sizes <- cli_trys %>%
dplyr::select(k, seed, clust_num, clust_rank, size) %>%
filter(clust_rank <= 2) %>%
mutate(clust_label = paste0("Size_", clust_rank)) %>%
dplyr::select(-clust_rank) %>%
spread(key = clust_label, value = size) %>%
group_by(k, seed) %>%
summarize(c1 = first(clust_num),
c2 = last(clust_num),
Size_1 = min(Size_1, na.rm = TRUE),
Size_2 = min(Size_2, na.rm = TRUE))
mutate(clust_label = paste0("Size_", clust_rank),
in_order = clust_num == clust_rank) %>%
dplyr::select(-clust_rank, -clust_num) %>%
spread(key = clust_label, value = size)



# get location of peak
s2d <- with(cli_sizes, kde2d(Size_1, Size_2, n = 100))
# s2d <- with(cli_sizes, kde2d(Size_1, Size_2, n = 100))
s2d_peak <- which(s2d$z == max(s2d$z))
Size_1_peak_at <- round(s2d$x[s2d_peak %% 100], 1)
xend <- Size_1_peak_at + 100
Expand Down

0 comments on commit 62f594c

Please sign in to comment.