From 7d6cd0cdc8a33c97fd221451e3f9c677be2dfba2 Mon Sep 17 00:00:00 2001 From: Michael Hahsler Date: Sat, 15 May 2021 14:16:46 -0500 Subject: [PATCH] * Added new method compatible to itemMatrix to check if the item coding is compatible between two objects. * c() now produces a warning if two itemMatrices with different itemCoding are combined. * encode and recode accept now for itemLabels also objects with an itemLabels method. * recode is now also available for association (itemsets and rules). * recode: parameter match is now deprecated. * Fixed some TYPOs. * Added item hierarchy and item coding to vignette. --- DESCRIPTION | 4 +- NAMESPACE | 1 + NEWS.md | 16 +++- R/AllGenerics.R | 3 + R/itemCoding.R | 62 ++++++++++++--- R/itemMatrix.R | 66 +++++++++------- R/transactions.R | 2 +- README.md | 3 +- man/Mushroom.Rd | 2 +- man/apriori.Rd | 12 ++- man/associations-class.Rd | 2 +- man/crossTable.Rd | 4 +- man/discretize.Rd | 2 +- man/is.superset.Rd | 2 +- man/itemCoding.Rd | 119 ++++++++++++++++++++++------- man/itemMatrix-class.Rd | 85 +++++++++++++++------ man/match.Rd | 2 +- man/rules-class.Rd | 4 +- man/write.Rd | 2 +- tests/testthat/test-itemCoding.R | 43 +++++++++++ tests/testthat/test-sets.R | 11 +++ tests/testthat/test-transactions.R | 4 +- vignettes/arules.Rnw | 53 ++++++++++--- 23 files changed, 379 insertions(+), 125 deletions(-) create mode 100644 tests/testthat/test-itemCoding.R diff --git a/DESCRIPTION b/DESCRIPTION index 181efa4..e1dadc0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: arules -Version: 1.6-7 -Date: 2021-03-12 +Version: 1.6-7.1 +Date: 2021-xx-xx Title: Mining Association Rules and Frequent Itemsets Authors@R: c(person("Michael", "Hahsler", role = c("aut", "cre", "cph"), email = "mhahsler@lyle.smu.edu"), diff --git a/NAMESPACE b/NAMESPACE index 92e6a15..07a8442 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -72,6 +72,7 @@ exportMethods( "aggregate", "abbreviate", "addComplement", + "compatible", "coverage", "crossTable", "c", 
diff --git a/NEWS.md b/NEWS.md index d636d3a..e519773 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,14 @@ # arules 1.6-7.1 (xx/xx/2021) +## New Feature +* Added new method compatible to itemMatrix to check if the item coding is compatible + between two objects. +* c() now produces a warning if two itemMatrices with different itemCoding are combined. +* encode and recode accept now for itemLabels also objects with an itemLabels method. +* recode is now also available for association (itemsets and rules). +## Changes +* recode: parameter match is now deprecated. ## Bug Fixes * fixed addAggregate problem with character (reported by javiercoh). @@ -69,7 +77,7 @@ * discretizeDF now reports which column produces the problem. ## Changes -* transactions: numeric columns are now discretized during coersion using discretizeDF (with a warning). +* transactions: numeric columns are now discretized during coercion using discretizeDF (with a warning). ## Bug Fixes * The spurious warning for reaching maxlen in apriori is now removed (reported by Ryan J. Cole). @@ -104,7 +112,7 @@ # arules 1.5-5 (01/09/2018) ## New Features -* Added (absolut support) "count" as an interest measure. +* Added (absolute support) "count" as an interest measure. * itemLabels can now be assigned for rules and itemsets. ## Bug Fixes @@ -132,7 +140,7 @@ ## Bug Fixes * Improved PROTECT placement in C source code. -* itemMeasures for single rules/itemssets now returns a proper data.frame +* itemMeasures for single rules/itemsets now returns a proper data.frame (reported by lordbitin). * itemMeasures: Added missing parentheses in kappa calculation and fixed equation for least contradiction (reported by Feng Chen). @@ -252,7 +260,7 @@ * subset extraction: added checks, handles now NAs and recycles for logical. * read.transactions gained arguments skip and quote and some defaults for read and write (uses now quotes and no rownames by default) have changed. 
-* itemMatrix: coersion from matrix checks now for 0-1 matrix with a warning. +* itemMatrix: coercion from matrix checks now for 0-1 matrix with a warning. * APRIORI and ECLAT report now absolute minimum support. * APRIORI: out-of-memory while rule building does now result in an error and not a memory fault. diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 37cf3b6..3f42115 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -45,6 +45,9 @@ setGeneric("DATAFRAME", setGeneric("addComplement", function(x, labels, complementLabels=NULL) standardGeneric("addComplement")) +setGeneric("compatible", + function(x, y) standardGeneric("compatible")) + setGeneric("coverage", function(x, transactions = NULL, reuse = TRUE) standardGeneric("coverage")) diff --git a/R/itemCoding.R b/R/itemCoding.R index 2e5d028..7db633a 100644 --- a/R/itemCoding.R +++ b/R/itemCoding.R @@ -36,7 +36,9 @@ setMethod("encode", signature(x = "character"), ## regular encoding r <- which(itemLabels %in% x) if (length(r) < length(x)) - stop("Unknown item label(s) in ", deparse(x)) + warning("The following item labels are not available in itemLabels: ", + paste(setdiff(x, itemLabels), collapse = ", "), + "\nItems with missing labels are dropped!", call. = FALSE) r } ) @@ -47,20 +49,21 @@ setMethod("encode", signature(x = "numeric"), if (itemMatrix == TRUE) return(encode(list(x), itemLabels, itemMatrix == TRUE)) - ## handle empty sets if (length(x)==0) return(integer(0)) ## regular encoding r <- range(x) if (r[1] < 1 || r[2] > length(itemLabels)) - stop("Invalid range in ", deparse(x)) + stop("Invalid item ID in ", deparse(x), call. = FALSE) + + ## deal with numeric if (!is.integer(x)) { - if (!all.equal(x, (i <- as.integer(x)))) - stop("Invalid numeric values in ", deparse(x)) - i - } else - x + if (any(x %% 1 != 0)) + stop("Invalid item ID (needs to be integer) in ", deparse(x), call. 
= FALSE) + x <- as.integer(x) + } + x } ) @@ -68,6 +71,10 @@ setMethod("encode", signature(x = "numeric"), ## directly in internal code. setMethod("encode", signature(x = "list"), function(x, itemLabels, itemMatrix = TRUE) { + if(is(itemLabels, "itemMatrix") || + is(itemLabels, "association")) itemLabels <- itemLabels(itemLabels) + + # this calls encode for character i <- lapply(x, encode, itemLabels, itemMatrix = FALSE) if (itemMatrix == FALSE) return(i) @@ -99,22 +106,32 @@ setMethod("encode", signature(x = "list"), ## recode to make compatible setMethod("recode", signature(x = "itemMatrix"), function(x, itemLabels = NULL, match = NULL) { + + ### FIXME: Deprecated + if(!is.null(match)) message("recode: parameter 'match' is deprecated. Use 'itemLabels' instead.") + if(!is.null(itemLabels) && !is.null(match)) stop("'match' and 'itemLabels' cannot both be specified") if(is.null(itemLabels)) if(is.null(match)) stop("Either 'match' or 'itemLabels' has to be specified") else itemLabels <- itemLabels(match) + ### END + if(is(itemLabels, "itemMatrix") || + is(itemLabels, "association")) itemLabels <- itemLabels(itemLabels) + + ## nothing to do + if(identical(itemLabels(x), itemLabels)) return(x) + k <- match(itemLabels(x), itemLabels) if (any(is.na(k))) - stop ("All item labels in x must be contained in ", - "'itemLabels' or 'match'.") + stop ("All item labels in x must be contained in 'itemLabels'.", call. 
= FALSE) ## recode items if (any(k != seq(length(k)))) x@data <- .Call(R_recode_ngCMatrix, x@data, k) - ## enlarge + ## enlarge matrix for additional items if (x@data@Dim[1] < length(itemLabels)) x@data@Dim[1] <- length(itemLabels) @@ -129,4 +146,27 @@ setMethod("recode", signature(x = "itemMatrix"), } ) +setMethod("recode", signature(x = "itemsets"), + function(x, itemLabels = NULL, match = NULL) { + x@items <- recode(x@items, itemLabels, match) + x + } +) + +setMethod("recode", signature(x = "rules"), + function(x, itemLabels = NULL, match = NULL) { + x@lhs <- recode(x@lhs, itemLabels, match) + x@rhs <- recode(x@rhs, itemLabels, match) + x + } +) + +setMethod("compatible", signature(x = "itemMatrix"), + function(x, y) identical(itemLabels(x), itemLabels(y)) +) + +setMethod("compatible", signature(x = "associations"), + function(x, y) identical(itemLabels(x), itemLabels(y)) +) + ### diff --git a/R/itemMatrix.R b/R/itemMatrix.R index dcb1a53..eb27a63 100644 --- a/R/itemMatrix.R +++ b/R/itemMatrix.R @@ -133,12 +133,9 @@ setAs("itemMatrix", "list", setMethod("LIST", signature(from = "itemMatrix"), function(from, decode = TRUE) { - if (decode) { - to <- .Call(R_asList_ngCMatrix, from@data, itemLabels(from)) - names(to) <- itemsetInfo(from)[["itemsetID"]] - to - } else - .Call(R_asList_ngCMatrix, from@data, NULL) + l <- .Call(R_asList_ngCMatrix, from@data, if(decode) itemLabels(from) else NULL) + if(decode) names(l) <- itemsetInfo(from)[["itemsetID"]] + l } ) @@ -331,21 +328,30 @@ setMethod("c", signature(x = "itemMatrix"), for (y in args) { if (!is(y, "itemMatrix")) stop("can only combine itemMatrix") + x@itemsetInfo <- .combineMeta(x, y, "itemsetInfo") - k <- match(itemLabels(y), itemLabels(x)) - n <- which(is.na(k)) - if (length(n)) { - k[n] <- x@data@Dim[1] + seq(length(n)) - x@data@Dim[1] <- x@data@Dim[1] + length(n) - x@itemInfo <- rbind(x@itemInfo, - y@itemInfo[n,, drop = FALSE]) + + if(!compatible(x, y)) { + warning("Item coding not compatible, recoding item 
matrices.") + + # expand x if y has additional items + k <- match(itemLabels(y), itemLabels(x)) + n <- which(is.na(k)) + if (length(n)) { + k[n] <- x@data@Dim[1] + seq(length(n)) + x@data@Dim[1] <- x@data@Dim[1] + length(n) + x@itemInfo <- rbind(x@itemInfo, + y@itemInfo[n,, drop = FALSE]) + } + + # recode y to match x + if (any(k != seq_len(length(k)))) + y@data <- .Call(R_recode_ngCMatrix, y@data, k) + if (y@data@Dim[1] < x@data@Dim[1]) + y@data@Dim[1] <- x@data@Dim[1] } - if (any(k != seq_len(length(k)))) - y@data <- .Call(R_recode_ngCMatrix, y@data, k) - if (y@data@Dim[1] < x@data@Dim[1]) - y@data@Dim[1] <- x@data@Dim[1] - ## this is faste than x@data <- cbind(x@data, y@data) + ## this is faster than x@data <- cbind(x@data, y@data) x@data <- .Call(R_cbind_ngCMatrix, x@data, y@data) } validObject(x, complete = TRUE) @@ -396,16 +402,22 @@ setMethod("unique", signature(x = "itemMatrix"), ## and uses more efficient prefix tree C code setMethod("match", signature(x = "itemMatrix", table = "itemMatrix"), function(x, table, nomatch = NA_integer_, incomparables = NULL) { - k <- match(itemLabels(x), itemLabels(table)) - n <- which(is.na(k)) - if (length(n)) { - k[n] <- table@data@Dim[1] + seq(length(n)) - table@data@Dim[1] <- table@data@Dim[1] + length(n) + + if(!compatible(x, table)) { + warning("Item coding not compatible, recoding item matrices first.") + + k <- match(itemLabels(x), itemLabels(table)) + n <- which(is.na(k)) + if (length(n)) { + k[n] <- table@data@Dim[1] + seq(length(n)) + table@data@Dim[1] <- table@data@Dim[1] + length(n) + } + if (any(k != seq_len(length(k)))) + x@data <- .Call(R_recode_ngCMatrix, x@data, k) + if (x@data@Dim[1] < table@data@Dim[1]) + x@data@Dim[1] <- table@data@Dim[1] } - if (any(k != seq_len(length(k)))) - x@data <- .Call(R_recode_ngCMatrix, x@data, k) - if (x@data@Dim[1] < table@data@Dim[1]) - x@data@Dim[1] <- table@data@Dim[1] + i <- .Call(R_pnindex, table@data, x@data, FALSE) match(i, seq_len(length(table)), nomatch = nomatch, 
incomparables = incomparables) diff --git a/R/transactions.R b/R/transactions.R index 3c9fd9c..e3f4569 100644 --- a/R/transactions.R +++ b/R/transactions.R @@ -59,7 +59,7 @@ setMethod("LIST", signature(from = "transactions"), function(from, decode = TRUE) { l <- LIST(as(from, "itemMatrix"), decode) if(decode) names(l) <- transactionInfo(from)$transactionID - l + l }) setAs("data.frame", "transactions", diff --git a/README.md b/README.md index 37e328e..32d180e 100644 --- a/README.md +++ b/README.md @@ -164,5 +164,4 @@ Questions should be posted on [stackoverflow and tagged with arules](https://sta * Michael Hahsler, Sudheer Chelluboina, Kurt Hornik, and Christian Buchta. [The arules R-package ecosystem: Analyzing interesting patterns from large transaction datasets.](https://jmlr.csail.mit.edu/papers/v12/hahsler11a.html) _Journal of Machine Learning Research,_ 12:1977-1981, 2011. * Michael Hahsler, Bettina Grün and Kurt Hornik. [arules - A Computational Environment for Mining Association Rules and Frequent Item Sets.](https://dx.doi.org/10.18637/jss.v014.i15) _Journal of Statistical Software,_ 14(15), 2005. -* Hahsler, Michael. -[A Probabilistic Comparison of Commonly Used Interest Measures for Association Rules](https://michael.hahsler.net/research/association_rules/measures.html), 2015, URL: https://michael.hahsler.net/research/association_rules/measures.html. +* Hahsler, Michael. [A Probabilistic Comparison of Commonly Used Interest Measures for Association Rules](https://michael.hahsler.net/research/association_rules/measures.html), 2015, URL: https://michael.hahsler.net/research/association_rules/measures.html. diff --git a/man/Mushroom.Rd b/man/Mushroom.Rd index 1c28fe2..cbd4986 100644 --- a/man/Mushroom.Rd +++ b/man/Mushroom.Rd @@ -9,7 +9,7 @@ It contains information about 8124 mushrooms (transactions). 4208 (51.8\%) are edible and 3916 (48.2\%) - are poisonous. The data contains 22 nomoinal features plus the class attribure + are poisonous. 
The data contains 22 nominal features plus the class attribute (edible or not). These features were translated into 114 items. } diff --git a/man/apriori.Rd b/man/apriori.Rd index 99ddb54..7d4270a 100644 --- a/man/apriori.Rd +++ b/man/apriori.Rd @@ -16,7 +16,7 @@ apriori(data, parameter = NULL, appearance = NULL, control = NULL) \code{\linkS4class{transactions}} or any data structure which can be coerced into \code{\linkS4class{transactions}} (e.g., a binary - matrix or data.frame).} + matrix or a data.frame).} \item{parameter}{object of class \code{\linkS4class{APparameter}} or named list. The default behavior is to mine rules with minimum support of 0.1, @@ -33,7 +33,10 @@ apriori(data, parameter = NULL, appearance = NULL, control = NULL) algorithm (item sorting, report progress (verbose), etc.)} } \details{ - \bold{Automatic conversion to transactions.} + \bold{Warning about automatic conversion of matrices or data.frames to transactions.} + It is preferred to coerce data to transactions manually before calling \code{apriori} to have control over item coding. This is especially important when you are working with multiple datasets or several subsets of the same dataset. To read about item coding, see + \code{\link{itemCoding}}. + If a data.frame is specified as \code{x}, then the data is automatically converted into transactions by discretizing numeric data using \code{discretizeDF} and then coercion to transactions. The discretization may fail if the data is not well behaved. @@ -99,6 +102,10 @@ apriori(data, parameter = NULL, appearance = NULL, control = NULL) \author{Michael Hahsler and Bettina Gruen} \examples{ data("Adult") +## Note: Adult is already a transactions dataset. If you are using a data.frame, then +## you should coerce it first to transactions using: +## yourTrans <- as(yourData, "transactions") + ## Mine association rules. 
rules <- apriori(Adult, parameter = list(supp = 0.5, conf = 0.9, target = "rules")) @@ -108,6 +115,7 @@ summary(rules) \code{\link{APparameter-class}}, \code{\link{APcontrol-class}}, \code{\link{APappearance-class}}, + \code{\link{itemCoding}}, \code{\link{transactions-class}}, \code{\link{itemsets-class}}, \code{\link{rules-class}} diff --git a/man/associations-class.Rd b/man/associations-class.Rd index 8f70272..876a35f 100644 --- a/man/associations-class.Rd +++ b/man/associations-class.Rd @@ -29,7 +29,7 @@ associations. } \details{ -The implementations of \code{associations} store itemsets (e.g., the LHS and RHS of a rule) as objects of class \code{\link{itemMatrix}} (i.e., sparse binary matrices). Quality measures (e.g., support) are stored in a data.frame accessable via method \code{quality}. +The implementations of \code{associations} store itemsets (e.g., the LHS and RHS of a rule) as objects of class \code{\link{itemMatrix}} (i.e., sparse binary matrices). Quality measures (e.g., support) are stored in a data.frame accessible via method \code{quality}. Associations can store multisets with duplicated elements. Duplicated elements can result from combining several sets of associations. diff --git a/man/crossTable.Rd b/man/crossTable.Rd index 16dede0..0613797 100644 --- a/man/crossTable.Rd +++ b/man/crossTable.Rd @@ -14,9 +14,9 @@ crossTable(x, ...) \arguments{ \item{x}{ object to be cross-tabulated (\code{transactions} or \code{itemMatrix}).} - \item{measure}{ measure to return. Default is co-occurence counts. } + \item{measure}{ measure to return. Default is co-occurrence counts. } \item{sort}{ sort the items by support. } - \item{...}{ aditional arguments. } + \item{...}{ additional arguments. 
} } \value{ A symmetric matrix of n time n, where n is the number of items times diff --git a/man/discretize.Rd b/man/discretize.Rd index 5aff8b1..61f7c74 100644 --- a/man/discretize.Rd +++ b/man/discretize.Rd @@ -54,7 +54,7 @@ Discretize calculates breaks between intervals using various methods and then us Discretization may fail for several reasons. Some reasons are \itemize{ \item A variable contains only a single value. In this case, the variable should be dropped or directly converted into a factor with a single level (see \code{\link{factor}}). -\item Some caclulated breaks are not unique. This can happen for method frequency with very skewed data (e.g., a large portion of the values is 0). In this case, non-unique breaks are dropped with a warning. It would be probably better to look at the histogram of the data and decide on breaks for the method fixed. +\item Some calculated breaks are not unique. This can happen for method frequency with very skewed data (e.g., a large portion of the values is 0). In this case, non-unique breaks are dropped with a warning. It would be probably better to look at the histogram of the data and decide on breaks for the method fixed. } \code{discretize} only implements unsupervised discretization. See diff --git a/man/is.superset.Rd b/man/is.superset.Rd index 4217425..666748a 100644 --- a/man/is.superset.Rd +++ b/man/is.superset.Rd @@ -22,7 +22,7 @@ is.superset(x, y = NULL, proper = FALSE, sparse = TRUE, ...) the super or subset structure within set \code{x} is calculated.} \item{proper}{a logical indicating if all or just proper super or subsets.} \item{sparse}{a logical indicating if a sparse (ngCMatrix) rather than a - dense logical matrix sgould be returned. Sparse computation + dense logical matrix should be returned. 
Sparse computation preserves a significant amount of memory and is much faster for large sets.} \item{\dots}{ currently unused.} } diff --git a/man/itemCoding.Rd b/man/itemCoding.Rd index 4a3ae6f..e069a99 100644 --- a/man/itemCoding.Rd +++ b/man/itemCoding.Rd @@ -10,9 +10,14 @@ \alias{encode,character-method} \alias{recode} \alias{recode,itemMatrix-method} +\alias{recode,itemsets-method} +\alias{recode,rules-method} +\alias{compatible} +\alias{compatible,itemMatrix-method} +\alias{compatible,associations-method} \title{Item Coding --- Conversion between Item Labels and Column IDs} \description{ - Provides the generic functions and the S4 methods to translate between the binary representation in the itemMatrix format (used in transactions, rules and itemsets), item labels and numeric item IDs (i.e., the column numbers in the binary representation). + The order in which items are stored in an \code{itemMatrix} is called the \emph{item coding}. The following generic functions and S4 methods are used to translate between the binary representation in the itemMatrix format (used in transactions, rules and itemsets), item labels and numeric item IDs (i.e., the column numbers in the binary representation). } \usage{ encode(x, \ldots) @@ -20,8 +25,12 @@ encode(x, \ldots) \S4method{encode}{character}(x, itemLabels, itemMatrix = TRUE) \S4method{encode}{numeric}(x, itemLabels, itemMatrix = TRUE) +compatible(x, y) + recode(x, \ldots) \S4method{recode}{itemMatrix}(x, itemLabels = NULL, match = NULL) +\S4method{recode}{itemsets}(x, itemLabels = NULL, match = NULL) +\S4method{recode}{rules}(x, itemLabels = NULL, match = NULL) decode(x, \ldots) \S4method{decode}{list}(x, itemLabels) @@ -33,13 +42,12 @@ decode(x, \ldots) an object of class \code{itemMatrix} (for \code{recode}).} \item{itemLabels}{a vector of character strings used for coding where the position of an item label in the vector gives the item's column ID. 
- The used \code{itemLabels} vector can be obtained from \code{itemMatrix}, - \code{transactions} and \code{associations} by the - method \code{itemLabels}.} + Alternatively, an \code{itemMatrix}, + \code{transactions} or \code{associations} object can be specified and the item labels of these objects are used.} \item{itemMatrix}{return an object of class \code{itemMatrix} otherwise an object of the same class as \code{x} is returned.} -\item{match}{an \code{itemMatrix} object whose item coding \code{x} - should match.} +\item{y}{ an object of class \code{itemMatrix}, \code{transactions} or \code{associations} to compare item coding to \code{x}. } +\item{match}{ deprecated: use \code{itemLabels} instead.} \item{\ldots}{further arguments.} } \value{ @@ -52,16 +60,24 @@ For \code{encode} with \code{itemMatrix = TRUE} an object list or a vector. } \details{ + +\bold{Item compatibility:} +If you deal with several datasets or different subsets of the same dataset and want to combine or compare the found itemsets or rules, then you need to make sure that all transaction sets have a compatible item coding. That is, the sparse matrices representing the items have columns for the same items in exactly the same order. The coercion to transactions with \code{as(x, "transactions")} will create the item coding by adding items when they are encountered in the dataset. This can lead to different item codings (different order, missing items) for even only slightly different datasets. You can use the method \code{compatible} to check if two sets have the same item coding. + +If you work with many sets, then you should first define a common item coding by creating a vector with all possible item labels and then use either \code{encode} to create transactions or \code{recode} to make a different set compatible. + +The following functions help with creating and changing the item coding to make them compatible. 
+ \code{encode} converts from readable item labels to an itemMatrix using a given coding. With this method it is possible to create several compatible \code{itemMatrix} objects (i.e., use the same binary representation for items) from data. -\code{recode} recodes an itemMatrix object so its coding is compatible -with another itemMatrix object specified in \code{match} (i.e., the colums are reordered to match). - -\code{decode} converts from the colun IDs used in the temMatrix representation to +\code{decode} converts from the column IDs used in the itemMatrix representation to item labels. \code{decode} is used by \code{\link{LIST}}. + +\code{recode} recodes an itemMatrix object so its coding is compatible +with another itemMatrix object specified in \code{itemLabels} (i.e., the columns are reordered to match). } \seealso{ @@ -74,57 +90,104 @@ item labels. \code{decode} is used by \code{\link{LIST}}. data("Adult") ## Example 1: Manual decoding -## get code +## Extract the item coding as a vector of item labels. 
iLabels <- itemLabels(Adult) head(iLabels) -## get undecoded list and decode in a second step +## get undecoded list (itemIDs) list <- LIST(Adult[1:5], decode = FALSE) list +## decode itemIDs by replacing them with the appropriate item label decode(list, itemLabels = iLabels) -## Example 2: Manually create an itemMatrix + +## Example 2: Manually create an itemMatrix using iLabels as the common item coding data <- list( c("income=small", "age=Young"), c("income=large", "age=Middle-aged") ) -iM <- encode(data, iLabels) +# Option a: encode to match the item coding in Adult +iM <- encode(data, itemLabels = Adult) iM - inspect(iM) +compatible(iM, Adult) + +# Option b: coercion plus recode to make it compatible to Adult +# (note: the coding has 115 item columns after recode) +iM <- as(data, "itemMatrix") +iM +compatible(iM, Adult) -## use the itemMatrix to create transactions -as(iM, "transactions") +iM <- recode(iM, itemLabels = Adult) +iM +compatible(iM, Adult) -## Example 3: use recode +## Example 3: use recode to make itemMatrices compatible ## select first 100 transactions and all education-related items sub <- Adult[1:100, itemInfo(Adult)$variables == "education"] itemLabels(sub) image(sub) +## After choosing only a subset of items (columns), the item coding is now +## no longer compatible with the Adult dataset +compatible(sub, Adult) + ## recode to match Adult again -sub.recoded <- recode(sub, match = Adult) +sub.recoded <- recode(sub, itemLabels = Adult) image(sub.recoded) + ## Example 4: manually create 2 new transaction for the Adult data set ## Note: check itemLabels(Adult) to see the available labels for items -twoTransactions <- as(encode(list( - c("age=Young", "relationship=Unmarried"), - c("age=Senior") - ), itemLabels = itemLabels(Adult)), - "transactions") - +twoTransactions <- as( + encode(list( + c("age=Young", "relationship=Unmarried"), + c("age=Senior") + ), itemLabels = Adult), + "transactions") + +twoTransactions inspect(twoTransactions) -## 
Example 5: manually create a rule and calculate interest measures + +## Example 5: Use a common item coding + +# coercion to transactions will produce different item codings +trans1 <- as(list( + c("age=Young", "relationship=Unmarried"), + c("age=Senior") + ), "transactions") +trans1 + +trans2 <- as(list( + c("age=Middle-aged", "relationship=Married"), + c("relationship=Unmarried", "age=Young") + ), "transactions") +trans2 + +compatible(trans1, trans2) + +# produce common item coding (all item labels in the two sets) +commonItemLabels <- union(itemLabels(trans1), itemLabels(trans2)) +commonItemLabels + +trans1 <- recode(trans1, itemLabels = commonItemLabels) +trans1 +trans2 <- recode(trans2, itemLabels = commonItemLabels) +trans2 + +compatible(trans1, trans2) + + +## Example 6: manually create a rule and calculate interest measures aRule <- new("rules", lhs = encode(list(c("age=Young", "relationship=Unmarried")), - itemLabels = itemLabels(Adult)), + itemLabels = Adult), rhs = encode(list(c("income=small")), - itemLabels = itemLabels(Adult)) + itemLabels = Adult) ) quality(aRule) <- interestMeasure(aRule, diff --git a/man/itemMatrix-class.Rd b/man/itemMatrix-class.Rd index 6db4391..c7f6f69 100644 --- a/man/itemMatrix-class.Rd +++ b/man/itemMatrix-class.Rd @@ -51,8 +51,10 @@ itemsets or transactions) and the corresponding item labels. } \details{ -Sets of itemsets are represented as sparse binary matrices. -If you work with several itemMatrices at the same time (e.g., +Sets of itemsets (or transactions) are represented as a compressed sparse binary matrix. +Columns represent items and rows are the set/transactions. In the compressed form, each itemset is a vector of column indices (called item IDs) representing the items. 
+ +\bold{Note:} If you work with several itemMatrices at the same time (e.g., several transaction sets, lhs and rhs of a rule, etc.), then the encoding (itemLabes and order of the items in the binary matrix) in the different itemMatrices is important and needs to conform. @@ -205,37 +207,72 @@ See \code{\link{itemCoding}} to learn how to encode and recode itemMatrix object \examples{ set.seed(1234) -## Generate random data and coerce data to itemMatrix. -m <- matrix(runif(100000)>0.8, ncol=20) -dimnames(m) <- list(NULL, paste("item", c(1:20), sep="")) -i <- as(m, "itemMatrix") +## Generate a logical matrix with 5000 random itemsets for 20 items +m <- matrix(runif(5000*20)>0.8, ncol=20, + dimnames = list(NULL, paste("item", c(1:20), sep=""))) +head(m) + +## Coerce the logical matrix into an itemMatrix object +imatrix <- as(m, "itemMatrix") +imatrix + +## An itemMatrix contains a set of itemsets (each row is an itemset). +## The length of the set is the number of rows. +length(imatrix) -## Get the number of elements (rows) in the itemMatrix. -length(i) +## The sparse matrix also has regular matrix dimensions. +dim(imatrix) +nrow(imatrix) +ncol(imatrix) + +## Subsetting: Get first 5 elements (rows) of the itemMatrix. This can be done in +## several ways. +imatrix[1:5] ### get elements 1:5 +imatrix[1:5, ] ### Matrix subsetting for rows 1:5 +head(imatrix, n = 5) ### head() ## Get first 5 elements (rows) of the itemMatrix as list. -as(i[1:5], "list") +as(imatrix[1:5], "list") ## Get first 5 elements (rows) of the itemMatrix as matrix. -as(i[1:5], "matrix") +as(imatrix[1:5], "matrix") ## Get first 5 elements (rows) of the itemMatrix as sparse ngCMatrix. -## Warning: for efficiency reasons, the ngCMatrix you get is transposed! -as(i[1:5], "ngCMatrix") +## Warning: For efficiency reasons, the ngCMatrix is transposed! 
+as(imatrix[1:5], "ngCMatrix") ## Get labels for the first 5 itemsets (first default and then with ## custom formating) -labels(i[1:5]) -labels(i[1:5], itemSep = " + ", setStart = "", setEnd = "") - -## create itemsets from itemMatrix -is <- new("itemsets", items = i[1:3]) -inspect(is) - -## create rules (rhs and lhs cannot share items so I use -## itemSetdiff here). Also assign (random) support. -rules <- new("rules", lhs=itemSetdiff(i[4:6],i[1:3]), rhs=i[1:3], - quality = data.frame(support = runif(3))) -inspect(rules) +labels(imatrix[1:5]) +labels(imatrix[1:5], itemSep = " + ", setStart = "", setEnd = "") + +## Create itemsets manually from an itemMatrix. Itemsets contain items in the form of +## an itemMatrix and additional quality measures (not supplied in the example). +is <- new("itemsets", items = imatrix) +is +inspect(head(is, n = 3)) + + +## Create rules manually. I use imatrix[4:6] for the lhs of the rules and +## imatrix[1:3] for the rhs. Rhs and lhs cannot share items so I use +## itemSetdiff here. I also assign missing values for the quality measures support +## and confidence. +rules <- new("rules", + lhs = itemSetdiff(imatrix[4:6], imatrix[1:3]), + rhs = imatrix[1:3], + quality = data.frame(support = c(NA, NA, NA), + confidence = c(NA, NA, NA) + )) +rules +inspect(rules) + +## Manually create a itemMatrix with an item encoding that matches imatrix (20 items in order +## item1, item2, ..., item20) +itemset_list <- list(c("item1","item2"), + c("item3")) + +imatrix_new <- encode(itemset_list, itemLabels = imatrix) +imatrix_new +compatible(imatrix_new, imatrix) } \keyword{classes} diff --git a/man/match.Rd b/man/match.Rd index 6d9c16f..90a0472 100644 --- a/man/match.Rd +++ b/man/match.Rd @@ -27,7 +27,7 @@ The binary matching operators or often used in \code{\link{subset}}. 
} \usage{ -match(x, table, nomatch = NA_integer_, incomparables = NULL) +match(x, table, nomatch = NA_integer_, incomparables = NULL) x \%in\% table x \%pin\% table diff --git a/man/rules-class.Rd b/man/rules-class.Rd index d5fe6f9..d4a5603 100644 --- a/man/rules-class.Rd +++ b/man/rules-class.Rd @@ -110,10 +110,10 @@ The \code{rules} class represents a set of rules. } } \details{ -Rules are usually created by calling an assoication rule mining algorithm like \code{\link{apriori}}. +Rules are usually created by calling an association rule mining algorithm like \code{\link{apriori}}. Rules store the LHS and the RHS separately as objects of class \code{\link{itemMatrix}}. -To create rules manualy, the itemMatrix for the LHS and the RHS +To create rules manually, the itemMatrix for the LHS and the RHS of the rules can be created using \code{\link{itemCoding}}. Note the two matrices need to have the itemLabels (i.e., columns of the sparse matrix) in the same order. diff --git a/man/write.Rd b/man/write.Rd index ff16ce8..a766923 100644 --- a/man/write.Rd +++ b/man/write.Rd @@ -41,7 +41,7 @@ write(x, file = "",\dots) Note: To save and load associations in compact form, use \code{save} and \code{load} from the \pkg{base} package. Alternatively, association can be written to disk in PMML (Predictive Model Markup Language) - via \code{write.PMML}. This requiresvpackages \pkg{pmml}. + via \code{write.PMML}. This requires package \pkg{pmml}. 
} \seealso{ \code{\link{read.transactions}} for reading transactions from a file, diff --git a/tests/testthat/test-itemCoding.R b/tests/testthat/test-itemCoding.R new file mode 100644 index 0000000..6821cbd --- /dev/null +++ b/tests/testthat/test-itemCoding.R @@ -0,0 +1,43 @@ +library("testthat") +library("arules") + +context("itemCoding") + +data("Adult") + +list <- LIST(Adult[1:5], decode = FALSE) +list_decoded1 <- decode(list, itemLabels = itemLabels(Adult)) +list_decoded2 <- LIST(Adult[1:5]) + +### no list element names for decode = FALSE +names(list_decoded2) <- NULL + +expect_equal(list_decoded1, list_decoded2) + + +## Example 2: Manually create an itemMatrix +data <- list( + c("income=small", "age=Young"), + c("income=large", "age=Middle-aged") +) + +iM <- encode(data, itemLabels = Adult) + + +### non existing item are dropped with a warning now. +data2 <- list( + c("income=small", "age=Young"), + c("income=large", "not_an_item") +) + +expect_warning(iM <- encode(data2, itemLabels = Adult)) +expect_identical(size(iM), c(2L, 1L)) + +### test encoding +iM <- encode(list(c(1,2,3), c(4,5)), itemLabels(Adult)) +expect_identical(size(iM), c(3L, 2L)) + +expect_error(encode(list(c(1,2,3), c(4,5.5)), itemLabels(Adult))) +expect_error(encode(list(c(1,2,3), c(4,5, nitems(Adult)+1L)), itemLabels(Adult))) + + diff --git a/tests/testthat/test-sets.R b/tests/testthat/test-sets.R index 0a30de3..45c3f5d 100644 --- a/tests/testthat/test-sets.R +++ b/tests/testthat/test-sets.R @@ -90,3 +90,14 @@ expect_false(is.element(rules[15], r1)) # union(setA,setB)= setA + setB - intersect(setA,setB) expect_equal(length(union(r1, r2)), length(c(r1, r2)) - length(intersect(r1, r2))) + +# Test setequal with incompatible itemMatrices containing the same itemsets +d1 <- as(data, "itemMatrix") +expect_true(setequal(d1,d1)) + +d2 <- merge(d1[,6:7], d1[,1:5]) +compatible(d1, d2) +expect_warning(expect_true(setequal(d1,d2))) +expect_warning(expect_true(setequal(union(d1, d2), intersect(d1, d2)))) 
+ + diff --git a/tests/testthat/test-transactions.R b/tests/testthat/test-transactions.R index 773ce05..fe21563 100644 --- a/tests/testthat/test-transactions.R +++ b/tests/testthat/test-transactions.R @@ -41,8 +41,8 @@ expect_identical(dimnames(m), dimnames(trans)) expect_equal(c(trans, trans), as(rbind(m, m),"transactions")) ## combine with missing items (needs recoding) -expect_true(all(as(c(trans[,-2], trans[,-3]), "matrix")[1:8,"b"]) == FALSE) -expect_true(all(as(c(trans[,-2], trans[,-3]), "matrix")[9:15,"c"]) == FALSE) +expect_warning(expect_true(all(as(c(trans[,-2], trans[,-3]), "matrix")[1:8,"b"]) == FALSE)) +expect_warning(expect_true(all(as(c(trans[,-2], trans[,-3]), "matrix")[9:15,"c"]) == FALSE)) l <- LIST(trans, decode = FALSE) expect_identical(length(l), nrow(trans)) diff --git a/vignettes/arules.Rnw b/vignettes/arules.Rnw index 817c058..f5c476e 100644 --- a/vignettes/arules.Rnw +++ b/vignettes/arules.Rnw @@ -382,7 +382,9 @@ itemsets. The matrix entries represent the presence (1) or absence (0) of an item in a particular itemset. An example of a binary incidence matrix containing itemsets for the example database in Figure~\ref{table:supermarket} on Page~\pageref{table:supermarket} is -shown in Figure~\ref{fig:itemsetMatrix}. Note that we need to store +shown in Figure~\ref{fig:itemsetMatrix}. The order in which the items are stored (i.e., the order of the columns) is called the \emph{item coding} in \pkg{arules} and +is important when working directly with the binary matrix. +Note also that we need to store collections of itemsets with possibly duplicated elements (identical rows), i.e, itemsets containing exactly the same items. This is necessary, since a transaction database can contain different @@ -457,7 +459,7 @@ Technically, \func{length} returns the number of rows in the matrix which is equal to the first element returned by \func{dim}. %\pkg{arules} also provides set operations including \func{union}, %\func{intersect} and \func{setequal}. 
-Identical itemsets can be found with \func{duplicated}, and duplications +Identical itemsets can be found with \func{duplicated}, and duplicates can be removed with \func{unique}. \func{match} can be used to find matching elements in two collections of itemsets. @@ -537,7 +539,7 @@ Transaction data are normally recorded by point-of-sale scanners and often consists of tuples of the form: \begin{displaymath} -<\emph{transaction ID}, \emph{item ID}, \ldots > +<\emph{transaction ID}, \emph{item ID}, \emph{item ID}, \ldots > \end{displaymath} All tuples with the same transaction ID form a single transaction which @@ -1117,6 +1119,33 @@ predicting memberships for new data. A small example can be found in~\cite{arules:Hahsler+Hornik:2007b}. +\subsection{Support for Item Hierarchies} + +Often an item hierarchy is available for datasets used for association rule mining. For example, in a supermarket dataset, items like ``bread'' and ``bagel'' might belong to the item group (category) ``baked goods.'' Transactions can store item hierarchies as additional columns in the \code{itemInfo} \code{data.frame}. +\pkg{arules} provides support to use an item hierarchy to perform analysis at + a group level of the item hierarchy using \code{aggregate()}, and to +perform multi-level analysis to uncover relationships between individual items and item groups with \code{addAggregate()}. See \code{? hierarchy} for details and examples. + + +\subsection{Item Coding} +Several applications require taking care of how the items are represented in the binary matrices used in transactions and associations. The order in which items are +used as columns in objects of class \code{itemMatrix} is called in \pkg{arules} the +\emph{item coding}. 
Situations where consistent item coding is important include: + +\begin{itemize} +\item Mining several different data sets or different subsets of a data set (e.g., samples) and +performing computations on the resulting associations (e.g., comparing or combining rules). +\item Creating transactions, itemsets or association rules manually. +\end{itemize} + +The item coding is typically determined when data is coerced to transactions with +\code{as(x, "transactions")} and this process can lead to different item codings for +slightly different data sets. The methods \func{encode} and \func{recode} can be used to create and change the item coding to make the representation of transactions, itemsets and rules compatible. To check if two objects use the same item coding, the method \func{compatible} +can be used. + +More information and examples on item coding can be found in the manual page \code{? itemCoding}. + + \section{Examples} \label{sec:examples} @@ -1268,7 +1297,7 @@ data, the attributes were used to predict the income level of individuals. We added the attribute \code{income} with levels \code{small} and \code{large}, representing an income of $\le$~USD~50,000 and $>$~USD~50,000, respectively. This data is -included in \pkg{arules} as the data set \code{AdultUCI}. +included in \pkg{arules} as the data set \code{AdultUCI}, which is a \code{data.frame}. <>= @@ -1280,7 +1309,9 @@ AdultUCI[1:2,] \code{AdultUCI} contains a mixture of categorical and metric attributes and needs some preparations before it can be transformed into -transaction data suitable for association mining. +transaction data suitable for association mining. While the \code{apriori} function +will try to convert the \code{data.frame} to transactions by performing discretization of numbers, it is typically preferred to prepare the data and coerce the dataset into transactions manually. + First, we remove the two attributes \code{fnlwgt} and \code{education-num}. 
The first attribute is a weight calculated by the creators of the data set from control data provided by @@ -1324,19 +1355,17 @@ AdultUCI[[ "capital-loss"]] <- ordered(cut(AdultUCI[[ "capital-loss"]], labels = c("none", "low", "high")) @ -Now, the data can be automatically recoded as -a binary incidence matrix by coercing the data set to -\class{transactions}. +Now, the data can be coerced to +\class{transactions} resulting in +a binary incidence matrix appropriate for association rule mining. <>= Adult <- as(AdultUCI, "transactions") Adult @ -The remaining \Sexpr{dim(Adult)[2]} categorical attributes were -automatically recoded into \Sexpr{dim(Adult)[2]} -binary items. During encoding the item labels were generated in the -form of +The remaining categorical attributes were +automatically recoded into binary items. During encoding the item labels were generated in the form of \texttt{<\emph{variable name}>=<\emph{category label}>}. Note that for cases with missing values all items corresponding to the attributes with the missing values were set to zero.