diff --git a/03-Classification/email_classify.R b/03-Classification/email_classify.R index 2378cae..83da4b6 100644 --- a/03-Classification/email_classify.R +++ b/03-Classification/email_classify.R @@ -1,23 +1,23 @@ -# File-Name: email_classify.R -# Date: 2012-02-10 +# File-Name: email_classify.R +# Date: 2012-02-10 # Author: Drew Conway (drew.conway@nyu.edu) # Purpose: Code for Chapter 3. In this case we introduce the notion of binary classification. -# In machine learning this is a method for determining what of two categories a -# given observation belongs to. To show this, we will create a simple naive Bayes +# In machine learning this is a method for determining what of two categories a +# given observation belongs to. To show this, we will create a simple naive Bayes # classifier for SPAM email detection, and visualize the results. # Data Used: Email messages contained in data/ directory, source: http://spamassassin.apache.org/publiccorpus/ # Packages Used: tm, ggplot2 -# All source code is copyright (c) 2012, under the Simplified BSD License. +# All source code is copyright (c) 2012, under the Simplified BSD License. # For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php -# All images and materials produced by this code are licensed under the Creative Commons +# All images and materials produced by this code are licensed under the Creative Commons # Attribution-Share Alike 3.0 United States License: http://creativecommons.org/licenses/by-sa/3.0/us/ # All rights reserved. -# NOTE: If you are running this in the R console you must use the 'setwd' command to set the -# working directory for the console to whereever you have saved this file prior to running. +# NOTE: If you are running this in the R console you must use the 'setwd' command to set the +# working directory for the console to wherever you have saved this file prior to running. # Otherwise you will see errors when loading data or saving figures! 
# Load libraries @@ -55,11 +55,13 @@ ggsave(plot = ex1, width = 10) # Return a single element vector of just the email body -# This is a very simple approach, as we are only using +# This is a very simple approach, as we are only using # words as features get.msg <- function(path) { - con <- file(path, open = "rt", encoding = "latin1") + # The following line fails on R 3.0.3, Win7 x64 + # con <- file(path, open = "rt", encoding = "latin1") + con <- file(path, encoding = "latin1") text <- readLines(con) # The message always begins after the first full line break msg <- text[seq(which(text == "")[1] + 1, length(text), 1)] @@ -68,8 +70,8 @@ get.msg <- function(path) } # Create a TermDocumentMatrix (TDM) from the corpus of SPAM email. -# The TDM control can be modified, and the sparsity level can be -# altered. This TDM is used to create the feature set used to do +# The TDM control can be modified, and the sparsity level can be +# altered. This TDM is used to create the feature set used to # train our classifier. get.tdm <- function(doc.vec) { @@ -82,8 +84,8 @@ get.tdm <- function(doc.vec) return(doc.dtm) } -# This function takes a file path to an email file and a string, -# the term parameter, and returns the count of that term in +# This function takes a file path to an email file and a string, +# the term parameter, and returns the count of that term in # the email body. count.word <- function(path, term) { @@ -100,14 +102,14 @@ count.word <- function(path, term) return(ifelse(length(term.freq) > 0, term.freq, 0)) } -# This is the our workhorse function for classifying email. It takes -# two required paramters: a file path to an email to classify, and -# a data frame of the trained data. The function also takes two +# This is our workhorse function for classifying email. It takes +# two required parameters: a file path to an email to classify, and +# a data frame of the trained data. The function also takes two # optional parameters.
First, a prior over the probability that an email # is SPAM, which we set to 0.5 (naive), and constant value for the # probability on words in the email that are not in our training data. # The function returns the naive Bayes probability that the given email -# is SPAM. +# is SPAM. classify.email <- function(path, training.df, prior = 0.5, c = 1e-6) { # Here, we use many of the support functions to get the @@ -188,16 +190,16 @@ easyham.df <- transform(easyham.df, density = easyham.density, occurrence = easyham.occurrence) -# Run classifer against HARD HAM +# Run classifier against HARD HAM hardham.docs <- dir(hardham.path) hardham.docs <- hardham.docs[which(hardham.docs != "cmds")] hardham.spamtest <- sapply(hardham.docs, function(p) classify.email(file.path(hardham.path, p), training.df = spam.df)) - + hardham.hamtest <- sapply(hardham.docs, function(p) classify.email(file.path(hardham.path, p), training.df = easyham.df)) - + hardham.res <- ifelse(hardham.spamtest > hardham.hamtest, TRUE, FALSE) @@ -234,7 +236,7 @@ ggsave(plot = init.plot1, filename = file.path("images", "01_init_plot1.pdf"), width = 10, height = 10) - + init.plot2 <- ggplot(init.df, aes(x = html, y = table)) + geom_point(aes(shape = type), position = "jitter") + scale_shape_manual(values = c("SPAM" = 1, "EASYHAM" = 3), name = "Email Type") + @@ -247,7 +249,7 @@ ggsave(plot = init.plot2, width = 10, height = 10) -# Finally, attempt to classify the HARDHAM data using the classifer developed above. +# Finally, attempt to classify the HARDHAM data using the classifier developed above. # The rule is to classify a message as SPAM if Pr(email) = SPAM > Pr(email) = HAM spam.classifier <- function(path) {