From 883658093a3542adeb5b5ba7939a3e224e79b313 Mon Sep 17 00:00:00 2001
From: jnjcc <jnjcc@live.com>
Date: Tue, 29 Jul 2014 03:12:26 +1200
Subject: [PATCH 1/3] Remove trailing whitespace in email_classify.R

---
 03-Classification/email_classify.R | 36 +++++++++++++++---------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/03-Classification/email_classify.R b/03-Classification/email_classify.R
index 2378cae..b062e9e 100644
--- a/03-Classification/email_classify.R
+++ b/03-Classification/email_classify.R
@@ -1,22 +1,22 @@
-# File-Name:       email_classify.R           
-# Date:            2012-02-10                                
+# File-Name:       email_classify.R
+# Date:            2012-02-10
 # Author:          Drew Conway (drew.conway@nyu.edu)
 # Purpose:         Code for Chapter 3. In this case we introduce the notion of binary classification.
-#                   In machine learning this is a method for determining what of two categories a 
-#                   given observation belongs to.  To show this, we will create a simple naive Bayes 
+#                   In machine learning this is a method for determining what of two categories a
+#                   given observation belongs to.  To show this, we will create a simple naive Bayes
 #                   classifier for SPAM email detection, and visualize the results.
 # Data Used:       Email messages contained in data/ directory, source: http://spamassassin.apache.org/publiccorpus/
 # Packages Used:   tm, ggplot2
 
-# All source code is copyright (c) 2012, under the Simplified BSD License.  
+# All source code is copyright (c) 2012, under the Simplified BSD License.
 # For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php
 
-# All images and materials produced by this code are licensed under the Creative Commons 
+# All images and materials produced by this code are licensed under the Creative Commons
 # Attribution-Share Alike 3.0 United States License: http://creativecommons.org/licenses/by-sa/3.0/us/
 
 # All rights reserved.
 
-# NOTE: If you are running this in the R console you must use the 'setwd' command to set the 
+# NOTE: If you are running this in the R console you must use the 'setwd' command to set the
 # working directory for the console to whereever you have saved this file prior to running.
 # Otherwise you will see errors when loading data or saving figures!
 
@@ -55,7 +55,7 @@ ggsave(plot = ex1,
        width = 10)
 
 # Return a single element vector of just the email body
-# This is a very simple approach, as we are only using 
+# This is a very simple approach, as we are only using
 # words as features
 get.msg <- function(path)
 {
@@ -68,8 +68,8 @@ get.msg <- function(path)
 }
 
 # Create a TermDocumentMatrix (TDM) from the corpus of SPAM email.
-# The TDM control can be modified, and the sparsity level can be 
-# altered.  This TDM is used to create the feature set used to do 
+# The TDM control can be modified, and the sparsity level can be
+# altered.  This TDM is used to create the feature set used to do
 # train our classifier.
 get.tdm <- function(doc.vec)
 {
@@ -82,8 +82,8 @@ get.tdm <- function(doc.vec)
   return(doc.dtm)
 }
 
-# This function takes a file path to an email file and a string, 
-# the term parameter, and returns the count of that term in 
+# This function takes a file path to an email file and a string,
+# the term parameter, and returns the count of that term in
 # the email body.
 count.word <- function(path, term)
 {
@@ -100,14 +100,14 @@ count.word <- function(path, term)
   return(ifelse(length(term.freq) > 0, term.freq, 0))
 }
 
-# This is the our workhorse function for classifying email.  It takes 
+# This is the our workhorse function for classifying email.  It takes
 # two required paramters: a file path to an email to classify, and
-# a data frame of the trained data.  The function also takes two 
+# a data frame of the trained data.  The function also takes two
 # optional parameters.  First, a prior over the probability that an email
 # is SPAM, which we set to 0.5 (naive), and constant value for the
 # probability on words in the email that are not in our training data.
 # The function returns the naive Bayes probability that the given email
-# is SPAM.  
+# is SPAM.
 classify.email <- function(path, training.df, prior = 0.5, c = 1e-6)
 {
   # Here, we use many of the support functions to get the
@@ -194,10 +194,10 @@ hardham.docs <- hardham.docs[which(hardham.docs != "cmds")]
 
 hardham.spamtest <- sapply(hardham.docs,
                            function(p) classify.email(file.path(hardham.path, p), training.df = spam.df))
-    
+
 hardham.hamtest <- sapply(hardham.docs,
                           function(p) classify.email(file.path(hardham.path, p), training.df = easyham.df))
-    
+
 hardham.res <- ifelse(hardham.spamtest > hardham.hamtest,
                       TRUE,
                       FALSE)
@@ -234,7 +234,7 @@ ggsave(plot = init.plot1,
        filename = file.path("images", "01_init_plot1.pdf"),
        width = 10,
        height = 10)
-    
+
 init.plot2 <- ggplot(init.df, aes(x = html, y = table)) +
   geom_point(aes(shape = type), position = "jitter") +
   scale_shape_manual(values = c("SPAM" = 1, "EASYHAM" = 3), name = "Email Type") +

From 0773eb47f672b0e26235338ed6c030aa69cdeada Mon Sep 17 00:00:00 2001
From: jnjcc <jnjcc@live.com>
Date: Tue, 29 Jul 2014 21:59:11 +1200
Subject: [PATCH 2/3] Fix some typos in email_classify.R

---
 03-Classification/email_classify.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/03-Classification/email_classify.R b/03-Classification/email_classify.R
index b062e9e..d196903 100644
--- a/03-Classification/email_classify.R
+++ b/03-Classification/email_classify.R
@@ -17,7 +17,7 @@
 # All rights reserved.
 
 # NOTE: If you are running this in the R console you must use the 'setwd' command to set the
-# working directory for the console to whereever you have saved this file prior to running.
+# working directory for the console to wherever you have saved this file prior to running.
 # Otherwise you will see errors when loading data or saving figures!
 
 # Load libraries
@@ -101,7 +101,7 @@ count.word <- function(path, term)
 }
 
 # This is the our workhorse function for classifying email.  It takes
-# two required paramters: a file path to an email to classify, and
+# two required parameters: a file path to an email to classify, and
 # a data frame of the trained data.  The function also takes two
 # optional parameters.  First, a prior over the probability that an email
 # is SPAM, which we set to 0.5 (naive), and constant value for the
@@ -188,7 +188,7 @@ easyham.df <- transform(easyham.df,
                         density = easyham.density,
                         occurrence = easyham.occurrence)
 
-# Run classifer against HARD HAM
+# Run classifier against HARD HAM
 hardham.docs <- dir(hardham.path)
 hardham.docs <- hardham.docs[which(hardham.docs != "cmds")]
 
@@ -247,7 +247,7 @@ ggsave(plot = init.plot2,
        width = 10,
        height = 10)
 
-# Finally, attempt to classify the HARDHAM data using the classifer developed above.
+# Finally, attempt to classify the HARDHAM data using the classifier developed above.
 # The rule is to classify a message as SPAM if Pr(email) = SPAM > Pr(email) = HAM
 spam.classifier <- function(path)
 {

From 1782e55d3e28d881c69433e22604ffc3dedb8874 Mon Sep 17 00:00:00 2001
From: jnjcc <jnjcc@live.com>
Date: Tue, 29 Jul 2014 22:02:31 +1200
Subject: [PATCH 3/3] Fix email_classify.R under Windows

---
 03-Classification/email_classify.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/03-Classification/email_classify.R b/03-Classification/email_classify.R
index d196903..83da4b6 100644
--- a/03-Classification/email_classify.R
+++ b/03-Classification/email_classify.R
@@ -59,7 +59,9 @@ ggsave(plot = ex1,
 # words as features
 get.msg <- function(path)
 {
-  con <- file(path, open = "rt", encoding = "latin1")
+  # The following line fails on R 3.0.3, Win7 x64
+  # con <- file(path, open = "rt", encoding = "latin1")
+  con <- file(path, encoding = "latin1")
   text <- readLines(con)
   # The message always begins after the first full line break
   msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]