Commit
Merge pull request #119 from quanteda/tokenize-function
Tokenize and noun-phrase extraction
kbenoit authored Nov 12, 2018
2 parents 186eb12 + 586a55f commit 0e21059
Showing 17 changed files with 948 additions and 22 deletions.
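
This merge adds two user-facing functions, spacy_tokenize() and spacy_extract_nounphrases(). As a quick orientation before the file-by-file diff, the sketch below shows how they are intended to be called. It assumes a working spaCy installation; spacy_tokenize() is exported in the NAMESPACE changes but its source file is not shown in this excerpt, so the plain call with default arguments is an assumption.

library("spacyr")
spacy_initialize()

txt <- c(doc1 = "Natural language processing is a branch of computer science.",
         doc2 = "Paul earned a postgraduate degree from MIT.")

# new in this merge: tokenization without a full parse
toks <- spacy_tokenize(txt)

# new in this merge: noun-phrase extraction via spaCy's noun_chunks
nps <- spacy_extract_nounphrases(txt, output = "data.frame")

spacy_finalize()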
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -23,4 +23,4 @@ Suggests:
URL: http://github.com/quanteda/spacyr
Encoding: UTF-8
BugReports: https://github.com/quanteda/spacyr/issues
RoxygenNote: 6.1.0
RoxygenNote: 6.1.1
9 changes: 9 additions & 0 deletions NAMESPACE
@@ -2,29 +2,38 @@

S3method(entity_consolidate,spacyr_parsed)
S3method(entity_extract,spacyr_parsed)
S3method(spacy_extract_nounphrases,character)
S3method(spacy_extract_nounphrases,data.frame)
S3method(spacy_parse,character)
S3method(spacy_parse,data.frame)
S3method(spacy_tokenize,character)
S3method(spacy_tokenize,data.frame)
export(entity_consolidate)
export(entity_extract)
export(find_spacy)
export(find_spacy_env)
export(get_attrs)
export(get_dependency)
export(get_named_entities)
export(get_noun_phrases)
export(get_ntokens)
export(get_ntokens_by_sent)
export(get_tags)
export(get_tokens)
export(process_document)
export(spacy_download_langmodel)
export(spacy_download_langmodel_virtualenv)
export(spacy_extract_nounphrases)
export(spacy_finalize)
export(spacy_initialize)
export(spacy_install)
export(spacy_install_virtualenv)
export(spacy_parse)
export(spacy_tokenize)
export(spacy_uninstall)
export(spacy_upgrade)
importFrom(data.table,as.data.table)
importFrom(data.table,data.table)
importFrom(data.table,setDT)
importFrom(data.table,setnames)
importFrom(methods,new)
28 changes: 28 additions & 0 deletions R/parse-extractor-functions.R
@@ -123,6 +123,34 @@ get_dependency <- function(spacy_out) {
return(list(head_id = head_id, dep_rel = dep_rel))
}


#' @rdname get-functions
#' @return \code{get_noun_phrases} returns a data.frame of noun phrases.
#' @export
#' @keywords internal
get_noun_phrases <- function(spacy_out) {
# extract noun phrases from each parsed document
spacyr_pyassign("timestamps", spacy_out$timestamps)
spacyr_pyassign("docnames", spacy_out$docnames)
command_str <- paste("noun_phrases = spobj.extract_nounphrases_dataframe(timestamps = timestamps,",
"docnames = docnames,",
"multithread = False)")
spacyr_pyexec(command_str)
noun_phrases <- spacyr_pyget("noun_phrases")

doc_id <- names(noun_phrases)
data_out <-
data.table::rbindlist(lapply(doc_id, function(x) {
df <- as.data.frame(noun_phrases[[x]], stringsAsFactors = FALSE)
df$doc_id <- x
return(df)
}))
# convert spaCy's 0-based token indices to R's 1-based indexing
data_out[, start_id := start_id + 1][, root_id := root_id + 1]
data.table::setDF(data_out)
# move doc_id to the first column
data_out <- data_out[, c(6, 1:5)]
return(data_out)
}

#' @rdname get-functions
#' @return \code{get_ntokens} returns a count of tokens for each document
#' @export
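
To make the index bookkeeping in get_noun_phrases() concrete, here is a small standalone illustration of the reshaping it performs, using a hypothetical noun_phrases list shaped like the value returned by the Python helper; the phrase values and indices are made up for illustration.

library("data.table")

# hypothetical return value of spobj.extract_nounphrases_dataframe(),
# with spaCy's 0-based token indices
noun_phrases <- list(doc1 = data.frame(
    text = c("Natural language processing", "a branch", "computer science"),
    root_text = c("processing", "branch", "science"),
    start_id = c(0, 4, 7), root_id = c(2, 5, 8), length = c(3, 2, 2),
    stringsAsFactors = FALSE))

# bind the per-document data frames and tag each row with its doc_id
data_out <- rbindlist(lapply(names(noun_phrases), function(x) {
    df <- noun_phrases[[x]]
    df$doc_id <- x
    df
}))
data_out[, start_id := start_id + 1][, root_id := root_id + 1]  # 1-based indexing
setDF(data_out)
data_out[, c(6, 1:5)]  # doc_id moved to the first column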
130 changes: 130 additions & 0 deletions R/spacy_extract_nounphrases.R
@@ -0,0 +1,130 @@
#' Extract noun phrases from a text
#'
#' This function extracts noun phrases from documents, based on the
#' \code{noun_chunks} attribute of document objects parsed by spaCy (see
#' \url{https://spacy.io/usage/linguistic-features#noun-chunks}).
#'
#' @param x a character object or a TIF-compliant
#' corpus data.frame (see \url{https://github.com/ropensci/tif})
#' @param multithread logical; if \code{TRUE}, the processing is parallelized
#'   using the pipe functionality of spaCy (\url{https://spacy.io/api/pipe}).
#' @param output type of returned object, either \code{"list"} or
#' \code{"data.frame"}.
#' @param ... unused
#' @details When the option \code{output = "data.frame"} is selected, the
#'   function returns a \code{data.frame} with the following fields.
#'   \describe{\item{\code{text}}{contents of the noun phrase}
#'   \item{\code{root_text}}{contents of the root token}
#'   \item{\code{start_id}}{serial token ID of the starting token; this number
#'   corresponds to the token sequence of the \code{data.frame} returned by
#'   \code{spacy_tokenize(x)} with default options}
#'   \item{\code{root_id}}{serial token ID of the root token}
#'   \item{\code{length}}{number of tokens in the noun phrase (e.g.
#'   for the noun phrase "individual car owners", \code{length = 3})}}
#'
#' @return either a \code{list} or a \code{data.frame} of the extracted noun phrases
#' @export
#' @examples
#' \donttest{
#' spacy_initialize()
#'
#' txt <- c(doc1 = "Natural language processing is a branch of computer science.",
#' doc2 = "Paul earned a postgraduate degree from MIT.")
#' spacy_extract_nounphrases(txt)
#' spacy_extract_nounphrases(txt, output = "list")
#' }
spacy_extract_nounphrases <- function(x, output = c("data.frame", "list"),
multithread = TRUE, ...) {
UseMethod("spacy_extract_nounphrases")
}


#' @export
#' @importFrom data.table data.table
#' @noRd
spacy_extract_nounphrases.character <- function(x,
output = c("data.frame", "list"),
multithread = TRUE, ...) {

`:=` <- NULL

output <- match.arg(output)

if (!is.null(names(x))) {
docnames <- names(x)
} else {
docnames <- paste0("text", 1:length(x))
}
if (length(x) == 1) {
multithread <- FALSE
}

if (any(duplicated(docnames))) {
stop("Docnames are duplicated.")
} else if (any(nchar(docnames) == 0L)) {
stop("Some docnames are missing.")
}

if (is.null(options()$spacy_initialized)) spacy_initialize()
spacyr_pyexec("try:\n del spobj\nexcept NameError:\n 1")
spacyr_pyexec("texts = []")

if (spacyr_pyget("py_version") != 3) {
message("multithreading for Python 2 is not supported by spacy_extract_nounphrases()")
multithread <- FALSE
}


x <- gsub("\\\\n", "\\\n", x) # replace escaped \\n with a newline
x <- gsub("\\\\t", "\\\t", x) # replace escaped \\t with a tab
x <- gsub("\\\\", "", x) # remove any remaining unnecessary backslashes
x <- unname(x)

## send documents to python
spacyr_pyassign("texts", x)
spacyr_pyassign("docnames", docnames)
spacyr_pyassign("multithread", multithread)


## run noun phrase extraction
spacyr_pyexec("spobj = spacyr()")
if (identical(output, "list")) {
command_str <- paste("noun_phrases = spobj.extract_nounphrases_list(texts = texts,",
"docnames = docnames,",
"multithread = multithread)")
spacyr_pyexec(command_str)
return(spacyr_pyget("noun_phrases"))
} else {
command_str <- paste("noun_phrases = spobj.extract_nounphrases_dataframe(texts = texts,",
"docnames = docnames,",
"multithread = multithread)")
spacyr_pyexec(command_str)
noun_phrases <- spacyr_pyget("noun_phrases")

doc_id <- names(noun_phrases)
data_out <-
data.table::rbindlist(lapply(doc_id, function(x) {
df <- as.data.frame(noun_phrases[[x]], stringsAsFactors = FALSE)
df$doc_id <- x
return(df)
}))
# convert spaCy's 0-based token indices to R's 1-based indexing
data_out[, start_id := start_id + 1][, root_id := root_id + 1]
data.table::setDF(data_out)
# move doc_id to the first column
data_out <- data_out[, c(6, 1:5)]
return(data_out)
}
}
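
For reference, a sketch of the two output shapes the character method is designed to produce, using the txt object from the roxygen example above. The printed values are illustrative rather than captured from an actual spaCy run, and the list shape (a named list of noun-phrase strings per document) is an assumption, since only the data.frame fields are documented here.

spacy_extract_nounphrases(txt, output = "list")
# $doc1
# [1] "Natural language processing" "a branch" "computer science"
# $doc2
# [1] "Paul" "a postgraduate degree" "MIT"

spacy_extract_nounphrases(txt, output = "data.frame")
#   doc_id                         text  root_text start_id root_id length
# 1   doc1 Natural language processing processing        1       3      3
# ...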


#' @noRd
#' @export
spacy_extract_nounphrases.data.frame <- function(x, ...) {

# insert compliance check here - replace with tif package
if (!all(c("doc_id", "text") %in% names(x)))
stop("input data.frame does not conform to the TIF standard")

txt <- x$text
names(txt) <- x$doc_id
spacy_extract_nounphrases(txt, ...)
}
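
The data.frame method above requires a TIF-compliant corpus (doc_id and text columns) and dispatches to the character method. A minimal usage sketch with a made-up two-document corpus:

tif_corpus <- data.frame(
    doc_id = c("doc1", "doc2"),
    text = c("Natural language processing is a branch of computer science.",
             "Paul earned a postgraduate degree from MIT."),
    stringsAsFactors = FALSE)

spacy_extract_nounphrases(tif_corpus)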
30 changes: 28 additions & 2 deletions R/spacy_parse.R
@@ -22,7 +22,8 @@
#' @param entity logical; if \code{TRUE}, report named entities
#' @param multithread logical; if \code{TRUE}, the processing is parallelized
#'   using the pipe functionality of spaCy (\url{https://spacy.io/api/pipe}).
#' @param dependency logical; if \code{TRUE}, analyze and return dependencies
#' @param dependency logical; if \code{TRUE}, analyze and return dependency tags
#' @param nounphrase logical; if \code{TRUE}, analyze and return noun phrases
#' @param ... not used directly
#' @return a \code{data.frame} of tokenized, parsed, and annotated tokens
#' @export
@@ -39,28 +40,33 @@
#' doc2 = "This is the second document.",
#' doc3 = "This is a \\\"quoted\\\" text." )
#' spacy_parse(txt2, entity = TRUE, dependency = TRUE)
#'
#' txt3 <- "We analyzed the Supreme Court using natural language processing."
#' sp3 <- spacy_parse(txt3, entity = TRUE, nounphrase = TRUE)
#' }
spacy_parse <- function(x,
pos = TRUE,
tag = FALSE,
lemma = TRUE,
entity = TRUE,
dependency = FALSE,
nounphrase = FALSE,
multithread = TRUE,
...) {
UseMethod("spacy_parse")
}


#' @export
#' @importFrom data.table data.table
#' @importFrom data.table data.table setDT setnames
#' @noRd
spacy_parse.character <- function(x,
pos = TRUE,
tag = FALSE,
lemma = TRUE,
entity = TRUE,
dependency = FALSE,
nounphrase = FALSE,
multithread = TRUE,
...) {

@@ -118,6 +124,21 @@ spacy_parse.character <- function(x,
dt[, entity := get_named_entities(spacy_out)]
}

## noun phrases
if (nounphrase) {
dt_nounphrases <- data.table::setDT(get_noun_phrases(spacy_out))
# repeat each noun-phrase row once for every token that the phrase spans
dt_nounphrases <- dt_nounphrases[rep(1:nrow(dt_nounphrases), times = length)]
# assign a within-document word ID to each spanned token
dt_nounphrases[, w_id := seq(start_id[1], length.out = length[1]), by = .(doc_id, start_id)]
dt[, w_id := seq_len(.N), by = doc_id]
dt <- merge(dt, dt_nounphrases, by = c("doc_id", "w_id"), all.x = TRUE)
# record the token IDs of the phrase's first token and of its root token
dt[!is.na(start_id), start_token_id := token_id[w_id == start_id][1],
by = .(doc_id, root_id)]
dt[!is.na(start_id), root_token_id := token_id[w_id == root_id][1],
by = .(doc_id, root_id)]
dt[, c("w_id", "start_id", "root_id") := NULL]
setnames(dt, c("text", "root_text", "length"),
c("nounphrase", "nounphrase_root_text", "nounphrase_length"))
}

dt <- as.data.frame(dt)
class(dt) <- c("spacyr_parsed", class(dt))
return(dt)
@@ -171,6 +192,11 @@ process_document <- function(x, multithread, ...) {
} else {
docnames <- paste0("text", 1:length(x))
}
if (any(duplicated(docnames))) {
stop("Docnames are duplicated.")
} else if (any(nchar(docnames) == 0L)) {
stop("Some docnames are missing.")
}

if (is.null(options()$spacy_initialized)) spacy_initialize()
spacyr_pyexec("try:\n del spobj\nexcept NameError:\n 1")
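
Tying the spacy_parse() changes together: with nounphrase = TRUE, tokens covered by a noun phrase gain extra columns alongside the usual token-level output. A hedged sketch based on the merge and setnames() logic above; the column names are taken from the code, but the call result is not shown here.

sp3 <- spacy_parse("We analyzed the Supreme Court using natural language processing.",
                   nounphrase = TRUE)

# new columns added by the nounphrase option:
#   nounphrase            - text of the covering noun phrase (NA for other tokens)
#   nounphrase_root_text  - text of the phrase's root token
#   nounphrase_length     - number of tokens in the phrase
#   start_token_id        - token_id of the phrase's first token
#   root_token_id         - token_id of the phrase's root token
names(sp3)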
