Commit
Merge pull request #119 from quanteda/tokenize-function
Tokenize and noun-phrase extraction
kbenoit authored Nov 12, 2018
2 parents 186eb12 + 586a55f commit 0e21059
Showing 17 changed files with 948 additions and 22 deletions.
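
This merge adds two user-facing functions, spacy_tokenize() and spacy_extract_nounphrases(). As a quick orientation before the file-by-file diff, the sketch below shows how they are intended to be called. It assumes a working spaCy installation; spacy_tokenize() is exported in the NAMESPACE changes but its source file is not shown in this excerpt, so the plain call with default arguments is an assumption.

library("spacyr")
spacy_initialize()

txt <- c(doc1 = "Natural language processing is a branch of computer science.",
         doc2 = "Paul earned a postgraduate degree from MIT.")

# new in this merge: tokenization without a full parse
toks <- spacy_tokenize(txt)

# new in this merge: noun-phrase extraction via spaCy's noun_chunks
nps <- spacy_extract_nounphrases(txt, output = "data.frame")

spacy_finalize()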
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -23,4 +23,4 @@ Suggests:
URL: http://github.com/quanteda/spacyr
Encoding: UTF-8
BugReports: https://github.com/quanteda/spacyr/issues
RoxygenNote: 6.1.0
RoxygenNote: 6.1.1
9 changes: 9 additions & 0 deletions NAMESPACE
@@ -2,29 +2,38 @@

S3method(entity_consolidate,spacyr_parsed)
S3method(entity_extract,spacyr_parsed)
S3method(spacy_extract_nounphrases,character)
S3method(spacy_extract_nounphrases,data.frame)
S3method(spacy_parse,character)
S3method(spacy_parse,data.frame)
S3method(spacy_tokenize,character)
S3method(spacy_tokenize,data.frame)
export(entity_consolidate)
export(entity_extract)
export(find_spacy)
export(find_spacy_env)
export(get_attrs)
export(get_dependency)
export(get_named_entities)
export(get_noun_phrases)
export(get_ntokens)
export(get_ntokens_by_sent)
export(get_tags)
export(get_tokens)
export(process_document)
export(spacy_download_langmodel)
export(spacy_download_langmodel_virtualenv)
export(spacy_extract_nounphrases)
export(spacy_finalize)
export(spacy_initialize)
export(spacy_install)
export(spacy_install_virtualenv)
export(spacy_parse)
export(spacy_tokenize)
export(spacy_uninstall)
export(spacy_upgrade)
importFrom(data.table,as.data.table)
importFrom(data.table,data.table)
importFrom(data.table,setDT)
importFrom(data.table,setnames)
importFrom(methods,new)
28 changes: 28 additions & 0 deletions R/parse-extractor-functions.R
@@ -123,6 +123,34 @@ get_dependency <- function(spacy_out) {
return(list(head_id = head_id, dep_rel = dep_rel))
}


#' @rdname get-functions
#' @return \code{get_noun_phrases} returns a data.frame of noun phrases.
#' @export
#' @keywords internal
get_noun_phrases <- function(spacy_out) {
# extract noun phrases from each parsed document
spacyr_pyassign("timestamps", spacy_out$timestamps)
spacyr_pyassign("docnames", spacy_out$docnames)
command_str <- paste("noun_phrases = spobj.extract_nounphrases_dataframe(timestamps = timestamps,",
"docnames = docnames,",
"multithread = False)")
spacyr_pyexec(command_str)
noun_phrases <- spacyr_pyget("noun_phrases")

doc_id <- names(noun_phrases)
data_out <-
data.table::rbindlist(lapply(doc_id, function(x) {
df <- as.data.frame(noun_phrases[[x]], stringsAsFactors = FALSE)
df$doc_id <- x
return(df)
}))
# convert spaCy's 0-based token indices to R's 1-based indexing
data_out[, start_id := start_id + 1][, root_id := root_id + 1]
data.table::setDF(data_out)
# move doc_id to the first column
data_out <- data_out[, c(6, 1:5)]
return(data_out)
}

#' @rdname get-functions
#' @return \code{get_ntokens} returns a count of tokens for each document
#' @export
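
To make the index bookkeeping in get_noun_phrases() concrete, here is a small standalone illustration of the reshaping it performs, using a hypothetical noun_phrases list shaped like the value returned by the Python helper; the phrase values and indices are made up for illustration.

library("data.table")

# hypothetical return value of spobj.extract_nounphrases_dataframe(),
# with spaCy's 0-based token indices
noun_phrases <- list(doc1 = data.frame(
    text = c("Natural language processing", "a branch", "computer science"),
    root_text = c("processing", "branch", "science"),
    start_id = c(0, 4, 7), root_id = c(2, 5, 8), length = c(3, 2, 2),
    stringsAsFactors = FALSE))

# bind the per-document data frames and tag each row with its doc_id
data_out <- rbindlist(lapply(names(noun_phrases), function(x) {
    df <- noun_phrases[[x]]
    df$doc_id <- x
    df
}))
data_out[, start_id := start_id + 1][, root_id := root_id + 1]  # 1-based indexing
setDF(data_out)
data_out[, c(6, 1:5)]  # doc_id moved to the first column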
130 changes: 130 additions & 0 deletions R/spacy_extract_nounphrases.R
@@ -0,0 +1,130 @@
#' Extract noun phrases from a text
#'
#' This function extracts noun phrases from documents, based on the
#' \code{noun_chunks} attribute of document objects parsed by spaCy (see
#' \url{https://spacy.io/usage/linguistic-features#noun-chunks}).
#'
#' @param x a character object or a TIF-compliant
#' corpus data.frame (see \url{https://github.com/ropensci/tif})
#' @param multithread logical; if \code{TRUE}, the processing is parallelized
#'   using the pipe functionality of spaCy (\url{https://spacy.io/api/pipe}).
#' @param output type of returned object, either \code{"list"} or
#' \code{"data.frame"}.
#' @param ... unused
#' @details When the option \code{output = "data.frame"} is selected, the
#'   function returns a \code{data.frame} with the following fields.
#'   \describe{\item{\code{text}}{contents of the noun phrase}
#'   \item{\code{root_text}}{contents of the root token}
#'   \item{\code{start_id}}{serial token ID of the starting token; this number
#'   corresponds to the token sequence of the \code{data.frame} returned by
#'   \code{spacy_tokenize(x)} with default options}
#'   \item{\code{root_id}}{serial token ID of the root token}
#'   \item{\code{length}}{number of tokens in the noun phrase (e.g.
#'   for the noun phrase "individual car owners", \code{length = 3})}}
#'
#' @return either a \code{list} or a \code{data.frame} of the extracted noun phrases
#' @export
#' @examples
#' \donttest{
#' spacy_initialize()
#'
#' txt <- c(doc1 = "Natural language processing is a branch of computer science.",
#' doc2 = "Paul earned a postgraduate degree from MIT.")
#' spacy_extract_nounphrases(txt)
#' spacy_extract_nounphrases(txt, output = "list")
#' }
spacy_extract_nounphrases <- function(x, output = c("data.frame", "list"),
multithread = TRUE, ...) {
UseMethod("spacy_extract_nounphrases")
}


#' @export
#' @importFrom data.table data.table
#' @noRd
spacy_extract_nounphrases.character <- function(x,
output = c("data.frame", "list"),
multithread = TRUE, ...) {

`:=` <- NULL

output <- match.arg(output)

if (!is.null(names(x))) {
docnames <- names(x)
} else {
docnames <- paste0("text", 1:length(x))
}
if (length(x) == 1) {
multithread <- FALSE
}

if (any(duplicated(docnames))) {
stop("Docnames are duplicated.")
} else if (any(nchar(docnames) == 0L)) {
stop("Some docnames are missing.")
}

if (is.null(options()$spacy_initialized)) spacy_initialize()
spacyr_pyexec("try:\n del spobj\nexcept NameError:\n 1")
spacyr_pyexec("texts = []")

if (spacyr_pyget("py_version") != 3) {
message("multithreading for Python 2 is not supported by spacy_extract_nounphrases()")
multithread <- FALSE
}


x <- gsub("\\\\n", "\\\n", x) # replace escaped \\n with a newline
x <- gsub("\\\\t", "\\\t", x) # replace escaped \\t with a tab
x <- gsub("\\\\", "", x) # remove any remaining unnecessary backslashes
x <- unname(x)

## send documents to python
spacyr_pyassign("texts", x)
spacyr_pyassign("docnames", docnames)
spacyr_pyassign("multithread", multithread)


## run noun phrase extraction
spacyr_pyexec("spobj = spacyr()")
if (identical(output, "list")) {
command_str <- paste("noun_phrases = spobj.extract_nounphrases_list(texts = texts,",
"docnames = docnames,",
"multithread = multithread)")
spacyr_pyexec(command_str)
return(spacyr_pyget("noun_phrases"))
} else {
command_str <- paste("noun_phrases = spobj.extract_nounphrases_dataframe(texts = texts,",
"docnames = docnames,",
"multithread = multithread)")
spacyr_pyexec(command_str)
noun_phrases <- spacyr_pyget("noun_phrases")

doc_id <- names(noun_phrases)
data_out <-
data.table::rbindlist(lapply(doc_id, function(x) {
df <- as.data.frame(noun_phrases[[x]], stringsAsFactors = FALSE)
df$doc_id <- x
return(df)
}))
# convert spaCy's 0-based token indices to R's 1-based indexing
data_out[, start_id := start_id + 1][, root_id := root_id + 1]
data.table::setDF(data_out)
# move doc_id to the first column
data_out <- data_out[, c(6, 1:5)]
return(data_out)
}
}
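
For reference, a sketch of the two output shapes the character method is designed to produce, using the txt object from the roxygen example above. The printed values are illustrative rather than captured from an actual spaCy run, and the list shape (a named list of noun-phrase strings per document) is an assumption, since only the data.frame fields are documented here.

spacy_extract_nounphrases(txt, output = "list")
# $doc1
# [1] "Natural language processing" "a branch" "computer science"
# $doc2
# [1] "Paul" "a postgraduate degree" "MIT"

spacy_extract_nounphrases(txt, output = "data.frame")
#   doc_id                         text  root_text start_id root_id length
# 1   doc1 Natural language processing processing        1       3      3
# ...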


#' @noRd
#' @export
spacy_extract_nounphrases.data.frame <- function(x, ...) {

# insert compliance check here - replace with tif package
if (!all(c("doc_id", "text") %in% names(x)))
stop("input data.frame does not conform to the TIF standard")

txt <- x$text
names(txt) <- x$doc_id
spacy_extract_nounphrases(txt, ...)
}
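
The data.frame method above requires a TIF-compliant corpus (doc_id and text columns) and dispatches to the character method. A minimal usage sketch with a made-up two-document corpus:

tif_corpus <- data.frame(
    doc_id = c("doc1", "doc2"),
    text = c("Natural language processing is a branch of computer science.",
             "Paul earned a postgraduate degree from MIT."),
    stringsAsFactors = FALSE)

spacy_extract_nounphrases(tif_corpus)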
30 changes: 28 additions & 2 deletions R/spacy_parse.R
@@ -22,7 +22,8 @@
#' @param entity logical; if \code{TRUE}, report named entities
#' @param multithread logical; if \code{TRUE}, the processing is parallelized
#'   using the pipe functionality of spaCy (\url{https://spacy.io/api/pipe}).
#' @param dependency logical; if \code{TRUE}, analyze and return dependencies
#' @param dependency logical; if \code{TRUE}, analyze and return dependency tags
#' @param nounphrase logical; if \code{TRUE}, analyze and return noun phrases
#' @param ... not used directly
#' @return a \code{data.frame} of tokenized, parsed, and annotated tokens
#' @export
@@ -39,28 +40,33 @@
#' doc2 = "This is the second document.",
#' doc3 = "This is a \\\"quoted\\\" text." )
#' spacy_parse(txt2, entity = TRUE, dependency = TRUE)
#'
#' txt3 <- "We analyzed the Supreme Court using natural language processing."
#' sp3 <- spacy_parse(txt3, entity = TRUE, nounphrase = TRUE)
#' }
spacy_parse <- function(x,
pos = TRUE,
tag = FALSE,
lemma = TRUE,
entity = TRUE,
dependency = FALSE,
nounphrase = FALSE,
multithread = TRUE,
...) {
UseMethod("spacy_parse")
}


#' @export
#' @importFrom data.table data.table
#' @importFrom data.table data.table setDT setnames
#' @noRd
spacy_parse.character <- function(x,
pos = TRUE,
tag = FALSE,
lemma = TRUE,
entity = TRUE,
dependency = FALSE,
nounphrase = FALSE,
multithread = TRUE,
...) {

@@ -118,6 +124,21 @@ spacy_parse.character <- function(x,
dt[, entity := get_named_entities(spacy_out)]
}

## noun phrases
if (nounphrase) {
dt_nounphrases <- data.table::setDT(get_noun_phrases(spacy_out))
# repeat each noun-phrase row once for every token that the phrase spans
dt_nounphrases <- dt_nounphrases[rep(1:nrow(dt_nounphrases), times = length)]
# assign a within-document word ID to each spanned token
dt_nounphrases[, w_id := seq(start_id[1], length.out = length[1]), by = .(doc_id, start_id)]
dt[, w_id := seq_len(.N), by = doc_id]
dt <- merge(dt, dt_nounphrases, by = c("doc_id", "w_id"), all.x = TRUE)
# record the token IDs of the phrase's first token and of its root token
dt[!is.na(start_id), start_token_id := token_id[w_id == start_id][1],
by = .(doc_id, root_id)]
dt[!is.na(start_id), root_token_id := token_id[w_id == root_id][1],
by = .(doc_id, root_id)]
dt[, c("w_id", "start_id", "root_id") := NULL]
setnames(dt, c("text", "root_text", "length"),
c("nounphrase", "nounphrase_root_text", "nounphrase_length"))
}

dt <- as.data.frame(dt)
class(dt) <- c("spacyr_parsed", class(dt))
return(dt)
@@ -171,6 +192,11 @@ process_document <- function(x, multithread, ...) {
} else {
docnames <- paste0("text", 1:length(x))
}
if (any(duplicated(docnames))) {
stop("Docnames are duplicated.")
} else if (any(nchar(docnames) == 0L)) {
stop("Some docnames are missing.")
}

if (is.null(options()$spacy_initialized)) spacy_initialize()
spacyr_pyexec("try:\n del spobj\nexcept NameError:\n 1")
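
Tying the spacy_parse() changes together: with nounphrase = TRUE, tokens covered by a noun phrase gain extra columns alongside the usual token-level output. A hedged sketch based on the merge and setnames() logic above; the column names are taken from the code, but the call result is not shown here.

sp3 <- spacy_parse("We analyzed the Supreme Court using natural language processing.",
                   nounphrase = TRUE)

# new columns added by the nounphrase option:
#   nounphrase            - text of the covering noun phrase (NA for other tokens)
#   nounphrase_root_text  - text of the phrase's root token
#   nounphrase_length     - number of tokens in the phrase
#   start_token_id        - token_id of the phrase's first token
#   root_token_id         - token_id of the phrase's root token
names(sp3)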
