Merge pull request #135 from quanteda/revert-119-tokenize-function
Revert "Tokenize and noun-phrase extraction"
amatsuo authored Nov 12, 2018
2 parents 0e21059 + 0ca67c0 commit 86d2c03
Showing 17 changed files with 22 additions and 948 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -23,4 +23,4 @@ Suggests:
URL: http://github.com/quanteda/spacyr
Encoding: UTF-8
BugReports: https://github.com/quanteda/spacyr/issues
RoxygenNote: 6.1.1
RoxygenNote: 6.1.0
9 changes: 0 additions & 9 deletions NAMESPACE
@@ -2,38 +2,29 @@

S3method(entity_consolidate,spacyr_parsed)
S3method(entity_extract,spacyr_parsed)
S3method(spacy_extract_nounphrases,character)
S3method(spacy_extract_nounphrases,data.frame)
S3method(spacy_parse,character)
S3method(spacy_parse,data.frame)
S3method(spacy_tokenize,character)
S3method(spacy_tokenize,data.frame)
export(entity_consolidate)
export(entity_extract)
export(find_spacy)
export(find_spacy_env)
export(get_attrs)
export(get_dependency)
export(get_named_entities)
export(get_noun_phrases)
export(get_ntokens)
export(get_ntokens_by_sent)
export(get_tags)
export(get_tokens)
export(process_document)
export(spacy_download_langmodel)
export(spacy_download_langmodel_virtualenv)
export(spacy_extract_nounphrases)
export(spacy_finalize)
export(spacy_initialize)
export(spacy_install)
export(spacy_install_virtualenv)
export(spacy_parse)
export(spacy_tokenize)
export(spacy_uninstall)
export(spacy_upgrade)
importFrom(data.table,as.data.table)
importFrom(data.table,data.table)
importFrom(data.table,setDT)
importFrom(data.table,setnames)
importFrom(methods,new)
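
For orientation, the exports left in the NAMESPACE after this revert cover the package's core workflow. A minimal usage sketch of that surviving API, reusing the example text from the spacy_parse() documentation later in this diff:

library(spacyr)
spacy_initialize()

txt <- c(doc1 = "This is the first document.",
         doc2 = "This is the second document.")

# spacy_parse() returns a data.frame of class "spacyr_parsed"
parsed <- spacy_parse(txt, pos = TRUE, lemma = TRUE, entity = TRUE)

# entity_extract() and entity_consolidate() have S3 methods for that class
entity_extract(parsed)

spacy_finalize()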
28 changes: 0 additions & 28 deletions R/parse-extractor-functions.R
@@ -123,34 +123,6 @@ get_dependency <- function(spacy_out) {
return(list(head_id = head_id, dep_rel = dep_rel))
}


#' @rdname get-functions
#' @return \code{get_noun_phrases} returns a data.frame of noun phrases.
#' @export
#' @keywords internal
get_noun_phrases <- function(spacy_out) {
# get ids of head of each token
spacyr_pyassign("timestamps", spacy_out$timestamps)
spacyr_pyassign("docnames", spacy_out$docnames)
command_str <- paste("noun_phrases = spobj.extract_nounphrases_dataframe(timestamps = timestamps,",
"docnames = docnames,",
"multithread = False)")
spacyr_pyexec(command_str)
noun_phrases <- spacyr_pyget("noun_phrases")

doc_id <- names(noun_phrases)
data_out <-
data.table::rbindlist(lapply(doc_id, function(x) {
df <- as.data.frame(noun_phrases[[x]], stringsAsFactors = FALSE)
df$doc_id <- x
return(df)
}))
data_out[, start_id := start_id + 1][, root_id := root_id + 1]
data.table::setDF(data_out)
data_out <- data_out[, c(6, 1:5)]
return(data_out)
}

#' @rdname get-functions
#' @return \code{get_ntokens} returns a data.frame of dependency relations
#' @export
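
The deleted get_noun_phrases() collected one data.frame of noun phrases per document from the Python side, bound them with data.table::rbindlist(), and shifted the 0-based offsets coming from spaCy to R's 1-based indexing. A self-contained sketch of that post-processing pattern, with an invented stand-in for the Python output instead of a real spacy_out object:

library(data.table)

# hypothetical per-document result as the Python helper would hand it back,
# with 0-based token offsets
noun_phrases <- list(
  text1 = data.frame(text = "the Supreme Court", root_text = "Court",
                     start_id = 2, root_id = 4, length = 3,
                     stringsAsFactors = FALSE)
)

# bind the per-document frames and record which document each row came from
data_out <- rbindlist(lapply(names(noun_phrases), function(x) {
  df <- noun_phrases[[x]]
  df$doc_id <- x
  df
}))

# convert the 0-based offsets to R's 1-based indexing, as the deleted function did
data_out[, start_id := start_id + 1][, root_id := root_id + 1]
setDF(data_out)
data_out[, c("doc_id", "text", "root_text", "start_id", "root_id", "length")]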
130 changes: 0 additions & 130 deletions R/spacy_extract_nounphrases.R

This file was deleted.
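
The deleted file provided spacy_extract_nounphrases(), the user-facing wrapper for noun-phrase extraction. Its contents are not shown here, so the sketch below only illustrates how a call would plausibly have looked, using the example text from the spacy_parse() documentation in this diff; the exact signature is an assumption, not something this commit records:

spacy_initialize()

txt3 <- "We analyzed the Supreme Court using natural language processing."

# hypothetical call to the removed wrapper; judging from get_noun_phrases(),
# the result would carry doc_id, text, root_text, start_id, root_id and length
nps <- spacy_extract_nounphrases(txt3)

spacy_finalize()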

30 changes: 2 additions & 28 deletions R/spacy_parse.R
@@ -22,8 +22,7 @@
#' @param entity logical; if \code{TRUE}, report named entities
#' @param multithread logical; If true, the processing is parallelized using pipe
#' functionality of spacy (\url{https://spacy.io/api/pipe}).
#' @param dependency logical; if \code{TRUE}, analyze and return dependency tags
#' @param nounphrase logical; if \code{TRUE}, analyze and return noun phrases tags
#' @param dependency logical; if \code{TRUE}, analyze and return dependencies
#' @param ... not used directly
#' @return a \code{data.frame} of tokenized, parsed, and annotated tokens
#' @export
@@ -40,33 +39,28 @@
#' doc2 = "This is the second document.",
#' doc3 = "This is a \\\"quoted\\\" text." )
#' spacy_parse(txt2, entity = TRUE, dependency = TRUE)
#'
#' txt3 <- "We analyzed the Supreme Court using natural language processing."
#' sp3 <- spacy_parse(txt3, entity = TRUE, nounphrase = TRUE)
#' }
spacy_parse <- function(x,
pos = TRUE,
tag = FALSE,
lemma = TRUE,
entity = TRUE,
dependency = FALSE,
nounphrase = FALSE,
multithread = TRUE,
...) {
UseMethod("spacy_parse")
}


#' @export
#' @importFrom data.table data.table setDT setnames
#' @importFrom data.table data.table
#' @noRd
spacy_parse.character <- function(x,
pos = TRUE,
tag = FALSE,
lemma = TRUE,
entity = TRUE,
dependency = FALSE,
nounphrase = FALSE,
multithread = TRUE,
...) {

@@ -124,21 +118,6 @@ spacy_parse.character <- function(x,
dt[, entity := get_named_entities(spacy_out)]
}

## noun phrases
if (nounphrase) {
dt_nounphrases <- data.table::setDT(get_noun_phrases(spacy_out))
dt_nounphrases <- dt_nounphrases[rep(1:nrow(dt_nounphrases), times=length)]
dt_nounphrases[, w_id := seq(start_id[1], length.out = length[1]), by = .(doc_id, start_id)]
dt[, w_id := seq_len(.N), by = doc_id]
dt <- merge(dt, dt_nounphrases, by = c("doc_id", "w_id"), all.x = TRUE)
dt[ !is.na(start_id), start_token_id := token_id[w_id == start_id][1],
by = .(doc_id, root_id)]
dt[ !is.na(start_id), root_token_id := token_id[w_id == root_id][1],
by = .(doc_id, root_id)]
dt[, c("w_id", "start_id", "root_id") := NULL]
setnames(dt, c("text", "root_text", "length"), c("nounphrase", "nounphrase_root_text", "nounphrase_length"))
}

dt <- as.data.frame(dt)
class(dt) <- c("spacyr_parsed", class(dt))
return(dt)
@@ -192,11 +171,6 @@ process_document <- function(x, multithread, ...) {
} else {
docnames <- paste0("text", 1:length(x))
}
if(all(!duplicated(docnames)) == FALSE) {
stop("Docmanes are duplicated.")
} else if (all(nchar(docnames) > 0L) == FALSE) {
stop("Some docnames are missing.")
}

if (is.null(options()$spacy_initialized)) spacy_initialize()
spacyr_pyexec("try:\n del spobj\nexcept NameError:\n 1")
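
The largest block removed from spacy_parse.character() expanded each noun phrase to one row per covered token and merged the result back onto the token table by a per-document word id. A self-contained sketch of that expand-and-merge idiom, using invented toy tables rather than the package's actual objects:

library(data.table)

# toy token table, one row per token, standing in for spacy_parse() output
dt <- data.table(doc_id = "text1",
                 token = c("We", "analyzed", "the", "Supreme", "Court", "."),
                 token_id = 1:6)

# toy noun-phrase table: position of the first token and phrase length
dt_np <- data.table(doc_id = "text1",
                    nounphrase = c("We", "the Supreme Court"),
                    start_id = c(1L, 3L), length = c(1L, 3L))

# expand each phrase to one row per covered token ...
dt_np <- dt_np[rep(seq_len(nrow(dt_np)), times = length)]
# ... assign every expanded row its word id within the document ...
dt_np[, w_id := seq(start_id[1], length.out = length[1]), by = .(doc_id, start_id)]
# ... and left-join onto the token table by the same word id
dt[, w_id := seq_len(.N), by = doc_id]
merge(dt, dt_np, by = c("doc_id", "w_id"), all.x = TRUE)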