diff --git a/Project.toml b/Project.toml
index 7dd6206..b6f65e8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "WordCloud"
 uuid = "6385f0a0-cb03-45b6-9089-4e0acc74b26b"
 authors = ["guoyongzhi "]
-version = "1.2.2"
+version = "1.3.0"
 
 [deps]
 ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
diff --git a/src/textprocessing.jl b/src/textprocessing.jl
index 8f1c39c..d952689 100644
--- a/src/textprocessing.jl
+++ b/src/textprocessing.jl
@@ -47,12 +47,12 @@ function lemmatize!(d::AbstractDict, lemmatizer)
 end
 
 function tokenizer(text::AbstractString, regexp=r"\w+")
-    [text[i] for i in findall(regexp, text)]
+    (m.match for m in eachmatch(regexp, text))
 end
 
-function tokenizer_eng(text::AbstractString, regexp=r"\b\w+(?:'\w+)*\b")
-    indices = findall(regexp, text)
-    [endswith(text[i], "'s") ? text[i][1:prevind(text[i], end, 2)] : text[i] for i in indices]
+function tokenizer_eng(text::AbstractString, regexp=r"\b[\w']+\b")
+    ms = eachmatch(regexp, text)
+    (endswith(m.match, "'s") ? m.match[1:prevind(m.match, end, 2)] : m.match for m in ms)
 end
 
 # ISO 639-3 macrolanguages
@@ -98,10 +98,9 @@ Count words in text. And save results into `counter`.
 `text_or_counter` can be a String, a Vector of Strings, an opend file (IO) or a Dict.
 `regexp` is a regular expression to partially match and filter words. For example, `regexp=r"\S(?:[\s\S]*\S)?"` will trim whitespaces then eliminate empty words.
 """
-function countwords(words, counts; language=:auto,
+function countwords(words, counts; lemmatizer=:auto, language=:auto,
     regexp=r"(?:\S[\s\S]*)?[^0-9_\W](?:[\s\S]*\S)?", counter=Dict{String,Int}())
     # strip whitespace and filter out pure punctuation and number string
-    language = detect_language(words, language)
     for (w, c) in zip(words, counts)
         if regexp !== nothing
             m = match(regexp, w)
@@ -113,21 +112,26 @@ function countwords(words, counts; language=:auto,
             counter[w] = get(counter, w, 0) + c
         end
     end
-    lemmatizer_ = get(LEMMATIZERS, language, LEMMATIZERS["_default_"])
-    lemmatize!(counter, lemmatizer_)
+    if lemmatizer == :auto
+        language = detect_language(words, language)
+        lemmatizer = get(LEMMATIZERS, language, LEMMATIZERS["_default_"])
+    end
+    lemmatize!(counter, lemmatizer)
     counter
 end
-function countwords(text::AbstractString; language=:auto, kargs...)
-    language = detect_language(text, language)
-    if !haskey(TOKENIZERS, language)
-        @warn "No built-in tokenizer for $(language)!"
+function countwords(text::AbstractString; tokenizer=:auto, language=:auto, kargs...)
+    if tokenizer == :auto
+        language = detect_language(text, language)
+        if !haskey(TOKENIZERS, language)
+            @info "No dedicated built-in tokenizer for $(language); using basic tokenizer instead"
+        end
+        tokenizer = get(TOKENIZERS, language, TOKENIZERS["_default_"])
     end
-    tokenizer_ = get(TOKENIZERS, language, TOKENIZERS["_default_"])
-    countwords(tokenizer_(text); language=language, kargs...)
+    countwords(tokenizer(text); language=language, kargs...)
 end
-countwords(words::AbstractVector{<:AbstractString}; kargs...) = countwords(words, Iterators.repeated(1); kargs...)
 countwords(counter::AbstractDict{<:AbstractString,<:Real}; kargs...) = countwords(keys(counter), values(counter); kargs...)
 countwords(wordscounts::Tuple; kargs...) = countwords(wordscounts...; kargs...)
+countwords(words; kargs...) = countwords(words, Iterators.repeated(1); kargs...)
 function countwords(counter::AbstractVector{<:Union{Pair,Tuple,AbstractVector}}; kargs...)
     countwords(first.(counter), [v[2] for v in counter]; kargs...)
 end
@@ -234,7 +238,7 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
     language = detect_language(keys(counter), language)
     if !haskey(STOPWORDS, language)
-        @warn "No built-in stopwords for $(language)!"
+        @info "No built-in stopwords for $(language)!"
     end
     stopwords == :auto && (stopwords = get(STOPWORDS, language, nothing))
     stopwords === nothing && (stopwords = Set{String}())
@@ -277,7 +281,7 @@
 end
 function processtext(text; language=:auto, kargs...)
     language = detect_language(text, language)
-    cwkw = (:counter, :regexp)
+    cwkw = (:counter, :regexp, :tokenizer, :lemmatizer)
     processtext(
         countwords(text; language=language, filter(kw -> first(kw) ∈ cwkw, kargs)...);
         language=language,
diff --git a/test/test_textprocessing.jl b/test/test_textprocessing.jl
index 615c16a..87430cd 100644
--- a/test/test_textprocessing.jl
+++ b/test/test_textprocessing.jl
@@ -9,6 +9,19 @@
     words, weights = WordCloud.TextProcessing.processtext(c)
     @test !("to" in words) # stopwords
 
+    tokenizer_eng = WordCloud.TextProcessing.tokenizer_eng
+    tokenizer_default = WordCloud.TextProcessing.tokenizer
+    @test tokenizer_default(" a man の 书本\n 1234") .|> strip == ["a", "man", "の", "书本", "1234"]
+    @test tokenizer_eng(" a book in 1994\n") .|> strip == ["a", "book", "in", "1994"]
+    @test tokenizer_eng(" the 'best-book' in 1994\n") .|> strip == ["the", "best", "book", "in", "1994"]
+    @test tokenizer_eng("")|>collect == tokenizer_eng(" ")|>collect == tokenizer_eng(" ,")|>collect == []
+    @test tokenizer_eng(" a _int_var3") .|> strip == ["a", "_int_var3"]
+    @test tokenizer_eng("bob's book") .|> strip == ["bob", "book"]
+    @test tokenizer_eng("bob's 'book' 'book'") .|> strip == ["bob", "book", "book"]
+    @test tokenizer_eng("abc'de fg'h'ij k'l") .|> strip == ["abc'de", "fg'h'ij", "k'l"]
+    @test tokenizer_eng("abc'de', fg'h'ij' k'l'") .|> strip == ["abc'de", "fg'h'ij", "k'l"]
+    @test tokenizer_eng(" abc'de'. fg'h'ij',k'l'") .|> strip == ["abc'de", "fg'h'ij", "k'l"]
+
     lemmatizer_eng = WordCloud.TextProcessing.lemmatizer_eng
     lemmatize! = WordCloud.TextProcessing.lemmatize!
     @test lemmatizer_eng("Cars") == "Car"
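
---
With this change, `countwords` and `processtext` gain `tokenizer` and `lemmatizer` keyword arguments alongside `language`, and the built-in tokenizers now return lazy generators instead of arrays. A minimal usage sketch of the new API, assuming the package is installed; the sample text and the custom callables below are illustrative, not code from this patch:

    using WordCloud

    text = "The cats sat on the mats reading books"

    # Default behavior: the language is auto-detected, then the matching
    # built-in tokenizer/lemmatizer is looked up, falling back to "_default_".
    counter = WordCloud.TextProcessing.countwords(text)

    # New: pass custom callables, which bypasses language detection.
    # A tokenizer maps text to an iterable of words; a lemmatizer maps one
    # word to its normalized form (cf. lemmatizer_eng("Cars") == "Car").
    counter = WordCloud.TextProcessing.countwords(text;
        tokenizer = t -> split(t),
        lemmatizer = w -> endswith(w, "s") ? w[1:prevind(w, end)] : w)

    # processtext forwards both keywords to countwords (via cwkw).
    words, weights = WordCloud.TextProcessing.processtext(text;
        tokenizer = t -> split(t))

Because `detect_language` now only runs when the corresponding keyword is left at `:auto`, supplying both callables skips detection entirely.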