Skip to content

Commit

Permalink
feat(textprocessing): avoid unnecessary language detection
Browse files Browse the repository at this point in the history
  • Loading branch information
guo-yong-zhi committed Oct 16, 2024
1 parent 54c8687 commit 32fb2df
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 16 deletions.
33 changes: 20 additions & 13 deletions src/textprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ Count words in text. And save results into `counter`.
`regexp` is a regular expression to partially match and filter words. For example, `regexp=r"\S(?:[\s\S]*\S)?"` will trim whitespaces then eliminate empty words.
"""
function countwords(words, counts; lemmatizer=:auto, language=:auto,
regexp=r"(?:\S[\s\S]*)?[^0-9_\W](?:[\s\S]*\S)?", counter=Dict{String,Int}())
regexp=r"(?:\S[\s\S]*)?[^0-9_\W](?:[\s\S]*\S)?", counter=Dict{String,Int}(), return_language=false)
# strip whitespace and filter out pure punctuation and number string
for (w, c) in zip(words, counts)
if regexp !== nothing
Expand All @@ -117,7 +117,7 @@ function countwords(words, counts; lemmatizer=:auto, language=:auto,
lemmatizer = get(LEMMATIZERS, language, LEMMATIZERS["_default_"])
end
lemmatize!(counter, lemmatizer)
counter
return_language ? (counter, language) : counter
end
function countwords(text::AbstractString; tokenizer=:auto, language=:auto, kargs...)
if tokenizer == :auto
Expand All @@ -135,11 +135,14 @@ countwords(words; kargs...) = countwords(words, Iterators.repeated(1); kargs...)
function countwords(counter::AbstractVector{<:Union{Pair,Tuple,AbstractVector}}; kargs...)
countwords(first.(counter), [v[2] for v in counter]; kargs...)
end
function countwords(textfile::IO; counter=Dict{String,Int}(), kargs...)
function countwords(textfile::IO; counter=Dict{String,Int}(), lemmatizer=:auto, tokenizer=:auto, language=:auto, return_language=false, kargs...)
if lemmatizer == :auto || tokenizer == :auto # detect language globally
language = detect_language(textfile, language)
end
for l in eachline(textfile)
countwords(l; counter=counter, kargs...)
countwords(l; counter=counter, language=language, return_language=false, kargs...)
end
counter
return_language ? (counter, language) : counter
end

function casemerge!(d)
Expand Down Expand Up @@ -234,16 +237,20 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
minfrequency=0,
maxnum=500,
minweight=:auto, maxweight=:auto,
process=rescaleweights(identity, 0) ∘ casemerge!)
process=rescaleweights(identity, 0) ∘ casemerge!,
return_language=false)

language = detect_language(keys(counter), language)
if !haskey(STOPWORDS, language)
@info "No built-in stopwords for $(language)!"
if stopwords == :auto
language = detect_language(keys(counter), language)
if !haskey(STOPWORDS, language)
@info "No built-in stopwords for $(language)!"
end
stopwords = get(STOPWORDS, language, nothing)
end
stopwords == :auto && (stopwords = get(STOPWORDS, language, nothing))
stopwords === nothing && (stopwords = Set{String}())
stopwords isa AbstractSet || (stopwords = Set(stopwords))
stopwords_extra === nothing || (stopwords = stopwords ∪ stopwords_extra)

counter = process(counter)
print("Total words: $(round(sum(values(counter)), digits=2)). ")
print("Unique words: $(length(counter)). ")
Expand Down Expand Up @@ -276,14 +283,14 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
print("The weights of the biggest $nhuge words have been reduced.")
end
print("\n")
words, weights
return_language ? ((words, weights), language) : (words, weights)
end

function processtext(text; language=:auto, kargs...)
language = detect_language(text, language)
cwkw = (:counter, :regexp, :tokenizer, :lemmatizer)
counter, language = countwords(text; language=language, filter(kw -> first(kw) ∈ cwkw, kargs)..., return_language=true)
processtext(
countwords(text; language=language, filter(kw -> first(kw) ∈ cwkw, kargs)...);
counter;
language=language,
filter(kw -> first(kw) ∉ cwkw, kargs)...)
end
Expand Down
4 changes: 2 additions & 2 deletions src/wc-class.jl
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ wordcloud(wordsweights::Tuple; kargs...) = wordcloud(wordsweights...; kargs...)
wordcloud(counter::AbstractDict; kargs...) = wordcloud(keys(counter) |> collect, values(counter) |> collect; kargs...)
wordcloud(counter::AbstractVector{<:Union{Pair,Tuple,AbstractVector}}; kargs...) = wordcloud(first.(counter), [v[2] for v in counter]; kargs...)
function wordcloud(text; language=:auto, stopwords=:auto, stopwords_extra=nothing, maxnum=500, kargs...)
language = detect_language(text, language)
wordcloud(processtext(text, language=language, stopwords=stopwords, stopwords_extra=stopwords_extra, maxnum=maxnum); language=language, kargs...)
words_weights, language = processtext(text, language=language, stopwords=stopwords, stopwords_extra=stopwords_extra, maxnum=maxnum, return_language=true)
wordcloud(words_weights; language=language, kargs...)
end
wordcloud(words, weight::Number; kargs...) = wordcloud(words, repeat([weight], length(words)); kargs...)
function wordcloud(words::AbstractVector{<:AbstractString}, weights::AbstractVector{<:Real};
Expand Down
2 changes: 1 addition & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ include("test_textprocessing.jl")
paint(wc, "test.jpg", background=outline(wc.mask, color=(1, 0, 0.2, 0.7), linewidth=2), ratio=0.5)
paint(wc, "test.svg", background=WordCloud.tobitmap(wc.mask))
paint(wc, "test.svg")
paintsvgcloud("holly bible", "test.svg")
paintsvgcloud("holly bible", "test.svg", quiet=false)
paintcloud("holly bible", angles=(0, 90), ratio=0.5)
show(wc)
@test getparameter(wc, :volume) == WordCloud.occupancy(WordCloud.QTrees.kernel(wc.maskqtree[1]), WordCloud.QTrees.FULL)
Expand Down

0 comments on commit 32fb2df

Please sign in to comment.