Skip to content

Commit

Permalink
fix(textprocessing): improve English tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
guo-yong-zhi committed Oct 10, 2024
1 parent e57c419 commit 43ec72f
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion src/textprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ function tokenizer(text::AbstractString, regexp=r"\w+")
[text[i] for i in findall(regexp, text)]
end

function tokenizer_eng(text::AbstractString, regexp=r"\w[\w']*")
"""
    tokenizer_eng(text::AbstractString, regexp=r"\\b\\w+(?:'\\w+)*\\b")

Tokenize English `text` with `regexp` (default matches words, optionally with
internal apostrophes, e.g. `isn't`). A trailing possessive `'s` is stripped
from each token (`John's` → `John`); other apostrophe forms are kept as-is.
Return the tokens in order of appearance.
"""
function tokenizer_eng(text::AbstractString, regexp=r"\b\w+(?:'\w+)*\b")
    return map(findall(regexp, text)) do range
        token = text[range]
        if endswith(token, "'s")
            # prevind steps back two characters (not bytes), so this is
            # safe even when the token contains multi-byte characters.
            token[1:prevind(token, end, 2)]
        else
            token
        end
    end
end
Expand Down

0 comments on commit 43ec72f

Please sign in to comment.