Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

normstrngs: add more hyphens and quotes #4195

Merged
merged 2 commits into from
Mar 5, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 20 additions & 9 deletions src/training/unicharset/normstrngs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,22 @@ namespace tesseract {
static bool is_hyphen_punc(const char32 ch) {
static const int kNumHyphenPuncUnicodes = 13;
static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
'-', 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar
0x207b, // superscript minus
0x208b, // subscript minus
0x2212, // minus sign
0xfe58, // small em dash
0xfe63, // small hyphen-minus
0xff0d, // fullwidth hyphen-minus
'-',
0x2010, // hyphen
0x2011, // non-breaking hyphen
0x2012, // figure dash
0x2013, // en dash
0x2014, // em dash
0x2015, // horizontal bar
// how about 0x2043 hyphen bullet?
// how about 0x2500 box drawings light horizontal?
0x207b, // superscript minus
0x208b, // subscript minus
0x2212, // minus sign
0xfe58, // small em dash
0xfe63, // small hyphen-minus
0xff0d, // fullwidth hyphen-minus
0x2e17 // double oblique hyphen (Fraktur)
};
for (int kHyphenPuncUnicode : kHyphenPuncUnicodes) {
if (kHyphenPuncUnicode == ch) {
Expand All @@ -61,10 +70,11 @@ static bool is_single_quote(const char32 ch) {
0x2018, // left single quotation mark (English, others)
0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
// We may have to introduce a comma set with 0x201a
0x201A, // single low-9 quotation mark (German)
0x201B, // single high-reversed-9 quotation mark (PropList.txt)
0x2032, // prime
0x300C, // left corner bracket (East Asian languages)
0xFF07, // fullwidth apostrophe
0xFF07 // fullwidth apostrophe
};
for (int kSingleQuoteUnicode : kSingleQuoteUnicodes) {
if (kSingleQuoteUnicode == ch) {
Expand All @@ -82,10 +92,11 @@ static bool is_double_quote(const char32 ch) {
0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
0x201F, // double high-reversed-9 quotation mark (PropList.txt)
0x2033, // double prime
0x201E, // double low-9 quotation mark (German)
0x301D, // reversed double prime quotation mark (East Asian langs,
// horiz.)
0x301E, // close double prime (East Asian languages written horizontally)
0xFF02, // fullwidth quotation mark
0xFF02 // fullwidth quotation mark
};
for (int kDoubleQuoteUnicode : kDoubleQuoteUnicodes) {
if (kDoubleQuoteUnicode == ch) {
Expand Down
Loading