From 319d8215f10cac30a60bad5e4213d505f24bde48 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Tue, 5 Mar 2024 12:53:00 +0100 Subject: [PATCH 1/2] normstrngs: add more hyphens and quotes --- src/training/unicharset/normstrngs.cpp | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/training/unicharset/normstrngs.cpp b/src/training/unicharset/normstrngs.cpp index 026feefd2e..930073592e 100644 --- a/src/training/unicharset/normstrngs.cpp +++ b/src/training/unicharset/normstrngs.cpp @@ -38,13 +38,22 @@ namespace tesseract { static bool is_hyphen_punc(const char32 ch) { static const int kNumHyphenPuncUnicodes = 13; static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = { - '-', 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar - 0x207b, // superscript minus - 0x208b, // subscript minus - 0x2212, // minus sign - 0xfe58, // small em dash - 0xfe63, // small hyphen-minus - 0xff0d, // fullwidth hyphen-minus + '-', + 0x2010, // hyphen + 0x2011, // non-breaking hyphen + 0x2012, // figure dash + 0x2013, // en dash + 0x2014, // em dash + 0x2015, // horizontal bar + // how about 0x2043 hyphen bullet? + // how about 0x2500 box drawings light horizontal? + 0x207b, // superscript minus + 0x208b, // subscript minus + 0x2212, // minus sign + 0xfe58, // small em dash + 0xfe63, // small hyphen-minus + 0xff0d, // fullwidth hyphen-minus + 0x2e17, // double oblique hyphen (Fraktur) }; for (int kHyphenPuncUnicode : kHyphenPuncUnicodes) { if (kHyphenPuncUnicode == ch) { @@ -61,6 +70,7 @@ static bool is_single_quote(const char32 ch) { 0x2018, // left single quotation mark (English, others) 0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.) // We may have to introduce a comma set with 0x201a + 0x201A, // single low-9 quotation mark (German) 0x201B, // single high-reversed-9 quotation mark (PropList.txt) 0x2032, // prime 0x300C, // left corner bracket (East Asian languages) @@ -82,6 +92,7 @@ static bool is_double_quote(const char32 ch) { 0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.) 0x201F, // double high-reversed-9 quotation mark (PropList.txt) 0x2033, // double prime + 0x201E, // double low-9 quotation mark (German) 0x301D, // reversed double prime quotation mark (East Asian langs, // horiz.) 0x301E, // close double prime (East Asian languages written horizontally) From 162714dbd8d9c34aedf47d55411023fbe894344c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Tue, 5 Mar 2024 13:09:31 +0100 Subject: [PATCH 2/2] fix initializer syntax --- src/training/unicharset/normstrngs.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/training/unicharset/normstrngs.cpp b/src/training/unicharset/normstrngs.cpp index 930073592e..22dbdaa437 100644 --- a/src/training/unicharset/normstrngs.cpp +++ b/src/training/unicharset/normstrngs.cpp @@ -53,7 +53,7 @@ static bool is_hyphen_punc(const char32 ch) { 0xfe58, // small em dash 0xfe63, // small hyphen-minus 0xff0d, // fullwidth hyphen-minus - 0x2e17, // double oblique hyphen (Fraktur) + 0x2e17 // double oblique hyphen (Fraktur) }; for (int kHyphenPuncUnicode : kHyphenPuncUnicodes) { if (kHyphenPuncUnicode == ch) { @@ -74,7 +74,7 @@ static bool is_single_quote(const char32 ch) { 0x201B, // single high-reversed-9 quotation mark (PropList.txt) 0x2032, // prime 0x300C, // left corner bracket (East Asian languages) - 0xFF07, // fullwidth apostrophe + 0xFF07 // fullwidth apostrophe }; for (int kSingleQuoteUnicode : kSingleQuoteUnicodes) { if (kSingleQuoteUnicode == ch) { @@ -96,7 +96,7 @@ static bool is_double_quote(const char32 ch) { 0x301D, // reversed double prime quotation mark (East Asian langs, // horiz.) 0x301E, // close double prime (East Asian languages written horizontally) - 0xFF02, // fullwidth quotation mark + 0xFF02 // fullwidth quotation mark }; for (int kDoubleQuoteUnicode : kDoubleQuoteUnicodes) { if (kDoubleQuoteUnicode == ch) {