From 319d8215f10cac30a60bad5e4213d505f24bde48 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
Date: Tue, 5 Mar 2024 12:53:00 +0100
Subject: [PATCH 1/2] normstrngs: add more hyphens and quotes

---
 src/training/unicharset/normstrngs.cpp | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/src/training/unicharset/normstrngs.cpp b/src/training/unicharset/normstrngs.cpp
index 026feefd2e..930073592e 100644
--- a/src/training/unicharset/normstrngs.cpp
+++ b/src/training/unicharset/normstrngs.cpp
@@ -38,13 +38,22 @@ namespace tesseract {
 static bool is_hyphen_punc(const char32 ch) {
-  static const int kNumHyphenPuncUnicodes = 13;
+  static const int kNumHyphenPuncUnicodes = 14; // must match the number of entries below
   static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
-      '-',    0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar
-      0x207b,                                                 // superscript minus
-      0x208b,                                                 // subscript minus
-      0x2212,                                                 // minus sign
-      0xfe58,                                                 // small em dash
-      0xfe63,                                                 // small hyphen-minus
-      0xff0d,                                                 // fullwidth hyphen-minus
+      '-',
+      0x2010, // hyphen
+      0x2011, // non-breaking hyphen
+      0x2012, // figure dash
+      0x2013, // en dash
+      0x2014, // em dash
+      0x2015, // horizontal bar
+      // how about 0x2043 hyphen bullet?
+      // how about 0x2500 box drawings light horizontal?
+      0x207b, // superscript minus
+      0x208b, // subscript minus
+      0x2212, // minus sign
+      0xfe58, // small em dash
+      0xfe63, // small hyphen-minus
+      0xff0d, // fullwidth hyphen-minus
+      0x2e17, // double oblique hyphen (Fraktur)
   };
   for (int kHyphenPuncUnicode : kHyphenPuncUnicodes) {
     if (kHyphenPuncUnicode == ch) {
@@ -61,6 +70,7 @@ static bool is_single_quote(const char32 ch) {
       0x2018, // left single quotation mark (English, others)
       0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
               // We may have to introduce a comma set with 0x201a
+      0x201A, // single low-9 quotation mark (German)
       0x201B, // single high-reversed-9 quotation mark (PropList.txt)
       0x2032, // prime
       0x300C, // left corner bracket (East Asian languages)
@@ -82,6 +92,7 @@ static bool is_double_quote(const char32 ch) {
       0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
       0x201F, // double high-reversed-9 quotation mark (PropList.txt)
       0x2033, // double prime
+      0x201E, // double low-9 quotation mark (German)
       0x301D, // reversed double prime quotation mark (East Asian langs,
               // horiz.)
       0x301E, // close double prime (East Asian languages written horizontally)

From 162714dbd8d9c34aedf47d55411023fbe894344c Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
Date: Tue, 5 Mar 2024 13:09:31 +0100
Subject: [PATCH 2/2] fix initializer syntax

---
 src/training/unicharset/normstrngs.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/training/unicharset/normstrngs.cpp b/src/training/unicharset/normstrngs.cpp
index 930073592e..22dbdaa437 100644
--- a/src/training/unicharset/normstrngs.cpp
+++ b/src/training/unicharset/normstrngs.cpp
@@ -53,7 +53,7 @@ static bool is_hyphen_punc(const char32 ch) {
       0xfe58, // small em dash
       0xfe63, // small hyphen-minus
       0xff0d, // fullwidth hyphen-minus
-      0x2e17, // double oblique hyphen (Fraktur)
+      0x2e17  // double oblique hyphen (Fraktur)
   };
   for (int kHyphenPuncUnicode : kHyphenPuncUnicodes) {
     if (kHyphenPuncUnicode == ch) {
@@ -74,7 +74,7 @@ static bool is_single_quote(const char32 ch) {
       0x201B, // single high-reversed-9 quotation mark (PropList.txt)
       0x2032, // prime
       0x300C, // left corner bracket (East Asian languages)
-      0xFF07, // fullwidth apostrophe
+      0xFF07  // fullwidth apostrophe
   };
   for (int kSingleQuoteUnicode : kSingleQuoteUnicodes) {
     if (kSingleQuoteUnicode == ch) {
@@ -96,7 +96,7 @@ static bool is_double_quote(const char32 ch) {
       0x301D, // reversed double prime quotation mark (East Asian langs,
               // horiz.)
       0x301E, // close double prime (East Asian languages written horizontally)
-      0xFF02, // fullwidth quotation mark
+      0xFF02  // fullwidth quotation mark
   };
   for (int kDoubleQuoteUnicode : kDoubleQuoteUnicodes) {
     if (kDoubleQuoteUnicode == ch) {