From 88c3491df15ba548f42dc51813867bdbbebc16ba Mon Sep 17 00:00:00 2001 From: Sandro Mani Date: Thu, 25 Jul 2019 22:59:52 +0200 Subject: [PATCH] Allow specifying ocr character whitelist and blacklist (tessedit_char_whitelist and tessedit_char_blacklist) --- gtk/data/MainWindow.ui | 117 ++++++++++++++++++ gtk/data/org.gnome.gimagereader.gschema.xml | 21 ++++ gtk/src/Recognizer.cc | 21 ++++ gtk/src/Recognizer.hh | 1 + qt/data/CharacterListDialog.ui | 124 ++++++++++++++++++++ qt/src/Recognizer.cc | 20 ++++ qt/src/Recognizer.hh | 4 + 7 files changed, 308 insertions(+) create mode 100644 qt/data/CharacterListDialog.ui diff --git a/gtk/data/MainWindow.ui b/gtk/data/MainWindow.ui index b5d25245..8185fe8f 100644 --- a/gtk/data/MainWindow.ui +++ b/gtk/data/MainWindow.ui @@ -1390,6 +1390,123 @@ + + False + Character whitelist / blacklist + False + center-on-parent + True + dialog + True + True + windowMain + + + + + + False + vertical + 2 + + + False + end + + + gtk-close + True + True + True + True + + + True + True + 0 + + + + + False + False + 0 + + + + + True + False + vertical + + + Exclude these characters from being recognized (blacklist): + True + True + False + True + True + radioButtonWhitelist + + + False + True + 0 + + + + + True + True + Enter a string of characters... + + + False + True + 1 + + + + + Only allow the following characters to be recognized (whitelist): + True + True + False + True + True + radioButtonBlacklist + + + False + True + 2 + + + + + True + True + Enter a string of characters... + + + False + True + 3 + + + + + False + True + 1 + + + + + + buttonClose + + False 5 diff --git a/gtk/data/org.gnome.gimagereader.gschema.xml b/gtk/data/org.gnome.gimagereader.gschema.xml index 207e83c1..45846f2c 100644 --- a/gtk/data/org.gnome.gimagereader.gschema.xml +++ b/gtk/data/org.gnome.gimagereader.gschema.xml @@ -286,5 +286,26 @@ PDF export version PDF export version. + + + true + OCR character blacklist enabled + OCR character blacklist enabled. 
+ + + false + OCR character whitelist enabled + OCR character whitelist enabled. + + + "" + OCR character whitelist + OCR character whitelist. + + + "" + OCR character blacklist + OCR character blacklist. + diff --git a/gtk/src/Recognizer.cc b/gtk/src/Recognizer.cc index 13af6400..69fa9111 100644 --- a/gtk/src/Recognizer.cc +++ b/gtk/src/Recognizer.cc @@ -106,11 +106,17 @@ Recognizer::Recognizer(const Ui::MainWindow& _ui) Utils::clear_error_state(ui.entryPageRange); return false; }); + CONNECT(ui.radioButtonBlacklist, toggled, [this] { ui.entryBlacklist->set_sensitive(ui.radioButtonBlacklist->get_active()); }); + CONNECT(ui.radioButtonWhitelist, toggled, [this] { ui.entryWhitelist->set_sensitive(ui.radioButtonWhitelist->get_active()); }); ADD_SETTING(VarSetting("language")); ADD_SETTING(ComboSetting("ocrregionstrategy", ui.comboPageRangeRegions)); ADD_SETTING(SwitchSettingT("ocraddsourcefilename", ui.checkPageRangePrependFile)); ADD_SETTING(SwitchSettingT("ocraddsourcepage", ui.checkPageRangePrependPage)); + ADD_SETTING(EntrySetting("ocrcharwhitelist", ui.entryWhitelist)); + ADD_SETTING(EntrySetting("ocrcharblacklist", ui.entryBlacklist)); + ADD_SETTING(SwitchSettingT("ocrblacklistenabled", ui.radioButtonBlacklist)); + ADD_SETTING(SwitchSettingT("ocrwhitelistenabled", ui.radioButtonWhitelist)); ADD_SETTING(VarSetting("psm")); } @@ -304,6 +310,10 @@ void Recognizer::updateLanguagesMenu() { psmItem->set_submenu(*psmMenu); ui.menuLanguages->append(*psmItem); + Gtk::MenuItem* charlistItem = Gtk::manage(new Gtk::MenuItem(_("Character whitelist / blacklist..."))); + CONNECT(charlistItem, activate, [this] { manageCharaterLists(); }); + ui.menuLanguages->append(*charlistItem); + // Add installer item ui.menuLanguages->append(*Gtk::manage(new Gtk::SeparatorMenuItem())); Gtk::MenuItem* manageItem = Gtk::manage(new Gtk::MenuItem(_("Manage languages..."))); @@ -403,6 +413,11 @@ std::vector Recognizer::selectPages(bool& autodetectLayout) { return pages; } +void
Recognizer::manageCharaterLists() { + ui.dialogCharacterLists->run(); + ui.dialogCharacterLists->hide(); +} + void Recognizer::recognizeButtonClicked() { int nPages = MAIN->getDisplayer()->getNPages(); if(nPages == 1) { @@ -431,6 +446,12 @@ void Recognizer::recognize(const std::vector& pages, bool autodetectLayout) if(ok) { Glib::ustring failed; tess.SetPageSegMode(static_cast(m_currentPsmMode)); + if(ui.radioButtonWhitelist->get_active()) { + tess.SetVariable("tessedit_char_whitelist", ui.entryWhitelist->get_text().c_str()); + } + if(ui.radioButtonBlacklist->get_active()) { + tess.SetVariable("tessedit_char_blacklist", ui.entryBlacklist->get_text().c_str()); + } OutputEditor::ReadSessionData* readSessionData = MAIN->getOutputEditor()->initRead(tess); ProgressMonitor monitor(pages.size()); MAIN->showProgress(&monitor); diff --git a/gtk/src/Recognizer.hh b/gtk/src/Recognizer.hh index d68c6c04..72f0fff0 100644 --- a/gtk/src/Recognizer.hh +++ b/gtk/src/Recognizer.hh @@ -75,6 +75,7 @@ private: sigc::signal m_signal_languageChanged; tesseract::TessBaseAPI initTesseract(const char* language = nullptr, bool* ok = nullptr) const; + void manageCharaterLists(); void recognizeButtonClicked(); void recognizeCurrentPage(); void recognizeMultiplePages(); diff --git a/qt/data/CharacterListDialog.ui b/qt/data/CharacterListDialog.ui new file mode 100644 index 00000000..f7cde6a3 --- /dev/null +++ b/qt/data/CharacterListDialog.ui @@ -0,0 +1,124 @@ + + + CharacterListDialog + + + + 0 + 0 + 427 + 160 + + + + Character whitelist / blacklist + + + + + + Exclude these characters from being recognized (blacklist): + + + buttonGroup + + + + + + + false + + + Enter a string of characters... + + + + + + + Only allow the following characters to be recognized (whitelist): + + + buttonGroup + + + + + + + false + + + Enter a string of characters... 
+ + + + + + + Qt::Vertical + + + + 20 + 40 + + + + + + + + Qt::Horizontal + + + QDialogButtonBox::Close + + + + + + + + + buttonBox + accepted() + CharacterListDialog + accept() + + + 248 + 254 + + + 157 + 274 + + + + + buttonBox + rejected() + CharacterListDialog + reject() + + + 316 + 260 + + + 286 + 274 + + + + + + + + true + + + + diff --git a/qt/src/Recognizer.cc b/qt/src/Recognizer.cc index 28855085..73e16048 100644 --- a/qt/src/Recognizer.cc +++ b/qt/src/Recognizer.cc @@ -86,6 +86,9 @@ Recognizer::Recognizer(const UI_MainWindow& _ui) : m_pagesDialog = new QDialog(MAIN); m_pagesDialogUi.setupUi(m_pagesDialog); + m_charListDialog = new QDialog(MAIN); + m_charListDialogUi.setupUi(m_charListDialog); + ui.toolButtonRecognize->setText(QString("%1\n%2").arg(m_modeLabel).arg(m_langLabel)); ui.menuLanguages->installEventFilter(this); @@ -93,11 +96,17 @@ Recognizer::Recognizer(const UI_MainWindow& _ui) : connect(currentPageAction, SIGNAL(triggered()), this, SLOT(recognizeCurrentPage())); connect(multiplePagesAction, SIGNAL(triggered()), this, SLOT(recognizeMultiplePages())); connect(m_pagesDialogUi.lineEditPageRange, SIGNAL(textChanged(QString)), this, SLOT(clearLineEditPageRangeStyle())); + connect(m_charListDialogUi.radioButtonBlacklist, SIGNAL(toggled(bool)), m_charListDialogUi.lineEditBlacklist, SLOT(setEnabled(bool))); + connect(m_charListDialogUi.radioButtonWhitelist, SIGNAL(toggled(bool)), m_charListDialogUi.lineEditWhitelist, SLOT(setEnabled(bool))); ADD_SETTING(VarSetting("language", "eng:en_EN")); ADD_SETTING(ComboSetting("ocrregionstrategy", m_pagesDialogUi.comboBoxRecognitionArea, 0)); ADD_SETTING(SwitchSetting("ocraddsourcefilename", m_pagesDialogUi.checkBoxPrependFilename)); ADD_SETTING(SwitchSetting("ocraddsourcepage", m_pagesDialogUi.checkBoxPrependPage)); + ADD_SETTING(LineEditSetting("ocrcharwhitelist", m_charListDialogUi.lineEditWhitelist)); + ADD_SETTING(LineEditSetting("ocrcharblacklist", m_charListDialogUi.lineEditBlacklist)); + 
ADD_SETTING(SwitchSetting("ocrblacklistenabled", m_charListDialogUi.radioButtonBlacklist, true)); + ADD_SETTING(SwitchSetting("ocrwhitelistenabled", m_charListDialogUi.radioButtonWhitelist, false)); ADD_SETTING(VarSetting("psm", 6)); } @@ -288,6 +297,7 @@ void Recognizer::updateLanguagesMenu() { QAction* psmAction = new QAction(_("Page segmentation mode"), ui.menuLanguages); psmAction->setMenu(psmMenu); ui.menuLanguages->addAction(psmAction); + ui.menuLanguages->addAction(_("Character whitelist / blacklist..."), this, SLOT(manageCharacterLists())); // Add installer item @@ -343,6 +353,10 @@ void Recognizer::psmSelected(QAction* action) { ConfigSettings::get>("psm")->setValue(action->data().toInt()); } +void Recognizer::manageCharacterLists() { + m_charListDialog->exec(); +} + QList Recognizer::selectPages(bool& autodetectLayout) { int nPages = MAIN->getDisplayer()->getNPages(); @@ -422,6 +436,12 @@ void Recognizer::recognize(const QList& pages, bool autodetectLayout) { if(ok) { QString failed; tess.SetPageSegMode(static_cast(m_psmCheckGroup->checkedAction()->data().toInt())); + if(m_charListDialogUi.radioButtonWhitelist->isChecked()) { /* Tesseract variables are UTF-8 encoded, so do not use the locale codec here */ + tess.SetVariable("tessedit_char_whitelist", m_charListDialogUi.lineEditWhitelist->text().toUtf8()); + } + if(m_charListDialogUi.radioButtonBlacklist->isChecked()) { /* same UTF-8 requirement as the whitelist */ + tess.SetVariable("tessedit_char_blacklist", m_charListDialogUi.lineEditBlacklist->text().toUtf8()); + } OutputEditor::ReadSessionData* readSessionData = MAIN->getOutputEditor()->initRead(tess); ProgressMonitor monitor(pages.size()); MAIN->showProgress(&monitor); diff --git a/qt/src/Recognizer.hh b/qt/src/Recognizer.hh index 34449ed5..28a3c836 100644 --- a/qt/src/Recognizer.hh +++ b/qt/src/Recognizer.hh @@ -25,6 +25,7 @@ #include "Config.hh" #include "Displayer.hh" #include "ui_PageRangeDialog.h" +#include "ui_CharacterListDialog.h" namespace tesseract { class TessBaseAPI; @@ -68,6 +69,8 @@ private: QMenu* m_menuMultilanguage = nullptr; QDialog*
m_pagesDialog; Ui::PageRangeDialog m_pagesDialogUi; + QDialog* m_charListDialog; + Ui::CharacterListDialog m_charListDialogUi; QActionGroup* m_langMenuRadioGroup = nullptr; QActionGroup* m_langMenuCheckGroup = nullptr; QActionGroup* m_psmCheckGroup = nullptr; @@ -83,6 +86,7 @@ private: private slots: void clearLineEditPageRangeStyle(); + void manageCharacterLists(); void psmSelected(QAction* action); void recognizeButtonClicked(); void recognizeCurrentPage();