Skip to content

Commit

Permalink
Allow specifying ocr character whitelist and blacklist (tessedit_char…
Browse files Browse the repository at this point in the history
…_whitelist and tessedit_char_blacklist)
  • Loading branch information
manisandro committed Jul 25, 2019
1 parent 84efb17 commit 88c3491
Show file tree
Hide file tree
Showing 7 changed files with 308 additions and 0 deletions.
117 changes: 117 additions & 0 deletions gtk/data/MainWindow.ui
Original file line number Diff line number Diff line change
Expand Up @@ -1390,6 +1390,123 @@
</object>
</child>
</object>
<object class="GtkDialog" id="dialogCharacterLists">
<property name="can_focus">False</property>
<property name="title" translatable="yes">Character whitelist / blacklist</property>
<property name="resizable">False</property>
<property name="window_position">center-on-parent</property>
<property name="destroy_with_parent">True</property>
<property name="type_hint">dialog</property>
<property name="skip_taskbar_hint">True</property>
<property name="skip_pager_hint">True</property>
<property name="transient_for">windowMain</property>
<child>
<placeholder/>
</child>
<child internal-child="vbox">
<object class="GtkBox">
<property name="can_focus">False</property>
<property name="orientation">vertical</property>
<property name="spacing">2</property>
<child internal-child="action_area">
<object class="GtkButtonBox">
<property name="can_focus">False</property>
<property name="layout_style">end</property>
<child>
<object class="GtkButton" id="buttonClose">
<property name="label">gtk-close</property>
<property name="visible">True</property>
<property name="can_focus">True</property>
<property name="receives_default">True</property>
<property name="use_stock">True</property>
</object>
<packing>
<property name="expand">True</property>
<property name="fill">True</property>
<property name="position">0</property>
</packing>
</child>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">False</property>
<property name="position">0</property>
</packing>
</child>
<child>
<object class="GtkBox">
<property name="visible">True</property>
<property name="can_focus">False</property>
<property name="orientation">vertical</property>
<child>
<object class="GtkRadioButton" id="radioButtonBlacklist">
<property name="label" translatable="yes">Exclude these characters from being recognized (blacklist):</property>
<property name="visible">True</property>
<property name="can_focus">True</property>
<property name="receives_default">False</property>
<property name="active">True</property>
<property name="draw_indicator">True</property>
<property name="group">radioButtonWhitelist</property>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">0</property>
</packing>
</child>
<child>
<object class="GtkEntry" id="entryBlacklist">
<property name="visible">True</property>
<property name="can_focus">True</property>
<property name="placeholder_text" translatable="yes">Enter a string of characters...</property>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">1</property>
</packing>
</child>
<child>
<object class="GtkRadioButton" id="radioButtonWhitelist">
<property name="label" translatable="yes">Only allow the following characters to be recognized (whitelist):</property>
<property name="visible">True</property>
<property name="can_focus">True</property>
<property name="receives_default">False</property>
<property name="active">True</property>
<property name="draw_indicator">True</property>
<property name="group">radioButtonBlacklist</property>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">2</property>
</packing>
</child>
<child>
<object class="GtkEntry" id="entryWhitelist">
<property name="visible">True</property>
<property name="can_focus">True</property>
<property name="placeholder_text" translatable="yes">Enter a string of characters...</property>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">3</property>
</packing>
</child>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">1</property>
</packing>
</child>
</object>
</child>
<action-widgets>
<action-widget response="-5">buttonClose</action-widget>
</action-widgets>
</object>
<object class="GtkDialog" id="dialogPageRange">
<property name="can_focus">False</property>
<property name="border_width">5</property>
Expand Down
21 changes: 21 additions & 0 deletions gtk/data/org.gnome.gimagereader.gschema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -286,5 +286,26 @@
<summary>PDF export version</summary>
<description>PDF export version.</description>
</key>

<key type="b" name="ocrblacklistenabled">
<default>true</default>
<summary>OCR character blacklist enabled</summary>
<description>OCR character blacklist enabled.</description>
</key>
<key type="b" name="ocrwhitelistenabled">
<default>false</default>
<summary>OCR character whitelist enabled</summary>
<description>OCR character whitelist enabled.</description>
</key>
<key type="s" name="ocrcharblacklist">
<default>""</default>
<summary>OCR character blacklist</summary>
<description>OCR character blacklist.</description>
</key>
<key type="s" name="ocrcharwhitelist">
<default>""</default>
<summary>OCR character whitelist</summary>
<description>OCR character whitelist.</description>
</key>
</schema>
</schemalist>
21 changes: 21 additions & 0 deletions gtk/src/Recognizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,17 @@ Recognizer::Recognizer(const Ui::MainWindow& _ui)
Utils::clear_error_state(ui.entryPageRange);
return false;
});
CONNECT(ui.radioButtonBlacklist, toggled, [this] { ui.entryBlacklist->set_sensitive(ui.radioButtonBlacklist->get_active()); });
CONNECT(ui.radioButtonWhitelist, toggled, [this] { ui.entryWhitelist->set_sensitive(ui.radioButtonWhitelist->get_active()); });

ADD_SETTING(VarSetting<Glib::ustring>("language"));
ADD_SETTING(ComboSetting("ocrregionstrategy", ui.comboPageRangeRegions));
ADD_SETTING(SwitchSettingT<Gtk::CheckButton>("ocraddsourcefilename", ui.checkPageRangePrependFile));
ADD_SETTING(SwitchSettingT<Gtk::CheckButton>("ocraddsourcepage", ui.checkPageRangePrependPage));
ADD_SETTING(EntrySetting("ocrcharwhitelist", ui.entryWhitelist));
ADD_SETTING(EntrySetting("ocrcharblacklist", ui.entryBlacklist));
ADD_SETTING(SwitchSettingT<Gtk::RadioButton>("ocrblacklistenabled", ui.radioButtonBlacklist));
ADD_SETTING(SwitchSettingT<Gtk::RadioButton>("ocrwhitelistenabled", ui.radioButtonWhitelist));
ADD_SETTING(VarSetting<int>("psm"));
}

Expand Down Expand Up @@ -304,6 +310,10 @@ void Recognizer::updateLanguagesMenu() {
psmItem->set_submenu(*psmMenu);
ui.menuLanguages->append(*psmItem);

Gtk::MenuItem* charlistItem = Gtk::manage(new Gtk::MenuItem(_("Character whitelist / blacklist...")));
CONNECT(charlistItem, activate, [this] { manageCharaterLists(); });
ui.menuLanguages->append(*charlistItem);

// Add installer item
ui.menuLanguages->append(*Gtk::manage(new Gtk::SeparatorMenuItem()));
Gtk::MenuItem* manageItem = Gtk::manage(new Gtk::MenuItem(_("Manage languages...")));
Expand Down Expand Up @@ -403,6 +413,11 @@ std::vector<int> Recognizer::selectPages(bool& autodetectLayout) {
return pages;
}

/**
 * Shows the character whitelist / blacklist dialog and hides it again as
 * soon as its run() call returns. The value returned by run() is not used.
 */
void Recognizer::manageCharaterLists() {
	// Bind the dialog once so both calls visibly target the same widget.
	auto& charListDialog = *ui.dialogCharacterLists;
	charListDialog.run();
	charListDialog.hide();
}

void Recognizer::recognizeButtonClicked() {
int nPages = MAIN->getDisplayer()->getNPages();
if(nPages == 1) {
Expand Down Expand Up @@ -431,6 +446,12 @@ void Recognizer::recognize(const std::vector<int>& pages, bool autodetectLayout)
if(ok) {
Glib::ustring failed;
tess.SetPageSegMode(static_cast<tesseract::PageSegMode>(m_currentPsmMode));
if(ui.radioButtonWhitelist->get_active()) {
tess.SetVariable("tessedit_char_whitelist", ui.entryWhitelist->get_text().c_str());
}
if(ui.radioButtonBlacklist->get_active()) {
tess.SetVariable("tessedit_char_blacklist", ui.entryBlacklist->get_text().c_str());
}
OutputEditor::ReadSessionData* readSessionData = MAIN->getOutputEditor()->initRead(tess);
ProgressMonitor monitor(pages.size());
MAIN->showProgress(&monitor);
Expand Down
1 change: 1 addition & 0 deletions gtk/src/Recognizer.hh
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ private:
sigc::signal<void, Config::Lang> m_signal_languageChanged;

tesseract::TessBaseAPI initTesseract(const char* language = nullptr, bool* ok = nullptr) const;
void manageCharaterLists();
void recognizeButtonClicked();
void recognizeCurrentPage();
void recognizeMultiplePages();
Expand Down
124 changes: 124 additions & 0 deletions qt/data/CharacterListDialog.ui
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>CharacterListDialog</class>
<widget class="QDialog" name="CharacterListDialog">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>427</width>
<height>160</height>
</rect>
</property>
<property name="windowTitle">
<string>Character whitelist / blacklist</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QRadioButton" name="radioButtonBlacklist">
<property name="text">
<string>Exclude these characters from being recognized (blacklist):</string>
</property>
<attribute name="buttonGroup">
<string notr="true">buttonGroup</string>
</attribute>
</widget>
</item>
<item>
<widget class="QLineEdit" name="lineEditBlacklist">
<property name="enabled">
<bool>false</bool>
</property>
<property name="placeholderText">
<string>Enter a string of characters...</string>
</property>
</widget>
</item>
<item>
<widget class="QRadioButton" name="radioButtonWhitelist">
<property name="text">
<string>Only allow the following characters to be recognized (whitelist):</string>
</property>
<attribute name="buttonGroup">
<string notr="true">buttonGroup</string>
</attribute>
</widget>
</item>
<item>
<widget class="QLineEdit" name="lineEditWhitelist">
<property name="enabled">
<bool>false</bool>
</property>
<property name="placeholderText">
<string>Enter a string of characters...</string>
</property>
</widget>
</item>
<item>
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>40</height>
</size>
</property>
</spacer>
</item>
<item>
<widget class="QDialogButtonBox" name="buttonBox">
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
<property name="standardButtons">
<set>QDialogButtonBox::Close</set>
</property>
</widget>
</item>
</layout>
</widget>
<resources/>
<connections>
<connection>
<sender>buttonBox</sender>
<signal>accepted()</signal>
<receiver>CharacterListDialog</receiver>
<slot>accept()</slot>
<hints>
<hint type="sourcelabel">
<x>248</x>
<y>254</y>
</hint>
<hint type="destinationlabel">
<x>157</x>
<y>274</y>
</hint>
</hints>
</connection>
<connection>
<sender>buttonBox</sender>
<signal>rejected()</signal>
<receiver>CharacterListDialog</receiver>
<slot>reject()</slot>
<hints>
<hint type="sourcelabel">
<x>316</x>
<y>260</y>
</hint>
<hint type="destinationlabel">
<x>286</x>
<y>274</y>
</hint>
</hints>
</connection>
</connections>
<buttongroups>
<buttongroup name="buttonGroup">
<property name="exclusive">
<bool>true</bool>
</property>
</buttongroup>
</buttongroups>
</ui>
20 changes: 20 additions & 0 deletions qt/src/Recognizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,18 +86,27 @@ Recognizer::Recognizer(const UI_MainWindow& _ui) :
m_pagesDialog = new QDialog(MAIN);
m_pagesDialogUi.setupUi(m_pagesDialog);

m_charListDialog = new QDialog(MAIN);
m_charListDialogUi.setupUi(m_charListDialog);

ui.toolButtonRecognize->setText(QString("%1\n%2").arg(m_modeLabel).arg(m_langLabel));
ui.menuLanguages->installEventFilter(this);

connect(ui.toolButtonRecognize, SIGNAL(clicked()), this, SLOT(recognizeButtonClicked()));
connect(currentPageAction, SIGNAL(triggered()), this, SLOT(recognizeCurrentPage()));
connect(multiplePagesAction, SIGNAL(triggered()), this, SLOT(recognizeMultiplePages()));
connect(m_pagesDialogUi.lineEditPageRange, SIGNAL(textChanged(QString)), this, SLOT(clearLineEditPageRangeStyle()));
connect(m_charListDialogUi.radioButtonBlacklist, SIGNAL(toggled(bool)), m_charListDialogUi.lineEditBlacklist, SLOT(setEnabled(bool)));
connect(m_charListDialogUi.radioButtonWhitelist, SIGNAL(toggled(bool)), m_charListDialogUi.lineEditWhitelist, SLOT(setEnabled(bool)));

ADD_SETTING(VarSetting<QString>("language", "eng:en_EN"));
ADD_SETTING(ComboSetting("ocrregionstrategy", m_pagesDialogUi.comboBoxRecognitionArea, 0));
ADD_SETTING(SwitchSetting("ocraddsourcefilename", m_pagesDialogUi.checkBoxPrependFilename));
ADD_SETTING(SwitchSetting("ocraddsourcepage", m_pagesDialogUi.checkBoxPrependPage));
ADD_SETTING(LineEditSetting("ocrcharwhitelist", m_charListDialogUi.lineEditWhitelist));
ADD_SETTING(LineEditSetting("ocrcharblacklist", m_charListDialogUi.lineEditBlacklist));
ADD_SETTING(SwitchSetting("ocrblacklistenabled", m_charListDialogUi.radioButtonBlacklist, true));
ADD_SETTING(SwitchSetting("ocrwhitelistenabled", m_charListDialogUi.radioButtonWhitelist, false));
ADD_SETTING(VarSetting<int>("psm", 6));
}

Expand Down Expand Up @@ -288,6 +297,7 @@ void Recognizer::updateLanguagesMenu() {
QAction* psmAction = new QAction(_("Page segmentation mode"), ui.menuLanguages);
psmAction->setMenu(psmMenu);
ui.menuLanguages->addAction(psmAction);
ui.menuLanguages->addAction(_("Character whitelist / blacklist..."), this, SLOT(manageCharacterLists()));


// Add installer item
Expand Down Expand Up @@ -343,6 +353,10 @@ void Recognizer::psmSelected(QAction* action) {
ConfigSettings::get<VarSetting<int>>("psm")->setValue(action->data().toInt());
}

/**
 * Slot behind the "Character whitelist / blacklist..." menu entry: runs the
 * character list dialog through exec().
 */
void Recognizer::manageCharacterLists() {
	// The dialog's result code is intentionally discarded.
	static_cast<void>(m_charListDialog->exec());
}

QList<int> Recognizer::selectPages(bool& autodetectLayout) {
int nPages = MAIN->getDisplayer()->getNPages();

Expand Down Expand Up @@ -422,6 +436,12 @@ void Recognizer::recognize(const QList<int>& pages, bool autodetectLayout) {
if(ok) {
QString failed;
tess.SetPageSegMode(static_cast<tesseract::PageSegMode>(m_psmCheckGroup->checkedAction()->data().toInt()));
if(m_charListDialogUi.radioButtonWhitelist->isChecked()) {
tess.SetVariable("tessedit_char_whitelist", m_charListDialogUi.lineEditWhitelist->text().toLocal8Bit());
}
if(m_charListDialogUi.radioButtonBlacklist->isChecked()) {
tess.SetVariable("tessedit_char_blacklist", m_charListDialogUi.lineEditBlacklist->text().toLocal8Bit());
}
OutputEditor::ReadSessionData* readSessionData = MAIN->getOutputEditor()->initRead(tess);
ProgressMonitor monitor(pages.size());
MAIN->showProgress(&monitor);
Expand Down
Loading

0 comments on commit 88c3491

Please sign in to comment.