diff --git a/qt/src/Utils.cc b/qt/src/Utils.cc
index 32cc1662..1ad7dc37 100644
--- a/qt/src/Utils.cc
+++ b/qt/src/Utils.cc
@@ -221,3 +221,40 @@ QString Utils::getSpellingLanguage(const QString& lang) {
 	}
 	return syslang;
 }
+
+// Tells whether a word is separated from its neighbour by a space. Only one
+// character is inspected: the last one of text if prevWord is true, the first
+// one otherwise. Returns false if that character lies in one of the CJK ranges
+// below, true in all other cases (including an empty text).
+// Unicode blocks http://www.fileformat.info/info/unicode/block/index.htm
+bool Utils::spacedWord(const QString& text, bool prevWord) {
+	if(text.isEmpty()) {
+		// Nothing to classify, and at() must not be called on an empty string
+		return true;
+	}
+	const int pos = prevWord ? text.size() - 1 : 0;
+	// unicode() is an unsigned 16 bit value, storing it in a short would make everything above 0x7fff negative
+	uint unicode = text.at(pos).unicode();
+	// Characters above 0xffff are stored as surrogate pairs, decode them so that the last range can match
+	if(prevWord && pos > 0 && text.at(pos).isLowSurrogate() && text.at(pos - 1).isHighSurrogate()) {
+		unicode = QChar::surrogateToUcs4(text.at(pos - 1), text.at(pos));
+	} else if(!prevWord && text.size() > 1 && text.at(0).isHighSurrogate() && text.at(1).isLowSurrogate()) {
+		unicode = QChar::surrogateToUcs4(text.at(0), text.at(1));
+	}
+	// CJK Word, ranges sorted in ascending order
+	static const struct {
+		uint first;
+		uint last;
+	} cjkWordRange[] = {
+		{0x2480, 0x303f}, {0x31c0, 0x9fff}, {0xf900, 0xfaff}, {0xfe30, 0xfe4f}, {0x20000, 0x2fa1f}
+	};
+	for(const auto& range : cjkWordRange) {
+		if(unicode < range.first) {
+			// Below the current range and above all previous ones
+			return true;
+		} else if(unicode <= range.last) {
+			return false;
+		}
+	}
+	return true;
+}
diff --git a/qt/src/Utils.hh b/qt/src/Utils.hh
index e87aa988..bea28199 100644
--- a/qt/src/Utils.hh
+++ b/qt/src/Utils.hh
@@ -53,6 +53,9 @@ QByteArray download(QUrl url, QString& messages, int timeout = 60000);
 
 QString getSpellingLanguage(const QString& lang = QString());
 
+// Whether the first (prevWord = false) or last (prevWord = true) character of text is outside the CJK ranges
+bool spacedWord(const QString& text, bool prevWord);
+
 template<typename T>
 class AsyncQueue {
 public:
diff --git a/qt/src/hocr/HOCRPdfExporter.cc b/qt/src/hocr/HOCRPdfExporter.cc
index 136b0687..e4c6b3ee 100644
--- a/qt/src/hocr/HOCRPdfExporter.cc
+++ b/qt/src/hocr/HOCRPdfExporter.cc
@@ -685,6 +685,10 @@ void HOCRPdfExporter::printChildren(PDFPainter& painter, const HOCRItem* item, c
 	QString itemClass = item->itemClass();
 	QRect itemRect = item->bbox();
 	int childCount = item->children().size();
+	// Results of Utils::spacedWord for the previously printed word and for the current one
+	// NOTE(review): prevSpacedWord is not reset by this change when a new text line starts - confirm that this is intended
+	bool prevSpacedWord = false;
+	bool currentSpacedWord = false;
 	if(itemClass == "ocr_par" && pdfSettings.uniformizeLineSpacing) {
 		double yInc = double(itemRect.height()) / childCount;
 		double y = itemRect.top() + yInc;
@@ -703,15 +707,25 @@ void HOCRPdfExporter::printChildren(PDFPainter& painter, const HOCRItem* item, c
 				if(pdfSettings.fontSize == -1) {
 					painter.setFontSize(wordItem->fontSize() * pdfSettings.detectedFontScaling);
 				}
+
+				QString text = wordItem->text();
+				currentSpacedWord = Utils::spacedWord(text, false);
 				// If distance from previous word is large, keep the space
 				if(wordRect.x() - prevWordRight > pdfSettings.preserveSpaceWidth * painter.getAverageCharWidth() / px2pu) {
 					x = wordRect.x();
+				} else {
+					// Words are close: insert a single space, but only if both neighbouring characters are non-CJK
+					if(currentSpacedWord && prevSpacedWord) {
+						x += painter.getTextWidth(" ") / px2pu;
+					}
 				}
+				// Update only after the distance check above, which needs the right edge of the previous word
 				prevWordRight = wordRect.right();
-				QString text = wordItem->text();
+
 				double wordBaseline = (x - itemRect.x()) * baseline.first + baseline.second;
 				painter.drawText(x * px2pu, (y + wordBaseline) * px2pu, text);
-				x += painter.getTextWidth(text + " ") / px2pu;
+				x += painter.getTextWidth(text) / px2pu;
+				prevSpacedWord = Utils::spacedWord(text, true);
 			}
 		}
 	} else if(itemClass == "ocr_line" && !pdfSettings.uniformizeLineSpacing) {