Skip to content

Commit

Permalink
Do not add a space character for CJK text.
Browse files Browse the repository at this point in the history
  • Loading branch information
yu committed Jun 22, 2018
1 parent 2a40945 commit fe04fd3
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 3 deletions.
16 changes: 16 additions & 0 deletions qt/src/Utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -221,3 +221,19 @@ QString Utils::getSpellingLanguage(const QString& lang) {
}
return syslang;
}

// Unicode blocks http://www.fileformat.info/info/unicode/block/index.htm
bool Utils::spacedWord(const QString& text, bool prevWord) {
short unicode = (prevWord ? text.back() : text.front()).unicode();
// CJK Word
std::vector<std::pair<int, int>> cjkWordRange{{0x2480, 0x303f}, {0x31c0, 0x9fff}
, {0xf900, 0xfaff}, {0xfe30, 0xfe4f}, {0x20000, 0x2fa1f}};
for(int i = 0; i < cjkWordRange.size(); i++) {
if(unicode < cjkWordRange[i].first) {
return true;
} else if(unicode >= cjkWordRange[i].first && unicode <= cjkWordRange[i].second) {
return false;
}
}
return true;
}
2 changes: 2 additions & 0 deletions qt/src/Utils.hh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ QByteArray download(QUrl url, QString& messages, int timeout = 60000);

QString getSpellingLanguage(const QString& lang = QString());

// Reports whether a word should be separated by a space from its neighbour,
// judged by a single boundary character of text: the last character when
// prevWord is true, otherwise the first. The definition in Utils.cc returns
// false when that character falls inside one of its CJK ranges, true otherwise.
bool spacedWord(const QString& text, bool prevWord);

template<typename T>
class AsyncQueue {
public:
Expand Down
17 changes: 14 additions & 3 deletions qt/src/hocr/HOCRPdfExporter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,8 @@ void HOCRPdfExporter::printChildren(PDFPainter& painter, const HOCRItem* item, c
QString itemClass = item->itemClass();
QRect itemRect = item->bbox();
int childCount = item->children().size();
bool prevSpacedWord, currentSpacedWord;
prevSpacedWord = currentSpacedWord = false;
if(itemClass == "ocr_par" && pdfSettings.uniformizeLineSpacing) {
double yInc = double(itemRect.height()) / childCount;
double y = itemRect.top() + yInc;
Expand All @@ -706,15 +708,24 @@ void HOCRPdfExporter::printChildren(PDFPainter& painter, const HOCRItem* item, c
if(pdfSettings.fontSize == -1) {
painter.setFontSize(wordItem->fontSize() * pdfSettings.detectedFontScaling);
}

prevWordRight = wordRect.right();
QString text = wordItem->text();
currentSpacedWord = Utils::spacedWord(text, false);
// If distance from previous word is large, keep the space
if(wordRect.x() - prevWordRight > pdfSettings.preserveSpaceWidth * painter.getAverageCharWidth() / px2pu) {
x = wordRect.x();
} else {
//need space
if(currentSpacedWord && prevSpacedWord ) {
x += painter.getTextWidth(" ") / px2pu;
}
}
prevWordRight = wordRect.right();
QString text = wordItem->text();

double wordBaseline = (x - itemRect.x()) * baseline.first + baseline.second;
painter.drawText(x * px2pu, (y + wordBaseline) * px2pu, text);
x += painter.getTextWidth(text + " ") / px2pu;
x += painter.getTextWidth(text) / px2pu;
prevSpacedWord = Utils::spacedWord(text, true);
}
}
} else if(itemClass == "ocr_line" && !pdfSettings.uniformizeLineSpacing) {
Expand Down

0 comments on commit fe04fd3

Please sign in to comment.