Skip to content

Commit

Permalink
Correctly process page ratio and parse things out for miniOCR
Browse files Browse the repository at this point in the history
removed all data too
  • Loading branch information
DiegoPino committed Nov 24, 2020
1 parent f1073bd commit 0442676
Showing 1 changed file with 15 additions and 6 deletions.
21 changes: 15 additions & 6 deletions src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug
if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) {
// To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber}
$page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1;
$pageid = $node_uuid . '/canvas/' . $file_uuid . '/p' . $page_number;
// $pageid not used for now, let's go with the page_number only

setlocale(LC_CTYPE, 'en_US.UTF-8');
$execstring = $this->buildExecutableCommand($io);
error_log($execstring);
Expand Down Expand Up @@ -306,7 +305,6 @@ public function buildExecutableCommand(\stdClass $io) {
}

protected function hOCRtoMiniOCR($output, $pageid) {
error_log($output);
$hocr = simplexml_load_string($output);
$internalErrors = libxml_use_internal_errors(TRUE);
libxml_clear_errors();
Expand All @@ -320,7 +318,20 @@ protected function hOCRtoMiniOCR($output, $pageid) {
$miniocr->startDocument('1.0', 'UTF-8');
$miniocr->startElement("ocr");
foreach ($hocr->body->children() as $page) {
$coos = explode(" ", substr($page['title'], 5));
$titleparts = explode(';', $page['title']);
$pagetitle = NULL;
foreach ($titleparts as $titlepart) {
$titlepart = trim($titlepart);
if (strpos($titlepart, 'bbox') === 0 ) {
$pagetitle = substr($titlepart, 5);
}
}
if ($pagetitle == NULL) {
$miniocr->flush();
error_log('Could not convert HOCR to MiniOCR, no valid page dimensions found');
return NULL;
}
$coos = explode(" ", $pagetitle);
// To avoid divisions by 0
$pwidth = (float) $coos[2] ? (float) $coos[2] : 1;
$pheight = (float) $coos[3] ? (float) $coos[3] : 1;
Expand All @@ -346,7 +357,6 @@ protected function hOCRtoMiniOCR($output, $pageid) {
$text = (string) $word;
$miniocr->startElement("w");
$miniocr->writeAttribute("x", $l . ' ' . $t . ' ' . $w . ' ' . $h);

$miniocr->text($text);
$miniocr->endElement();
}
Expand All @@ -360,7 +370,6 @@ protected function hOCRtoMiniOCR($output, $pageid) {
$miniocr->endElement();
$miniocr->endDocument();
unset($hocr);

return $miniocr->outputMemory(TRUE);
}

Expand Down

0 comments on commit 0442676

Please sign in to comment.