diff --git a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php
index ba18063..6d1c715 100644
--- a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php
+++ b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php
@@ -193,7 +193,7 @@ public function processItem($data) {
error_log(empty($processed_data));
//@TODO allow a force in case of corrupted key value? Partial output
// Extragenous weird data?
- if (empty($processed_data) ||
+ if (true || empty($processed_data) ||
$data->force == TRUE ||
(!isset($processed_data->checksum) ||
empty($processed_data->checksum) ||
@@ -219,6 +219,7 @@ public function processItem($data) {
$toindex = new \stdClass();
$toindex->fulltext = $io->output;
$toindex->checksum = $data->metadata['checksum'];
+ error_log(var_export($toindex,true));
$this->keyValue->get($keyvalue_collection)->set($key, $toindex);
// Get which indexes have our StrawberryfieldFlavorDatasource enabled!
diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php
new file mode 100644
index 0000000..1430be0
--- /dev/null
+++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php
@@ -0,0 +1,350 @@
+ 'asstructure',
+ 'mime_type' => ['application/pdf'],
+ 'path' => '',
+ 'path_tesseract' => '',
+ 'arguments' => '',
+ 'arguments_tesseract' => '',
+ 'output_type' => 'json',
+ 'output_destination' => 'subkey',
+ ] + parent::defaultConfiguration();
+ }
+
+
+  /**
+   * Reports the configuration dependencies of this processor.
+   *
+   * @return array
+   *   Currently always an empty array: no dependency is computed yet.
+   */
+  public function calculateDependencies() {
+    // Since Processors could be chained we need to check if any other
+    // processor instance is using an instance of this one.
+    // @TODO: Compute real dependencies once chaining is implemented.
+    // Until then return an explicit empty array. The previous empty body
+    // returned NULL implicitly, while Drupal's dependency calculation
+    // (presumably the consumer of this method) expects an array to merge.
+    return [];
+  }
+
+  /**
+   * Builds the configuration form elements for this processor.
+   *
+   * Every element is pre-populated from the current plugin configuration.
+   *
+   * @param array $parents
+   *   Not used by this implementation.
+   * @param FormStateInterface $form_state
+   *   Not used by this implementation.
+   *
+   * @return array
+   *   Form API elements keyed by configuration key.
+   */
+  public function settingsForm(array $parents, FormStateInterface $form_state) {
+    // Read the configuration once instead of once per element.
+    $config = $this->getConfiguration();
+    $element = [];
+
+    // Checkbox groups need an array as default value.
+    $jsonkey = (!empty($config['jsonkey']) && is_array($config['jsonkey'])) ? $config['jsonkey'] : [];
+    $output_destination = (!empty($config['output_destination']) && is_array($config['output_destination'])) ? $config['output_destination'] : [];
+
+    // The default for this key is an array, but a textfield needs a string:
+    // show arrays in the comma separated form the description asks for.
+    $mime_type = $config['mime_type'];
+    if (is_array($mime_type)) {
+      $mime_type = implode(',', $mime_type);
+    }
+
+    $element['source_type'] = [
+      '#type' => 'select',
+      '#title' => $this->t('The type of source data this processor works on'),
+      '#options' => [
+        'asstructure' => 'File entities referenced in the as:filetype JSON structure',
+        'filepath' => 'Full file paths passed by another processor',
+      ],
+      '#default_value' => $config['source_type'],
+      '#description' => $this->t('Select from where the source file this processor needs is fetched'),
+      '#required' => TRUE,
+    ];
+
+    $element['ado_type'] = [
+      '#type' => 'textfield',
+      '#title' => $this->t('ADO type(s) to limit this processor to.'),
+      '#default_value' => $config['ado_type'],
+      '#description' => $this->t('A single ADO type or a coma delimited list of ado types that qualify to be Processed. Leave empty to apply to all ADOs.'),
+    ];
+
+    $element['jsonkey'] = [
+      '#type' => 'checkboxes',
+      '#title' => $this->t('The JSON key that contains the desired source files.'),
+      '#options' => [
+        'as:image' => 'as:image',
+        'as:document' => 'as:document',
+        'as:audio' => 'as:audio',
+        'as:video' => 'as:video',
+        'as:text' => 'as:text',
+        'as:application' => 'as:application',
+      ],
+      '#default_value' => $jsonkey,
+      // Only shown while the source type is the as:filetype structure.
+      '#states' => [
+        'visible' => [
+          ':input[name="pluginconfig[source_type]"]' => ['value' => 'asstructure'],
+        ],
+      ],
+      '#required' => TRUE,
+    ];
+
+    $element['mime_type'] = [
+      '#type' => 'textfield',
+      '#title' => $this->t('Mimetypes(s) to limit this Processor to.'),
+      '#default_value' => $mime_type,
+      '#description' => $this->t('A single Mimetype type or a coma separed list of mimetypes that qualify to be Processed. Leave empty to apply any file'),
+    ];
+
+    $element['path'] = [
+      '#type' => 'textfield',
+      '#title' => $this->t('The system path to the ghostscript (gs) binary that will be executed by this processor.'),
+      '#default_value' => $config['path'],
+      '#description' => $this->t('A full system path to the gs binary present in the same environment your PHP runs, e.g /usr/bin/gs'),
+      '#required' => TRUE,
+    ];
+
+    $element['arguments'] = [
+      '#type' => 'textfield',
+      '#title' => $this->t('Any additional argument your executable binary requires.'),
+      // Falls back to the bare file placeholder when nothing is configured.
+      '#default_value' => !empty($config['arguments']) ? $config['arguments'] : '%file',
+      '#description' => $this->t('Any arguments your binary requires to run. Use %file as replacement for the file if the executable requires the filename to be passed under a specific argument.'),
+      '#required' => TRUE,
+    ];
+
+    $element['path_tesseract'] = [
+      '#type' => 'textfield',
+      '#title' => $this->t('The system path to the Tesseract binary that will be executed by this processor.'),
+      '#default_value' => $config['path_tesseract'],
+      '#description' => $this->t('A full system path to the Tesseract binary present in the same environment your PHP runs, e.g /usr/bin/tesseract'),
+      '#required' => TRUE,
+    ];
+
+    $element['arguments_tesseract'] = [
+      '#type' => 'textfield',
+      '#title' => $this->t('Any additional argument for your tesseract binary.'),
+      // Falls back to the bare file placeholder when nothing is configured.
+      '#default_value' => !empty($config['arguments_tesseract']) ? $config['arguments_tesseract'] : '%file',
+      '#description' => $this->t('Any arguments your binary requires to run. Use %file as replacement for the file that is output but the GS binary.'),
+      '#required' => TRUE,
+    ];
+
+    $element['output_type'] = [
+      '#type' => 'select',
+      '#title' => $this->t('The expected and desired output of this processor.'),
+      '#options' => [
+        'entity:file' => 'One or more Files',
+        'json' => 'Data/Values that can be serialized to JSON',
+      ],
+      '#default_value' => $config['output_type'],
+      '#description' => $this->t('If the output is just data and "One or more Files" is selected all data will be dumped into a file and handled as such.'),
+    ];
+
+    $element['output_destination'] = [
+      '#type' => 'checkboxes',
+      '#title' => $this->t("Where and how the output will be used."),
+      '#options' => [
+        'subkey' => 'In the same Source Metadata, as a child structure of each Processed file',
+        'ownkey' => 'In the same Source Metadata but inside its own, top level, "as:flavour" subkey based on the given machine name of the current plugin',
+        'plugin' => 'As Input for another processor Plugin',
+      ],
+      '#default_value' => $output_destination,
+      '#description' => $this->t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'),
+      '#required' => TRUE,
+    ];
+
+    $element['timeout'] = [
+      '#type' => 'number',
+      '#title' => $this->t('Timeout in seconds for this process.'),
+      '#default_value' => $config['timeout'],
+      '#description' => $this->t('If the process runs out of time it can still be processed again.'),
+      '#size' => 2,
+      '#maxlength' => 2,
+      '#min' => 1,
+    ];
+
+    $element['weight'] = [
+      '#type' => 'number',
+      '#title' => $this->t('Order or execution in the global chain.'),
+      '#default_value' => $config['weight'],
+    ];
+
+    return $element;
+  }
+
+
+
+  /**
+   * Reacts to removed dependencies by deferring to the parent class.
+   *
+   * @param array $dependencies
+   *   Passed through unchanged to the parent implementation.
+   *
+   * @return mixed
+   *   Whatever the parent implementation returns.
+   */
+  public function onDependencyRemoval(array $dependencies) {
+    // Since Processors could be chained we need to check if any other
+    // processor instance is using an instance of this one.
+    // TODO: Change the autogenerated stub.
+    $result = parent::onDependencyRemoval($dependencies);
+    return $result;
+  }
+
+  /**
+   * Executes the logic of this plugin given a file path and a context.
+   *
+   * Builds the shell command for one page of the source file, executes it
+   * and stores the converted result (MiniOCR XML) in $io->output.
+   *
+   * @param \stdClass $io
+   *   $io->input needs to contain
+   *   \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property
+   *   \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments
+   *   plus metadata['dr:uuid'] (file UUID) and nuuid (node UUID).
+   *   $io->output will contain the result of the processor. It is left
+   *   untouched when no command could be built.
+   * @param string $context
+   *   Not used by this implementation.
+   *
+   * @throws \InvalidArgumentException
+   *   When the source file property or one of the two UUIDs is missing.
+   * @throws \Exception
+   *   When the command could not be executed or timed out.
+   */
+  public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPluginInterface::PROCESS) {
+    // Specific input key as defined in the annotation
+    // In this case it will contain an absolute Path to a File.
+    // Needed since this executes locally on the server via SHELL.
+    $input_property = $this->pluginDefinition['input_property'];
+    $input_argument = $this->pluginDefinition['input_argument'];
+    $file_uuid = isset($io->input->metadata['dr:uuid']) ? $io->input->metadata['dr:uuid'] : NULL;
+    $node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL;
+    $config = $this->getConfiguration();
+    // In seconds.
+    $timeout = $config['timeout'];
+
+    if (!isset($io->input->{$input_property}) || !$file_uuid || !$node_uuid) {
+      // This used to call a non-existent throwException() function, which
+      // ended in a fatal error instead of a catchable exception.
+      throw new \InvalidArgumentException('The OCR processor needs a source file, a file UUID (dr:uuid) and a node UUID (nuuid) in its input.');
+    }
+
+    // To be used by miniOCR as id in the form of
+    // {nodeuuid}/canvas/{fileuuid}/p{pagenumber}
+    $page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1;
+    $pageid = $node_uuid . '/canvas/' . $file_uuid . '/p' . $page_number;
+
+    // Support UTF-8 commands: shell escaping is locale aware. The current
+    // locale has to be read BEFORE switching, otherwise there is nothing
+    // left to restore.
+    // @see http://www.php.net/manual/en/function.shell-exec.php#85095
+    $backup_locale = setlocale(LC_CTYPE, '0');
+    setlocale(LC_CTYPE, 'en_US.UTF-8');
+    try {
+      $execstring = $this->buildExecutableCommand($io);
+      if ($execstring) {
+        $output = $this->proc_execute($execstring, $timeout);
+        if (is_null($output)) {
+          throw new \Exception("Could not execute {$execstring} or timed out");
+        }
+        $io->output = $this->hOCRtoMiniOCR($output, $pageid);
+      }
+    }
+    finally {
+      // Give the process its original locale back, also on failure.
+      if ($backup_locale !== FALSE) {
+        setlocale(LC_CTYPE, $backup_locale);
+      }
+    }
+  }
+
+  /**
+   * Builds a clean Command string using a File path.
+   *
+   * The command has two steps chained with "&&": ghostscript renders a
+   * single page of the source file to a grayscale PNG stored next to the
+   * source, then tesseract is run on that PNG.
+   *
+   * @param \stdClass $io
+   *   $io->input needs to contain
+   *   \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property
+   *   \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments
+   *   $io->output will contain the result of the processor
+   *
+   * @return null|string
+   *   NULL when no file path was passed. An empty string when a binary can
+   *   not be run, when an arguments setting lacks the %file placeholder or
+   *   when the path does not survive escaping. The full command otherwise.
+   */
+  public function buildExecutableCommand(\stdClass $io) {
+    $input_property = $this->pluginDefinition['input_property'];
+    $input_argument = $this->pluginDefinition['input_argument'];
+    $file_path = isset($io->input->{$input_property}) ? $io->input->{$input_property} : NULL;
+    // Sets the default page to 1 if not passed.
+    $page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1;
+    $config = $this->getConfiguration();
+    $execpath_gs = $config['path'];
+    $arguments_gs = $config['arguments'];
+    $execpath_tesseract = $config['path_tesseract'];
+    $arguments_tesseract = $config['arguments_tesseract'];
+
+    if (empty($file_path)) {
+      return NULL;
+    }
+
+    $utility = \Drupal::service('strawberryfield.utility');
+    $can_run_gs = $utility->verifyCommand($execpath_gs);
+    $can_run_tesseract = $utility->verifyCommand($execpath_tesseract);
+
+    if (!$can_run_gs ||
+      !$can_run_tesseract ||
+      strpos($arguments_gs, '%file') === FALSE ||
+      strpos($arguments_tesseract, '%file') === FALSE) {
+      error_log("missing arguments for OCR");
+      return '';
+    }
+
+    // The intermediate PNG is written next to the source file.
+    $filename = pathinfo($file_path, PATHINFO_FILENAME);
+    $sourcefolder = pathinfo($file_path, PATHINFO_DIRNAME);
+    $sourcefolder = strlen($sourcefolder) > 0 ? $sourcefolder . '/' : sys_get_temp_dir() . '/';
+    $gs_destination_filename = "{$sourcefolder}{$filename}_{$page_number}.png";
+
+    // This run function executes a 2 step function
+    // -- with r300 == 300dpi, should be configurable, etc. All should be
+    // configurable.
+    // First gs -dBATCH -dNOPAUSE -sDEVICE=pnggray -r300 -dUseCropBox
+    // -sOutputFile=somepage_pagenumber.png %file
+    // Paths are quoted one by one with escapeshellarg(): escapeshellcmd()
+    // alone leaves spaces untouched, which split such a path in two.
+    $command_gs = escapeshellcmd($execpath_gs)
+      . " -dBATCH -dNOPAUSE -r300 -dUseCropBox -dQUIET -sDEVICE=pnggray -dFirstPage={$page_number} -dLastPage={$page_number}"
+      . ' -sOutputFile=' . escapeshellarg($gs_destination_filename)
+      . ' ' . $this->fillFilePlaceholder($arguments_gs, $file_path);
+    // Then tesseract, fed with the PNG ghostscript just generated.
+    $command_tesseract = escapeshellcmd($execpath_tesseract)
+      . ' ' . $this->fillFilePlaceholder($arguments_tesseract, $gs_destination_filename);
+
+    $command = $command_gs . ' && ' . $command_tesseract;
+
+    // Only return $command if it contains the original filepath somewhere.
+    // This still refuses paths that quoting had to alter.
+    if (strpos($command, $file_path) !== FALSE) {
+      return $command;
+    }
+    return '';
+  }
+
+  /**
+   * Replaces the first %file placeholder with a shell quoted path.
+   *
+   * @param string $arguments
+   *   The configured arguments, containing a %file placeholder.
+   * @param string $path
+   *   The file path to insert.
+   *
+   * @return string
+   *   The escaped arguments. Any further %file occurrence is left as is.
+   */
+  protected function fillFilePlaceholder($arguments, $path) {
+    // '%s' was stripped before the former sprintf() based substitution;
+    // still dropped so existing settings produce the same arguments.
+    $arguments = str_replace('%s', '', $arguments);
+    // The path gets its own quotes below; quotes already put around the
+    // placeholder would otherwise end up inside the argument.
+    $arguments = str_replace(['"%file"', "'%file'"], '%file', $arguments);
+    $position = strpos($arguments, '%file');
+    if ($position === FALSE) {
+      return escapeshellcmd($arguments);
+    }
+    $before = (string) substr($arguments, 0, $position);
+    $after = (string) substr($arguments, $position + strlen('%file'));
+    // Both sides are escaped separately so the quotes added around the path
+    // are never escaped themselves.
+    return escapeshellcmd($before) . escapeshellarg($path) . escapeshellcmd($after);
+  }
+
+  /**
+   * Converts an hOCR document into a MiniOCR XML string.
+   *
+   * NOTE(review): direct children of a page are treated as lines and their
+   * direct children as words, and the bbox values are copied unchanged into
+   * the "x" attribute. Confirm both against real OCR output and against
+   * what the MiniOCR consumer expects.
+   *
+   * @param string $output
+   *   The hOCR (XML) document.
+   * @param string $pageid
+   *   Written as the "id" attribute of every page element.
+   *
+   * @return null|string
+   *   The MiniOCR XML, or NULL when $output could not be parsed.
+   */
+  protected function hOCRtoMiniOCR($output, $pageid) {
+    // Has to be enabled BEFORE parsing, otherwise invalid XML still raises
+    // PHP warnings. The previous setting is restored right after.
+    $internalErrors = libxml_use_internal_errors(TRUE);
+    $hocr = simplexml_load_string($output);
+    libxml_clear_errors();
+    libxml_use_internal_errors($internalErrors);
+    if (!$hocr) {
+      error_log('Could not convert HOCR to MiniOCR, sources is not valid XML');
+      return NULL;
+    }
+
+    // The bbox may be preceded or followed by other properties in the title
+    // attribute, so it is matched instead of being read by position.
+    $bbox_pattern = '/\bbbox\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/';
+
+    $w = new \XMLWriter();
+    $w->openMemory();
+    $w->startDocument('1.0', 'UTF-8');
+    $w->startElement("ocr");
+    // Without a body there is nothing to convert; an empty <ocr> is output.
+    if (isset($hocr->body)) {
+      foreach ($hocr->body->children() as $page) {
+        if (!preg_match($bbox_pattern, (string) $page['title'], $coos)) {
+          // No page dimensions, skip.
+          continue;
+        }
+        $w->startElement("p");
+        $w->writeAttribute("id", $pageid);
+        // Third and fourth bbox values.
+        $w->writeAttribute("wh", $coos[3] . " " . $coos[4]);
+        $w->startElement("b");
+        foreach ($page->children() as $line) {
+          $w->startElement("l");
+          foreach ($line->children() as $word) {
+            if (!preg_match($bbox_pattern, (string) $word['title'], $wcoos)) {
+              // No coordinates, skip.
+              continue;
+            }
+            $w->startElement("w");
+            $w->writeAttribute("x", $wcoos[1] . ' ' . $wcoos[2] . ' ' . $wcoos[3] . ' ' . $wcoos[4]);
+            $w->text((string) $word);
+            $w->endElement();
+          }
+          // Closes "l".
+          $w->endElement();
+        }
+        // Closes "b", then "p".
+        $w->endElement();
+        $w->endElement();
+      }
+    }
+    // Closes "ocr".
+    $w->endElement();
+    $w->endDocument();
+    unset($hocr);
+    return $w->outputMemory(TRUE);
+  }
+
+
+
+
+}