From d0d4a6d214a5f30f6e610d7c4c8084e7c72fadb5 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 22 Nov 2020 23:22:08 -0500 Subject: [PATCH 01/24] proc_open and killing of processes taking more than X seconds - This adds the ability to kill processes that are taking more time //@TODO each processor needs to also pass back a $io->output->garbage so we can clean up any left over files --- ...rawberryRunnersPostProcessorPluginBase.php | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php index ac8ffb6..13c6fe7 100644 --- a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php +++ b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php @@ -133,7 +133,31 @@ public function run(\stdClass $io, $context = strawberryRunnersPostProcessorPlug return FALSE; } + protected function proc_execute($command, $timeout = 5) { + $handle = proc_open($command, [['pipe', 'r'], ['pipe', 'w'], ['pipe', 'w']], $pipe); + $startTime = microtime(true); + $read = NULL; + /* Read the command output and kill it if the proccess surpassed the timeout */ + while(!feof($pipe[1])) { + $read .= fread($pipe[1], 8192); + if($startTime + $timeout < microtime(true)) { + $read = NULL; + break; + } + } + $status = proc_get_status($handle); + error_log(var_export($status,true)); + $this->kill($status['pid']); + proc_close($handle); + + return $read; + } + + /* The proc_terminate() function doesn't end proccess properly on Windows */ + protected function kill($pid) { + return strstr(PHP_OS, 'WIN') ? exec("taskkill /F /T /PID $pid") : exec("kill -9 $pid"); + } -} \ No newline at end of file +} From 0010de207c4ba8f0a3793a5c6675ce44c387eee1 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 22 Nov 2020 23:24:56 -0500 Subject: [PATCH 02/24] Moves Processing from PreSave to PostSave because we need the node->id()... 
i always testing with existing ones. This now happens all when all is already saved. So we do not push data back into the node. We may want to have a PreSave one too to read flags like "force process" and then delete them before all gets saved. --- ...nersEventSavePostProcessingSubscriber.php} | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) rename src/EventSubscriber/{StrawberryRunnersEventPreSavePostProcessingSubscriber.php => StrawberryRunnersEventSavePostProcessingSubscriber.php} (90%) diff --git a/src/EventSubscriber/StrawberryRunnersEventPreSavePostProcessingSubscriber.php b/src/EventSubscriber/StrawberryRunnersEventSavePostProcessingSubscriber.php similarity index 90% rename from src/EventSubscriber/StrawberryRunnersEventPreSavePostProcessingSubscriber.php rename to src/EventSubscriber/StrawberryRunnersEventSavePostProcessingSubscriber.php index 153fbe2..baa2962 100644 --- a/src/EventSubscriber/StrawberryRunnersEventPreSavePostProcessingSubscriber.php +++ b/src/EventSubscriber/StrawberryRunnersEventSavePostProcessingSubscriber.php @@ -5,7 +5,6 @@ use Drupal\Core\Entity\EntityTypeManagerInterface; use Drupal\Core\Session\AccountInterface; use Drupal\strawberryfield\Event\StrawberryfieldCrudEvent; -use Drupal\strawberryfield\EventSubscriber\StrawberryfieldEventPresaveSubscriber; use Drupal\Core\StringTranslation\StringTranslationTrait; use Drupal\Core\StringTranslation\TranslationInterface; use Drupal\Core\Messenger\MessengerInterface; @@ -14,14 +13,14 @@ use Drupal\Component\Utility\Unicode; use Drupal\file\FileInterface; use Drupal\Core\StreamWrapper\StreamWrapperManagerInterface; -use Drupal\Core\StreamWrapper\StreamWrapperInterface; use Drupal\Core\File\FileSystemInterface; use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginManager; +use Drupal\strawberryfield\EventSubscriber\StrawberryfieldEventSaveSubscriber; /** * Event subscriber for SBF bearing entity json process event. 
*/ -class StrawberryRunnersEventPreSavePostProcessingSubscriber extends StrawberryfieldEventPresaveSubscriber { +class StrawberryRunnersEventSavePostProcessingSubscriber extends StrawberryfieldEventSaveSubscriber { use StringTranslationTrait; @@ -150,7 +149,7 @@ public function __construct( * @throws \Drupal\Component\Plugin\Exception\PluginException * @throws \Drupal\Component\Plugin\Exception\PluginNotFoundException */ - public function onEntityPresave(StrawberryfieldCrudEvent $event) { + public function onEntitySave(StrawberryfieldCrudEvent $event) { /* @var $plugin_config_entities \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity[] */ $plugin_config_entities = $this->entityTypeManager->getListBuilder('strawberry_runners_postprocessor')->load(); @@ -232,9 +231,9 @@ public function onEntityPresave(StrawberryfieldCrudEvent $event) { /** @var $itemfield \Drupal\strawberryfield\Plugin\Field\FieldType\StrawberryFieldItem */ $flatvalues = (array) $itemfield->provideFlatten(); // Run first on entity:files - $sbf_type = NULL; + $sbf_type = []; if (isset($flatvalues['type'])) { - $sbf_type = $flatvalues['type']; + $sbf_type = (array) $flatvalues['type']; } foreach ($askeymap as $jsonkey => $activePlugins) { if (isset($flatvalues[$jsonkey])) { @@ -243,12 +242,16 @@ public function onEntityPresave(StrawberryfieldCrudEvent $event) { foreach($activePlugins as $activePluginId => $config) { $valid_mimes = []; - if (empty($config['ado_type']) || in_array($config['ado_type'] , $sbf_type)) { + //@TODO also split $config['ado_type'] so we can check + $valid_ado_type = []; + $valid_ado_type = explode(',', $config['ado_type']); + if (empty($config['ado_type']) || count(array_intersect($valid_ado_type , $sbf_type)) > 0) { $valid_mimes = explode(',', $config['mime_type']); if (empty($valid_mimes) || (isset($asstructure["dr:mimetype"]) && in_array($asstructure["dr:mimetype"], $valid_mimes))) { $data = new \stdClass(); $data->fid = $asstructure['dr:fid']; $data->nid 
= $entity->id(); + $data->nuuid = $entity->uuid(); // We are passing also the full file metadata. // This gives us an advantage so we can reuse // Sequence IDs, PDF pages, etc and act on them @@ -260,6 +263,16 @@ public function onEntityPresave(StrawberryfieldCrudEvent $event) { // $activePluginId? That would allow us to skip reprocessing // Easier? $data->metadata = $asstructure; + + // @TODO how to force? + // Can be a state key, valuekey, or a JSON passed property. + // Issue with JSON passed property is that we can no longer + // Here modify it (Entity is saved) + // So we should really better have a non Metadata method for this + // Or/ we can have a preSave Subscriber that reads the prop, + // sets the state and then removes if before saving + + $data->force = FALSE; $data->plugin_config_entity_id = $activePluginId; // See https://github.com/esmero/strawberry_runners/issues/10 // Since the destination Queue can be a modal thing From b332c5954084fd6d148366e3ab404068185b9a40 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 22 Nov 2020 23:25:48 -0500 Subject: [PATCH 03/24] Updates processor annotations (this may change, i feel its a bit complex still) And enforces also time based kill -9 --- config/schema/strawberry_runners.schema.yml | 47 +++++++++++++++++++ .../StrawberryRunnersPostProcessor.php | 9 +++- .../SystemBinaryPostProcessor.php | 41 +++++++++++----- 3 files changed, 83 insertions(+), 14 deletions(-) diff --git a/config/schema/strawberry_runners.schema.yml b/config/schema/strawberry_runners.schema.yml index 8561a68..074595b 100644 --- a/config/schema/strawberry_runners.schema.yml +++ b/config/schema/strawberry_runners.schema.yml @@ -31,6 +31,44 @@ strawberryfield_runners.strawberry_runners_postprocessor.*: strawberryfield_runners.strawberry_runners_postprocessor.binary: type: config_object label: 'Strawberry Runners Post Processor Config Entity Binary specific config' + mapping: + source_type: + type: string + label: 'The type of Source 
Data this Processor works on' + ado_type: + type: string + label: 'DO type(s) to limit this Processor to' + jsonkey: + type: sequence + label: 'The JSON key(s) containing the desired Source File(s)' + sequence: + - type: string + mime_type: + type: string + label: 'Mimetypes(s) to limit this Processor to' + path: + type: string + label: 'The path for he binary to execute' + arguments: + type: string + label: 'Any additional argument your executable binary requires' + output_type: + type: string + label: 'The expected and desired output of this processor' + output_destination: + type: sequence + label: 'Where and how the output will be used' + sequence: + - type: string + timeout: + type: integer + label: 'Timeout in seconds for this process' + weight: + type: integer + label: 'Order or execution in the global chain' +strawberryfield_runners.strawberry_runners_postprocessor.ocr: + type: config_object + label: 'Strawberry Runners Post Processor Config Entity OCR specific config' mapping: source_type: type: string @@ -49,6 +87,15 @@ strawberryfield_runners.strawberry_runners_postprocessor.binary: arguments: type: string label: 'Any additional argument your executable binary requires' + tesseract_arguments: + type: string + label: 'Any additional argument your executable binary requires' + path: + type: string + label: 'The path for he binary to execute' + tesseract_path: + type: string + label: 'The path for he binary to execute' output_type: type: string label: 'The expected and desired output of this processor' diff --git a/src/Annotation/StrawberryRunnersPostProcessor.php b/src/Annotation/StrawberryRunnersPostProcessor.php index 0e9ae67..3b5a3bf 100644 --- a/src/Annotation/StrawberryRunnersPostProcessor.php +++ b/src/Annotation/StrawberryRunnersPostProcessor.php @@ -55,6 +55,13 @@ class StrawberryRunnersPostProcessor extends Plugin { */ public $input_property; + /** + * The Object property that contains the additional data needed by the Processor ::run method + * + 
* @var string $input_arguments; + * + */ + public $input_arguments; /** * Processing stage: can be Entity PreSave or Index time search_api @@ -64,4 +71,4 @@ class StrawberryRunnersPostProcessor extends Plugin { */ public $when = StrawberryRunnersPostProcessor::PRESAVE; -} \ No newline at end of file +} diff --git a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php index aeb8cd3..266199e 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php @@ -22,7 +22,8 @@ * id = "binary", * label = @Translation("Post processor that uses a System Binary to process files"), * input_type = "entity:file", - * input_property = "filepath" + * input_property = "filepath", + * input_argument = NULL * ) */ class SystemBinaryPostProcessor extends StrawberryRunnersPostProcessorPluginBase{ @@ -66,7 +67,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '#type' => 'textfield', '#title' => $this->t('ADO type(s) to limit this processor to.'), '#default_value' => $this->getConfiguration()['ado_type'], - '#description' => $this->t('A single ADO type or a coma separed list of ado types that qualify to be Processed. Leave empty to apply to all ADOs.'), + '#description' => $this->t('A single ADO type or a coma delimited list of ado types that qualify to be Processed. Leave empty to apply to all ADOs.'), ]; $element['jsonkey'] = [ @@ -167,7 +168,9 @@ public function onDependencyRemoval(array $dependencies) { * Executes the logic of this plugin given a file path and a context. 
* * @param \stdClass $io - * $io->input needs to contain \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property + * $io->input needs to contain property and the arguments if any + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments * $io->output will contain the result of the processor * @param string $context */ @@ -176,11 +179,16 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug // In this case it will contain an absolute Path to a File. // Needed since this executes locally on the server via SHELL. $input_property = $this->pluginDefinition['input_property']; - error_log('run'); + $input_argument = $this->pluginDefinition['input_arguments']; + // NOT user here? + $config = $this->getConfiguration(); + $timeout = $config['timeout']; // in seconds + // TODO how do we map $input_argument to the callable executable binary? + error_log('run system binary'); error_log($io->input->{$input_property}); if (isset($io->input->{$input_property})) { setlocale(LC_CTYPE, 'en_US.UTF-8'); - $execstring = $this->buildExecutableCommand($io->input->{$input_property}); + $execstring = $this->buildExecutableCommand($io); error_log($execstring); if ($execstring) { $backup_locale = setlocale(LC_CTYPE, '0'); @@ -188,12 +196,12 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug // Support UTF-8 commands. 
// @see http://www.php.net/manual/en/function.shell-exec.php#85095 shell_exec("LANG=en_US.utf-8"); - $output = shell_exec($execstring); + //$output = shell_exec($execstring); + $output = $this->proc_execute($execstring, $timeout); if (is_null($output)) { - throw new \Exception("Could not execute {$execstring}"); + throw new \Exception("Could not execute {$execstring} or timed out"); } $io->output = $output; - } } else { \throwException(new \InvalidArgumentException); @@ -203,29 +211,36 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug /** * Builds a clean Command string using a File path. * - * @param string $filepath + * @param \stdClass $io * * @return null|string */ - public function buildExecutableCommand(string $filepath) { + public function buildExecutableCommand(\stdClass $io) { $config = $this->getConfiguration(); $execpath = $config['path']; $arguments = $config['arguments']; $command = ''; + $input_property = $this->pluginDefinition['input_property']; + $input_argument = $this->pluginDefinition['input_argument']; + // Sets the default page to 1 if not passed. + $file_path = isset($io->input->{$input_property}) ? $io->input->{$input_property} : NULL; error_log('verify!'.(int) \Drupal::service('strawberryfield.utility')->verifyCommand($execpath)); + if (empty($file_path)) { + return NULL; + } if (\Drupal::service('strawberryfield.utility')->verifyCommand($execpath) && (strpos($arguments, '%file' ) !== FALSE)) { error_log('its a command, well well'); $arguments = str_replace('%s','', $arguments); $arguments = str_replace_first('%file','%s', $arguments); - $arguments = sprintf($arguments, $filepath); + $arguments = sprintf($arguments, $file_path); error_log($arguments); $command = escapeshellcmd($execpath.' 
'.$arguments); error_log($command); } // Only return $command if it contains the original filepath somewhere - if (strpos($command, $filepath) !== false) { return $command;} + if (strpos($command, $file_path) !== false) { return $command;} return ''; } @@ -244,4 +259,4 @@ private function verifyCommand($execpath) :bool { return is_executable(shell_exec("$test $execpath")); } -} \ No newline at end of file +} From 618a56c59ed7ba2c6e68c592def0ea90775e6605 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 22 Nov 2020 23:26:14 -0500 Subject: [PATCH 04/24] Updates Service to run on PostSave @giancarlobi you may need a drush cr before trying this out --- strawberry_runners.services.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/strawberry_runners.services.yml b/strawberry_runners.services.yml index fed6810..7a7b337 100644 --- a/strawberry_runners.services.yml +++ b/strawberry_runners.services.yml @@ -3,7 +3,7 @@ services: class: Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginManager parent: default_plugin_manager strawberry_runner.postprocessing_subscriber: - class: Drupal\strawberry_runners\EventSubscriber\StrawberryRunnersEventPreSavePostProcessingSubscriber + class: Drupal\strawberry_runners\EventSubscriber\StrawberryRunnersEventSavePostProcessingSubscriber tags: - {name: event_subscriber} arguments: ['@string_translation', '@messenger', '@logger.factory', '@config.factory', '@stream_wrapper_manager', '@file_system', '@entity_type.manager', '@strawberry_runner.processor_manager', '@current_user'] From e144f71dc697662ba7f6895f24a492d4da839e24 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 22 Nov 2020 23:27:20 -0500 Subject: [PATCH 05/24] So many updates on our Queue Worker Many conditionals and double false/sanity checks before even trying to run We pass more data, we get more data back we push more data This depends @giancarlobi on https://github.com/esmero/strawberryfield/pull/113 --- 
.../IndexPostProcessorQueueWorker.php | 65 ++++++++++++------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php index 2317f3d..ba18063 100644 --- a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php @@ -159,12 +159,13 @@ public function processItem($data) { $processor_instance = $this->getProcessorPlugin($data->plugin_config_entity_id); - if (!isset($data->fid) || $data->fid == NULL || !isset($data->nid) || $data->nid == NULL) { + if (!isset($data->fid) || $data->fid == NULL || !isset($data->nid) || $data->nid == NULL || !is_array($data->metadata)) { return; } $file = $this->entityTypeManager->getStorage('file')->load($data->fid); - if ($file === NULL) { + if ($file === NULL || !isset($data->metadata['checksum'])) { + error_log('Sorry the file does not exist or has no checksum yet. We really need the checksum'); return; } //@TODO should we wrap this around a try catch? @@ -188,37 +189,53 @@ public function processItem($data) { // Skip file if element is found in key_value collection. $processed_data = $this->keyValue->get($keyvalue_collection)->get($key); - - if (empty($processed_data)) { + error_log('Is this already in our temp keyValue?'); + error_log(empty($processed_data)); + //@TODO allow a force in case of corrupted key value? Partial output + // Extragenous weird data? + if (empty($processed_data) || + $data->force == TRUE || + (!isset($processed_data->checksum) || + empty($processed_data->checksum) || + $processed_data->checksum != $data->metadata['checksum'])) { // Extract file and save it in key_value collection. 
$io = new \stdClass(); $input = new \stdClass(); $input->filepath = $filelocation; - + $input->page_number = 1; + // The Node UUID + $input->nuuid = $data->nuuid; + // All the rest of the associated Metadata in an as:structure + $input->metadata = $data->metadata; $io->input = $input; $io->output = NULL; + //@TODO implement the TEST and BENCHMARK logic here + // RUN should return exit codes so we can know if something failed + // And totally discard indexing. $extracted_data = $processor_instance->run($io, StrawberryRunnersPostProcessorPluginInterface::PROCESS); error_log ('processing just run'); - error_log($io->ouput); error_log('writing to keyvalue'); error_log($key); - $this->keyValue->get($keyvalue_collection)->set($key, $io->output); - } - - // Get which indexes have our StrawberryfieldFlavorDatasource enabled! - $indexes = StrawberryfieldFlavorDatasource::getValidIndexes(); - - $item_ids = []; - if (is_a($entity, TranslatableInterface::class)) { - $translations = $entity->getTranslationLanguages(); - foreach ($translations as $translation_id => $translation) { - $item_ids[] = $entity->id() . ':'.'1' .':'.$translation_id.':'.$file->uuid().':'.$data->plugin_config_entity_id; + $toindex = new \stdClass(); + $toindex->fulltext = $io->output; + $toindex->checksum = $data->metadata['checksum']; + $this->keyValue->get($keyvalue_collection)->set($key, $toindex); + + // Get which indexes have our StrawberryfieldFlavorDatasource enabled! + $indexes = StrawberryfieldFlavorDatasource::getValidIndexes(); + + $item_ids = []; + if (is_a($entity, TranslatableInterface::class)) { + $translations = $entity->getTranslationLanguages(); + foreach ($translations as $translation_id => $translation) { + $item_ids[] = $entity->id() . 
':'.'1' .':'.$translation_id.':'.$file->uuid().':'.$data->plugin_config_entity_id; + } + } + error_log(var_export($item_ids,true)); + $datasource_id = 'strawberryfield_flavor_datasource'; + foreach ($indexes as $index) { + $index->trackItemsInserted($datasource_id, $item_ids); } - } - error_log(var_export($item_ids,true)); - $datasource_id = 'strawberryfield_flavor_datasource'; - foreach ($indexes as $index) { - $index->trackItemsUpdated($datasource_id, $item_ids); } } catch (\Exception $exception) { @@ -252,7 +269,7 @@ private function ensureFileAvailability(FileInterface $file) { // Check first if the file is already around in temp? // @TODO can be sure its the same one? Ideas? if (is_readable( - $this->fileSystem->realpath( + $this->fileSystem->realpath( 'temporary://sbr_' . $cache_key . '_' . basename($uri) ) )) { @@ -306,4 +323,4 @@ public function getRealpath($uri) { } } -} \ No newline at end of file +} From 5c6fa9f80a689d34a2e7d6b26f2187cca765380b Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 23 Nov 2020 00:19:43 -0500 Subject: [PATCH 06/24] First pass on OcrPostProcessor miniOCR code i tried to adapt is not working yet. This needs anther processor in chain that passes (invokes this as child) once per page. The idea is that each processor is Atomic. So this one only deals with a single page. Too tired to keep coding. Let's see if i figure out what is missing from the XML transformation. Probably something stupid on my side. I'm also having now with this trouble reading the value back. Its a string, but PHP on decode things its an object.. 
gosh --- .../IndexPostProcessorQueueWorker.php | 3 +- .../OcrPostProcessor.php | 350 ++++++++++++++++++ 2 files changed, 352 insertions(+), 1 deletion(-) create mode 100644 src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php diff --git a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php index ba18063..6d1c715 100644 --- a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php @@ -193,7 +193,7 @@ public function processItem($data) { error_log(empty($processed_data)); //@TODO allow a force in case of corrupted key value? Partial output // Extragenous weird data? - if (empty($processed_data) || + if (true || empty($processed_data) || $data->force == TRUE || (!isset($processed_data->checksum) || empty($processed_data->checksum) || @@ -219,6 +219,7 @@ public function processItem($data) { $toindex = new \stdClass(); $toindex->fulltext = $io->output; $toindex->checksum = $data->metadata['checksum']; + error_log(var_export($toindex,true)); $this->keyValue->get($keyvalue_collection)->set($key, $toindex); // Get which indexes have our StrawberryfieldFlavorDatasource enabled! diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php new file mode 100644 index 0000000..1430be0 --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -0,0 +1,350 @@ + 'asstructure', + 'mime_type' => ['application/pdf'], + 'path' => '', + 'path_tesseract' => '', + 'arguments' => '', + 'arguments_tesseract' => '', + 'output_type' => 'json', + 'output_destination' => 'subkey', + ] + parent::defaultConfiguration(); + } + + + public function calculateDependencies() { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + // @TODO: Implement calculateDependencies() method. 
+ } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + + $element['source_type'] = [ + '#type' => 'select', + '#title' => $this->t('The type of source data this processor works on'), + '#options' => [ + 'asstructure' => 'File entities referenced in the as:filetype JSON structure', + 'filepath' => 'Full file paths passed by another processor', + ], + '#default_value' => $this->getConfiguration()['source_type'], + '#description' => $this->t('Select from where the source file this processor needs is fetched'), + '#required' => TRUE + ]; + + $element['ado_type'] = [ + '#type' => 'textfield', + '#title' => $this->t('ADO type(s) to limit this processor to.'), + '#default_value' => $this->getConfiguration()['ado_type'], + '#description' => $this->t('A single ADO type or a coma delimited list of ado types that qualify to be Processed. Leave empty to apply to all ADOs.'), + ]; + + $element['jsonkey'] = [ + '#type' => 'checkboxes', + '#title' => $this->t('The JSON key that contains the desired source files.'), + '#options' => [ + 'as:image' => 'as:image', + 'as:document' => 'as:document', + 'as:audio' => 'as:audio', + 'as:video' => 'as:video', + 'as:text' => 'as:text', + 'as:application' => 'as:application', + ], + '#default_value' => (!empty($this->getConfiguration()['jsonkey']) && is_array($this->getConfiguration()['jsonkey'])) ? $this->getConfiguration()['jsonkey'] : [], + '#states' => [ + 'visible' => [ + ':input[name="pluginconfig[source_type]"]' => ['value' => 'asstructure'], + ], + ], + '#required' => TRUE, + ]; + + $element['mime_type'] = [ + '#type' => 'textfield', + '#title' => $this->t('Mimetypes(s) to limit this Processor to.'), + '#default_value' => $this->getConfiguration()['mime_type'], + '#description' => $this->t('A single Mimetype type or a coma separed list of mimetypes that qualify to be Processed. 
Leave empty to apply any file'), + ]; + $element['path'] = [ + '#type' => 'textfield', + '#title' => $this->t('The system path to the ghostscript (gs) binary that will be executed by this processor.'), + '#default_value' => $this->getConfiguration()['path'], + '#description' => t('A full system path to the gs binary present in the same environment your PHP runs, e.g /usr/bin/gs'), + '#required' => TRUE, + ]; + + $element['arguments'] = [ + '#type' => 'textfield', + '#title' => $this->t('Any additional argument your executable binary requires.'), + '#default_value' => !empty($this->getConfiguration()['arguments']) ? $this->getConfiguration()['arguments'] : '%file', + '#description' => t('Any arguments your binary requires to run. Use %file as replacement for the file if the executable requires the filename to be passed under a specific argument.'), + '#required' => TRUE, + ]; + + + $element['path_tesseract'] = [ + '#type' => 'textfield', + '#title' => $this->t('The system path to the Tesseract binary that will be executed by this processor.'), + '#default_value' => $this->getConfiguration()['path_tesseract'], + '#description' => t('A full system path to the Tesseract binary present in the same environment your PHP runs, e.g /usr/bin/tesseract'), + '#required' => TRUE, + ]; + + $element['arguments_tesseract'] = [ + '#type' => 'textfield', + '#title' => $this->t('Any additional argument for your tesseract binary.'), + '#default_value' => !empty($this->getConfiguration()['arguments_tesseract']) ? $this->getConfiguration()['arguments_tesseract'] : '%file', + '#description' => t('Any arguments your binary requires to run. 
Use %file as replacement for the file that is output but the GS binary.'), + '#required' => TRUE, + ]; + + $element['output_type'] = [ + '#type' => 'select', + '#title' => $this->t('The expected and desired output of this processor.'), + '#options' => [ + 'entity:file' => 'One or more Files', + 'json' => 'Data/Values that can be serialized to JSON', + ], + '#default_value' => $this->getConfiguration()['output_type'], + '#description' => $this->t('If the output is just data and "One or more Files" is selected all data will be dumped into a file and handled as such.'), + ]; + + $element['output_destination'] = [ + '#type' => 'checkboxes', + '#title' => $this->t("Where and how the output will be used."), + '#options' => [ + 'subkey' => 'In the same Source Metadata, as a child structure of each Processed file', + 'ownkey' => 'In the same Source Metadata but inside its own, top level, "as:flavour" subkey based on the given machine name of the current plugin', + 'plugin' => 'As Input for another processor Plugin', + ], + '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination']))? 
$this->getConfiguration()['output_destination']: [], + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#required' => TRUE, + ]; + + $element['timeout'] = [ + '#type' => 'number', + '#title' => $this->t('Timeout in seconds for this process.'), + '#default_value' => $this->getConfiguration()['timeout'], + '#description' => $this->t('If the process runs out of time it can still be processed again.'), + '#size' => 2, + '#maxlength' => 2, + '#min' => 1, + ]; + $element['weight'] = [ + '#type' => 'number', + '#title' => $this->t('Order or execution in the global chain.'), + '#default_value' => $this->getConfiguration()['weight'], + ]; + + return $element; + } + + + + public function onDependencyRemoval(array $dependencies) { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + return parent::onDependencyRemoval( + $dependencies + ); // TODO: Change the autogenerated stub + } + + /** + * Executes the logic of this plugin given a file path and a context. + * + * @param \stdClass $io + * $io->input needs to contain + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments + * $io->output will contain the result of the processor + * @param string $context + */ + public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPluginInterface::PROCESS) { + // Specific input key as defined in the annotation + // In this case it will contain an absolute Path to a File. + // Needed since this executes locally on the server via SHELL. + + $input_property = $this->pluginDefinition['input_property']; + $input_argument = $this->pluginDefinition['input_argument']; + $file_uuid = isset($io->input->metadata['dr:uuid']) ? 
$io->input->metadata['dr:uuid'] : NULL; + $node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL; + $config = $this->getConfiguration(); + $timeout = $config['timeout']; // in seconds + error_log('run OCR'); + + if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { + // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} + $page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; + $pageid = $node_uuid.'/canvas/'.$file_uuid.'/p'.$page_number; + setlocale(LC_CTYPE, 'en_US.UTF-8'); + $execstring = $this->buildExecutableCommand($io); + error_log($execstring); + if ($execstring) { + $backup_locale = setlocale(LC_CTYPE, '0'); + setlocale(LC_CTYPE, $backup_locale); + // Support UTF-8 commands. + // @see http://www.php.net/manual/en/function.shell-exec.php#85095 + shell_exec("LANG=en_US.utf-8"); + $output = $this->proc_execute($execstring, $timeout); + if (is_null($output)) { + throw new \Exception("Could not execute {$execstring} or timed out"); + } + + $miniocr = $this->hOCRtoMiniOCR($output, $pageid); + error_log($miniocr); + $io->output = $miniocr; + } + } else { + \throwException(new \InvalidArgumentException); + } + } + + /** + * Builds a clean Command string using a File path. + * + * @param \stdClass $io + * $io->input needs to contain + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments + * $io->output will contain the result of the processor + * + * @return null|string + */ + public function buildExecutableCommand(\stdClass $io) { + $input_property = $this->pluginDefinition['input_property']; + $input_argument = $this->pluginDefinition['input_argument']; + // Sets the default page to 1 if not passed. + $file_path = isset($io->input->{$input_property}) ? 
$io->input->{$input_property} : NULL; + $page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; + $config = $this->getConfiguration(); + $execpath_gs = $config['path']; + $arguments_gs = $config['arguments']; + $execpath_tesseract = $config['path_tesseract']; + $arguments_tesseract = $config['arguments_tesseract']; + + if (empty($file_path)) { + return NULL; + } + + // This run function executes a 2 step function + //-- with r300 == 300dpi, should be configurable, etc. All should be configurable + // First gs -dBATCH -dNOPAUSE -sDEVICE=pnggray -r300 -dUseCropBox -sOutputFile=somepage_pagenumber.png %file + + $command = ''; + $can_run_gs = \Drupal::service('strawberryfield.utility')->verifyCommand($execpath_gs); + $can_run_tesseract = \Drupal::service('strawberryfield.utility')->verifyCommand($execpath_tesseract); + $filename = pathinfo($file_path, PATHINFO_FILENAME); + $sourcefolder = pathinfo($file_path,PATHINFO_DIRNAME); + $sourcefolder = strlen($sourcefolder)> 0 ? $sourcefolder.'/' : sys_get_temp_dir().'/'; + $gs_destination_filename = "{$sourcefolder}{$filename}_{$page_number}.png"; + if ($can_run_gs && + $can_run_tesseract && + (strpos($arguments_gs, '%file' ) !== FALSE) && + (strpos($arguments_tesseract, '%file' ) !== FALSE)) { + $arguments_gs = "-dBATCH -dNOPAUSE -r300 -dUseCropBox -dQUIET -sDEVICE=pnggray -dFirstPage={$page_number} -dLastPage={$page_number} -sOutputFile=$gs_destination_filename " . $arguments_gs; + $arguments_gs = str_replace('%s','', $arguments_gs); + $arguments_gs = str_replace_first('%file','%s', $arguments_gs); + $arguments_gs = sprintf($arguments_gs, $file_path); + + $arguments_tesseract = str_replace('%s','', $arguments_tesseract); + $arguments_tesseract = str_replace_first('%file','%s', $arguments_tesseract); + $arguments_tesseract = sprintf($arguments_tesseract, $gs_destination_filename); + + $command_gs = escapeshellcmd($execpath_gs.' 
'.$arguments_gs); + $command_tesseract = escapeshellcmd($execpath_tesseract.' '.$arguments_tesseract); + + $command = $command_gs.' && '.$command_tesseract; + + } else { + error_log("missing arguments for OCR"); + } + // Only return $command if it contains the original filepath somewhere + if (strpos($command, $file_path) !== false) { return $command;} + return ''; + + } + + protected function hOCRtoMiniOCR($output, $pageid) { + error_log($output); + $hocr = simplexml_load_string($output); + $internalErrors = libxml_use_internal_errors(TRUE); + libxml_clear_errors(); + libxml_use_internal_errors($internalErrors); + if (!$hocr) { + error_log('Could not convert HOCR to MiniOCR, sources is not valid XML'); + return NULL; + } + $w = new \XMLWriter(); + $w->openMemory(); + $w->startDocument('1.0','UTF-8'); + $w->startElement("ocr"); + foreach ($hocr->body->children() as $page) { + $coos = explode(" ", substr($page['title'], 5)); + if (count($coos)) { + $w->startElement("p"); + $w->writeAttribute("id", $pageid); + $w->writeAttribute("wh", $coos[2] . " " . $coos[3]); + $w->startElement("b"); + foreach ($page->children() as $line) { + $w->startElement("l"); + foreach ($line->children() as $word) { + $wcoos = explode(" ", $word['title']); + if (count($wcoos)) { + $w->startElement("w"); + $w->writeAttribute("x", $wcoos[1] . ' ' . $wcoos[2] . ' ' . $wcoos[3] . ' ' . $wcoos[4]); + error_log($word->__toString()); + $w->text($word->__toString()); + $w->endElement(); + } + } + $w->endElement(); + } + $w->endElement(); + $w->endElement(); + } + } + $w->endElement(); + $w->endDocument(); + unset($hocr); + return $w->outputMemory(true); + } + + + + +} From 391101bf29f89f4ac356256245773e1c25c55543 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 23 Nov 2020 11:23:38 -0500 Subject: [PATCH 07/24] OK. 
I got XPath working. It was a namespace (ns) issue. @giancarlobi ready to test --- .../OcrPostProcessor.php | 162 ++++++++++-------- 1 file changed, 91 insertions(+), 71 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php index 1430be0..462f06f 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -34,15 +34,15 @@ class OcrPostProcessor extends SystemBinaryPostProcessor { */ public function defaultConfiguration() { return [ - 'source_type' => 'asstructure', - 'mime_type' => ['application/pdf'], - 'path' => '', - 'path_tesseract' => '', - 'arguments' => '', - 'arguments_tesseract' => '', - 'output_type' => 'json', - 'output_destination' => 'subkey', - ] + parent::defaultConfiguration(); + 'source_type' => 'asstructure', + 'mime_type' => ['application/pdf'], + 'path' => '', + 'path_tesseract' => '', + 'arguments' => '', + 'arguments_tesseract' => '', + 'output_type' => 'json', + 'output_destination' => 'subkey', + ] + parent::defaultConfiguration(); } @@ -82,14 +82,14 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'as:audio' => 'as:audio', 'as:video' => 'as:video', 'as:text' => 'as:text', - 'as:application' => 'as:application', + 'as:application' => 'as:application', ], '#default_value' => (!empty($this->getConfiguration()['jsonkey']) && is_array($this->getConfiguration()['jsonkey'])) ? 
$this->getConfiguration()['jsonkey'] : [], '#states' => [ 'visible' => [ ':input[name="pluginconfig[source_type]"]' => ['value' => 'asstructure'], - ], ], + ], '#required' => TRUE, ]; @@ -108,11 +108,11 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { ]; $element['arguments'] = [ - '#type' => 'textfield', - '#title' => $this->t('Any additional argument your executable binary requires.'), - '#default_value' => !empty($this->getConfiguration()['arguments']) ? $this->getConfiguration()['arguments'] : '%file', - '#description' => t('Any arguments your binary requires to run. Use %file as replacement for the file if the executable requires the filename to be passed under a specific argument.'), - '#required' => TRUE, + '#type' => 'textfield', + '#title' => $this->t('Any additional argument your executable binary requires.'), + '#default_value' => !empty($this->getConfiguration()['arguments']) ? $this->getConfiguration()['arguments'] : '%file', + '#description' => t('Any arguments your binary requires to run. Use %file as replacement for the file if the executable requires the filename to be passed under a specific argument.'), + '#required' => TRUE, ]; @@ -151,20 +151,20 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'ownkey' => 'In the same Source Metadata but inside its own, top level, "as:flavour" subkey based on the given machine name of the current plugin', 'plugin' => 'As Input for another processor Plugin', ], - '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination']))? $this->getConfiguration()['output_destination']: [], + '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? 
$this->getConfiguration()['output_destination'] : [], '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), '#required' => TRUE, ]; - $element['timeout'] = [ - '#type' => 'number', - '#title' => $this->t('Timeout in seconds for this process.'), - '#default_value' => $this->getConfiguration()['timeout'], - '#description' => $this->t('If the process runs out of time it can still be processed again.'), - '#size' => 2, - '#maxlength' => 2, - '#min' => 1, - ]; + $element['timeout'] = [ + '#type' => 'number', + '#title' => $this->t('Timeout in seconds for this process.'), + '#default_value' => $this->getConfiguration()['timeout'], + '#description' => $this->t('If the process runs out of time it can still be processed again.'), + '#size' => 2, + '#maxlength' => 2, + '#min' => 1, + ]; $element['weight'] = [ '#type' => 'number', '#title' => $this->t('Order or execution in the global chain.'), @@ -175,7 +175,6 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { } - public function onDependencyRemoval(array $dependencies) { // Since Processors could be chained we need to check if any other // processor instance is using an instance of this one @@ -199,8 +198,8 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug // In this case it will contain an absolute Path to a File. // Needed since this executes locally on the server via SHELL. - $input_property = $this->pluginDefinition['input_property']; - $input_argument = $this->pluginDefinition['input_argument']; + $input_property = $this->pluginDefinition['input_property']; + $input_argument = $this->pluginDefinition['input_argument']; $file_uuid = isset($io->input->metadata['dr:uuid']) ? $io->input->metadata['dr:uuid'] : NULL; $node_uuid = isset($io->input->nuuid) ? 
$io->input->nuuid : NULL; $config = $this->getConfiguration(); @@ -210,7 +209,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} $page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; - $pageid = $node_uuid.'/canvas/'.$file_uuid.'/p'.$page_number; + $pageid = $node_uuid . '/canvas/' . $file_uuid . '/p' . $page_number; setlocale(LC_CTYPE, 'en_US.UTF-8'); $execstring = $this->buildExecutableCommand($io); error_log($execstring); @@ -227,9 +226,10 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $miniocr = $this->hOCRtoMiniOCR($output, $pageid); error_log($miniocr); - $io->output = $miniocr; + $io->output = $miniocr; } - } else { + } + else { \throwException(new \InvalidArgumentException); } } @@ -246,8 +246,8 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug * @return null|string */ public function buildExecutableCommand(\stdClass $io) { - $input_property = $this->pluginDefinition['input_property']; - $input_argument = $this->pluginDefinition['input_argument']; + $input_property = $this->pluginDefinition['input_property']; + $input_argument = $this->pluginDefinition['input_argument']; // Sets the default page to 1 if not passed. $file_path = isset($io->input->{$input_property}) ? $io->input->{$input_property} : NULL; $page_number = isset($io->input->{$input_argument}) ? 
(int) $io->input->{$input_argument} : 1; @@ -266,35 +266,40 @@ public function buildExecutableCommand(\stdClass $io) { // First gs -dBATCH -dNOPAUSE -sDEVICE=pnggray -r300 -dUseCropBox -sOutputFile=somepage_pagenumber.png %file $command = ''; - $can_run_gs = \Drupal::service('strawberryfield.utility')->verifyCommand($execpath_gs); - $can_run_tesseract = \Drupal::service('strawberryfield.utility')->verifyCommand($execpath_tesseract); + $can_run_gs = \Drupal::service('strawberryfield.utility') + ->verifyCommand($execpath_gs); + $can_run_tesseract = \Drupal::service('strawberryfield.utility') + ->verifyCommand($execpath_tesseract); $filename = pathinfo($file_path, PATHINFO_FILENAME); - $sourcefolder = pathinfo($file_path,PATHINFO_DIRNAME); - $sourcefolder = strlen($sourcefolder)> 0 ? $sourcefolder.'/' : sys_get_temp_dir().'/'; + $sourcefolder = pathinfo($file_path, PATHINFO_DIRNAME); + $sourcefolder = strlen($sourcefolder) > 0 ? $sourcefolder . '/' : sys_get_temp_dir() . '/'; $gs_destination_filename = "{$sourcefolder}{$filename}_{$page_number}.png"; if ($can_run_gs && $can_run_tesseract && - (strpos($arguments_gs, '%file' ) !== FALSE) && - (strpos($arguments_tesseract, '%file' ) !== FALSE)) { + (strpos($arguments_gs, '%file') !== FALSE) && + (strpos($arguments_tesseract, '%file') !== FALSE)) { $arguments_gs = "-dBATCH -dNOPAUSE -r300 -dUseCropBox -dQUIET -sDEVICE=pnggray -dFirstPage={$page_number} -dLastPage={$page_number} -sOutputFile=$gs_destination_filename " . 
$arguments_gs; - $arguments_gs = str_replace('%s','', $arguments_gs); - $arguments_gs = str_replace_first('%file','%s', $arguments_gs); + $arguments_gs = str_replace('%s', '', $arguments_gs); + $arguments_gs = str_replace_first('%file', '%s', $arguments_gs); $arguments_gs = sprintf($arguments_gs, $file_path); - $arguments_tesseract = str_replace('%s','', $arguments_tesseract); - $arguments_tesseract = str_replace_first('%file','%s', $arguments_tesseract); + $arguments_tesseract = str_replace('%s', '', $arguments_tesseract); + $arguments_tesseract = str_replace_first('%file', '%s', $arguments_tesseract); $arguments_tesseract = sprintf($arguments_tesseract, $gs_destination_filename); - $command_gs = escapeshellcmd($execpath_gs.' '.$arguments_gs); - $command_tesseract = escapeshellcmd($execpath_tesseract.' '.$arguments_tesseract); + $command_gs = escapeshellcmd($execpath_gs . ' ' . $arguments_gs); + $command_tesseract = escapeshellcmd($execpath_tesseract . ' ' . $arguments_tesseract); - $command = $command_gs.' && '.$command_tesseract; + $command = $command_gs . ' && ' . $command_tesseract; - } else { + } + else { error_log("missing arguments for OCR"); } // Only return $command if it contains the original filepath somewhere - if (strpos($command, $file_path) !== false) { return $command;} + if (strpos($command, $file_path) !== FALSE) { + return $command; + } return ''; } @@ -309,42 +314,57 @@ protected function hOCRtoMiniOCR($output, $pageid) { error_log('Could not convert HOCR to MiniOCR, sources is not valid XML'); return NULL; } - $w = new \XMLWriter(); - $w->openMemory(); - $w->startDocument('1.0','UTF-8'); - $w->startElement("ocr"); + $miniocr = new \XMLWriter(); + $miniocr->openMemory(); + $miniocr->startDocument('1.0', 'UTF-8'); + $miniocr->startElement("ocr"); foreach ($hocr->body->children() as $page) { $coos = explode(" ", substr($page['title'], 5)); + // To avoid divisions by 0 + $pwidth = (float) $coos[2] ? 
(float) $coos[2] : 1; + $pheight = (float) $coos[3] ? (float) $coos[3] : 1; if (count($coos)) { - $w->startElement("p"); - $w->writeAttribute("id", $pageid); - $w->writeAttribute("wh", $coos[2] . " " . $coos[3]); - $w->startElement("b"); - foreach ($page->children() as $line) { - $w->startElement("l"); + $miniocr->startElement("p"); + $miniocr->writeAttribute("xml:id", $pageid); + $miniocr->writeAttribute("wh", $pwidth . " " . $pheight); + $miniocr->startElement("b"); + $page->registerXPathNamespace('ns', 'http://www.w3.org/1999/xhtml'); + foreach ($page->xpath('.//ns:span[@class="ocr_line"]') as $line) { + $miniocr->startElement("l"); foreach ($line->children() as $word) { $wcoos = explode(" ", $word['title']); if (count($wcoos)) { - $w->startElement("w"); - $w->writeAttribute("x", $wcoos[1] . ' ' . $wcoos[2] . ' ' . $wcoos[3] . ' ' . $wcoos[4]); - error_log($word->__toString()); - $w->text($word->__toString()); - $w->endElement(); + $x0 = (float) $wcoos[1]; + $y0 = (float) $wcoos[2]; + $x1 = (float) $wcoos[3]; + $y1 = (float) $wcoos[4]; + $l = round(($x0 / $pwidth), 3); + $t = round(($y0 / $pheight), 3); + $w = round((($x1 - $x0) / $pwidth), 3); + $h = round((($y1 - $y0) / $pheight), 3); + $text = (string) $word; + $miniocr->startElement("w"); + $miniocr->writeAttribute("x", $l . ' ' . $t . ' ' . $w . ' ' . 
$h); + + $miniocr->text($text); + $miniocr->endElement(); } } - $w->endElement(); + $miniocr->endElement(); } - $w->endElement(); - $w->endElement(); + $miniocr->endElement(); + $miniocr->endElement(); } } - $w->endElement(); - $w->endDocument(); + $miniocr->endElement(); + $miniocr->endDocument(); unset($hocr); - return $w->outputMemory(true); + + return $miniocr->outputMemory(TRUE); } +} + -} From 224c4403720c4b44c5aae80a9b33c862567e26c7 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 23 Nov 2020 11:25:52 -0500 Subject: [PATCH 08/24] Route Fix for D9 --- strawberry_runners.routing.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/strawberry_runners.routing.yml b/strawberry_runners.routing.yml index 3c0d81c..38af32f 100644 --- a/strawberry_runners.routing.yml +++ b/strawberry_runners.routing.yml @@ -23,8 +23,7 @@ strawberry_runners.ado_tools: requirements: _entity_access: 'node.update' options: - parameters: - node: - type: 'entity:node' - options: - _node_operation_route: TRUE \ No newline at end of file + _node_operation_route: TRUE + parameters: + node: + type: 'entity:node' From 617b64a14ac04f4e73eb00eb5c775f52bf2bd1b4 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 23 Nov 2020 12:49:36 -0500 Subject: [PATCH 09/24] Address page id question from giancarlo Move from // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} to pagenumber --- src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php index 462f06f..ec6443f 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -210,6 +210,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug // To be used by 
miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} $page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; $pageid = $node_uuid . '/canvas/' . $file_uuid . '/p' . $page_number; + // $pageid not used for now, let's go with the page_number only setlocale(LC_CTYPE, 'en_US.UTF-8'); $execstring = $this->buildExecutableCommand($io); error_log($execstring); @@ -224,7 +225,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug throw new \Exception("Could not execute {$execstring} or timed out"); } - $miniocr = $this->hOCRtoMiniOCR($output, $pageid); + $miniocr = $this->hOCRtoMiniOCR($output, $page_number); error_log($miniocr); $io->output = $miniocr; } From f1073bd7cea7306d85386e4ae2f4247703c151c8 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 24 Nov 2020 00:28:57 -0500 Subject: [PATCH 10/24] Add search_api_solr to composer dependency --- composer.json | 1 + 1 file changed, 1 insertion(+) diff --git a/composer.json b/composer.json index a7e5c93..b9a9b0a 100644 --- a/composer.json +++ b/composer.json @@ -20,6 +20,7 @@ ], "require": { "ml/json-ld": "^1.0", + "drupal/search_api_solr": "~4.1", "mtdowling/jmespath.php": "^2.4", "strawberryfield/strawberryfield": "dev-1.0.0-RC1", "react/event-loop": "^1.1", From 0442676d6f8691ec280eb5dd05d95e45ab5d552a Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 24 Nov 2020 00:29:28 -0500 Subject: [PATCH 11/24] Correctly process page ratio and parse things out for miniCOR removed all data too --- .../OcrPostProcessor.php | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php index ec6443f..7e2ad6d 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -209,8 +209,7 @@ public 
function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} $page_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; - $pageid = $node_uuid . '/canvas/' . $file_uuid . '/p' . $page_number; - // $pageid not used for now, let's go with the page_number only + setlocale(LC_CTYPE, 'en_US.UTF-8'); $execstring = $this->buildExecutableCommand($io); error_log($execstring); @@ -306,7 +305,6 @@ public function buildExecutableCommand(\stdClass $io) { } protected function hOCRtoMiniOCR($output, $pageid) { - error_log($output); $hocr = simplexml_load_string($output); $internalErrors = libxml_use_internal_errors(TRUE); libxml_clear_errors(); @@ -320,7 +318,20 @@ protected function hOCRtoMiniOCR($output, $pageid) { $miniocr->startDocument('1.0', 'UTF-8'); $miniocr->startElement("ocr"); foreach ($hocr->body->children() as $page) { - $coos = explode(" ", substr($page['title'], 5)); + $titleparts = explode(';', $page['title']); + $pagetitle = NULL; + foreach ($titleparts as $titlepart) { + $titlepart = trim($titlepart); + if (strpos($titlepart, 'bbox') === 0 ) { + $pagetitle = substr($titlepart, 5); + } + } + if ($pagetitle == NULL) { + $miniocr->flush(); + error_log('Could not convert HOCR to MiniOCR, no valid page dimensions found'); + return NULL; + } + $coos = explode(" ", $pagetitle); // To avoid divisions by 0 $pwidth = (float) $coos[2] ? (float) $coos[2] : 1; $pheight = (float) $coos[3] ? (float) $coos[3] : 1; @@ -346,7 +357,6 @@ protected function hOCRtoMiniOCR($output, $pageid) { $text = (string) $word; $miniocr->startElement("w"); $miniocr->writeAttribute("x", $l . ' ' . $t . ' ' . $w . ' ' . 
$h); - $miniocr->text($text); $miniocr->endElement(); } @@ -360,7 +370,6 @@ protected function hOCRtoMiniOCR($output, $pageid) { $miniocr->endElement(); $miniocr->endDocument(); unset($hocr); - return $miniocr->outputMemory(TRUE); } From d201c755d12f1698938bebca31eb91594813427f Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 24 Nov 2020 00:38:44 -0500 Subject: [PATCH 12/24] Checks if Checksum + search_api_id are already in Solr. If so, skips processing. @giancarlobi to test this you need to reindex and make sure you have a checksum field for your strawberryfield_flavor_datasource data source in Solr. Then run this. It should fill the first time. If you save the node again, the second time (and every time after) you should get a message saying it's already in the index. Now go to raw edit and change one letter in your checksum. Run again. It should process again. //@TODO clear old solr documents. Checksum is great. Also on reindex read from Node stored HOCR, etc. This is working well! --- .../IndexPostProcessorQueueWorker.php | 148 +++++++++++++++--- 1 file changed, 125 insertions(+), 23 deletions(-) diff --git a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php index 6d1c715..28c472a 100644 --- a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php @@ -14,19 +14,18 @@ use Drupal\Core\KeyValueStore\KeyValueFactoryInterface; use Drupal\Core\Plugin\ContainerFactoryPluginInterface; use Drupal\Core\Queue\QueueWorkerBase; -use Drupal\Component\Serialization\Json; use Drupal\Core\StreamWrapper\StreamWrapperInterface; use Drupal\Core\StreamWrapper\StreamWrapperManagerInterface; use Drupal\file\FileInterface; -use Drupal\search_api\Plugin\search_api\datasource\ContentEntity; -use Drupal\search_api_attachments\Plugin\search_api\processor\FilesExtractor; -use Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor; +use 
Drupal\search_api\Query\QueryInterface; use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface; use Psr\Log\LoggerInterface; use Psr\Log\LogLevel; use Symfony\Component\DependencyInjection\ContainerInterface; use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginManager; use Drupal\strawberryfield\Plugin\search_api\datasource\StrawberryfieldFlavorDatasource; +use Drupal\search_api\ParseMode\ParseModePluginManager; + /** * Process the JSON payload provided by the webhook. @@ -34,7 +33,7 @@ * @QueueWorker( * id = "strawberryrunners_process_index", * title = @Translation("Strawberry Runners Process to Index Queue Worker"), - * cron = {"time" = 5} + * cron = {"time" = 180} * ) */ class IndexPostProcessorQueueWorker extends QueueWorkerBase implements ContainerFactoryPluginInterface { @@ -79,6 +78,12 @@ class IndexPostProcessorQueueWorker extends QueueWorkerBase implements Container */ protected $logger; + /** + * The parse mode manager. + * + * @var \Drupal\search_api\ParseMode\ParseModePluginManager + */ + protected $parseModeManager; /** * Constructor. 
@@ -89,7 +94,7 @@ class IndexPostProcessorQueueWorker extends QueueWorkerBase implements Container * @param \Drupal\Core\Entity\EntityTypeManager $entity_field_manager * @param \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginManager $strawberry_runner_processor_plugin_manager */ - public function __construct(array $configuration, $plugin_id, $plugin_definition, EntityTypeManagerInterface $entity_type_manager, StrawberryRunnersPostProcessorPluginManager $strawberry_runner_processor_plugin_manager, FileSystemInterface $file_system, StreamWrapperManagerInterface $stream_wrapper_manager, KeyValueFactoryInterface $key_value, LoggerInterface $logger) { + public function __construct(array $configuration, $plugin_id, $plugin_definition, EntityTypeManagerInterface $entity_type_manager, StrawberryRunnersPostProcessorPluginManager $strawberry_runner_processor_plugin_manager, FileSystemInterface $file_system, StreamWrapperManagerInterface $stream_wrapper_manager, KeyValueFactoryInterface $key_value, LoggerInterface $logger, ParseModePluginManager $parse_mode_manager) { parent::__construct($configuration, $plugin_id, $plugin_definition); $this->entityTypeManager = $entity_type_manager; $this->strawberryRunnerProcessorPluginManager = $strawberry_runner_processor_plugin_manager; @@ -97,6 +102,7 @@ public function __construct(array $configuration, $plugin_id, $plugin_definition $this->streamWrapperManager = $stream_wrapper_manager; $this->keyValue = $key_value; $this->logger = $logger; + $this->parseModeManager = $parse_mode_manager; } /** @@ -119,7 +125,8 @@ public static function create(ContainerInterface $container, array $configuratio $container->get('file_system'), $container->get('stream_wrapper_manager'), $container->get('keyvalue'), - $container->get('logger.channel.strawberry_runners') + $container->get('logger.channel.strawberry_runners'), + $container->get('plugin.manager.search_api.parse_mode') ); } @@ -176,9 +183,14 @@ public function 
processItem($data) { } try { + // Get which indexes have our StrawberryfieldFlavorDatasource enabled! + $indexes = StrawberryfieldFlavorDatasource::getValidIndexes(); + $keyvalue_collection = 'Strawberryfield_flavor_datasource_temp'; $key = $keyvalue_collection . ':' . $file->uuid().':'.$data->plugin_config_entity_id; + + //We only deal with NODES. $entity = $this->entityTypeManager->getStorage('node') ->load($data->nid); @@ -186,14 +198,28 @@ public function processItem($data) { if(!$entity) { return; } + $item_ids = []; + $inindex = 1; + if (is_a($entity, TranslatableInterface::class)) { + $translations = $entity->getTranslationLanguages(); + foreach ($translations as $translation_id => $translation) { + //@TODO here, the number 1 needs to come from the sequence. + $item_id = $entity->id() . ':'.'1' .':'.$translation_id.':'.$file->uuid().':'.$data->plugin_config_entity_id; + // a single 0 as return will force us to reindex. + $inindex = $inindex * $this->flavorInSolrIndex($item_id, $data->metadata['checksum'], $indexes); + $item_ids[] = $item_id; + } + } + // Check if we already have this entry in Solr + if ($inindex !== 0) { + error_log('Already in search index, skipping'); + } // Skip file if element is found in key_value collection. $processed_data = $this->keyValue->get($keyvalue_collection)->get($key); - error_log('Is this already in our temp keyValue?'); - error_log(empty($processed_data)); //@TODO allow a force in case of corrupted key value? Partial output // Extragenous weird data? 
- if (true || empty($processed_data) || + if ($inindex === 0 || empty($processed_data) || $data->force == TRUE || (!isset($processed_data->checksum) || empty($processed_data->checksum) || @@ -219,19 +245,8 @@ public function processItem($data) { $toindex = new \stdClass(); $toindex->fulltext = $io->output; $toindex->checksum = $data->metadata['checksum']; - error_log(var_export($toindex,true)); - $this->keyValue->get($keyvalue_collection)->set($key, $toindex); - - // Get which indexes have our StrawberryfieldFlavorDatasource enabled! - $indexes = StrawberryfieldFlavorDatasource::getValidIndexes(); - $item_ids = []; - if (is_a($entity, TranslatableInterface::class)) { - $translations = $entity->getTranslationLanguages(); - foreach ($translations as $translation_id => $translation) { - $item_ids[] = $entity->id() . ':'.'1' .':'.$translation_id.':'.$file->uuid().':'.$data->plugin_config_entity_id; - } - } + $this->keyValue->get($keyvalue_collection)->set($key, $toindex); error_log(var_export($item_ids,true)); $datasource_id = 'strawberryfield_flavor_datasource'; foreach ($indexes as $index) { @@ -240,6 +255,15 @@ public function processItem($data) { } } catch (\Exception $exception) { + $message_params = [ + '@file_id' => $data->fid, + '@entity_id' => $data->nid, + '@message' => $exception->getMessage(), + ]; + if (!isset($data->extract_attempts)) { + $data->extract_attempts = 0; + $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing failed with message: @message File id @file_id at Node @entity_id.', $message_params); + } if ($data->extract_attempts < 3) { $data->extract_attempts++; \Drupal::queue('strawberryrunners_process_index')->createItem($data); @@ -249,7 +273,7 @@ public function processItem($data) { '@file_id' => $data->fid, '@entity_id' => $data->nid, ]; - $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing failed after 3 attempts @file_id for @entity_type @entity_id.', $message_params); + $this->logger->log(LogLevel::ERROR, 
'Strawberry Runners Processing failed after 3 attempts File Id @file_id at Node @entity_id.', $message_params); } } } @@ -324,4 +348,82 @@ public function getRealpath($uri) { } } + /** + * Checks Search API indexes for an Document ID and Checksum Match + * + * @param string $key + * @param string $checksum + * @param array $indexes + * + * @throws \Drupal\Component\Plugin\Exception\PluginException + * @throws \Drupal\search_api\SearchApiException + */ + public function flavorInSolrIndex(string $key, string $checksum, array $indexes): int { + /* @var \Drupal\search_api\IndexInterface[] $indexes */ + + $count = 0; + foreach ($indexes as $search_api_index) { + + // Create the query. + $query = $search_api_index->query([ + 'limit' => 1, + 'offset' => 0, + ]); + + /*$query->setFulltextFields([ + 'title', + 'body', + 'filename', + 'saa_field_file_document', + 'saa_field_file_news', + 'saa_field_file_page' + ]);*/ + //$parse_mode = $this->parseModeManager->createInstance('direct'); + $parse_mode = $this->parseModeManager->createInstance('terms'); + $query->setParseMode($parse_mode); + // $parse_mode->setConjunction('OR'); + // $query->keys($search); + $query->sort('search_api_relevance', 'DESC'); + + $query->addCondition('search_api_id', 'strawberryfield_flavor_datasource/'.$key) + ->addCondition('search_api_datasource', 'strawberryfield_flavor_datasource') + ->addCondition('checksum', $checksum); + //$query = $query->addCondition('ss_checksum', $checksum); + // If we allow processing here Drupal adds Content Access Check + // That does not match our Data Source \Drupal\search_api\Plugin\search_api\processor\ContentAccess + // we get this filter (see 2nd) + /* + * array ( + 0 => 'ss_search_api_id:"strawberryfield_flavor_datasource/2006:1:en:3dccdb09-f79f-478e-81c5-0bb680c3984e:ocr"', + 1 => 'ss_search_api_datasource:"strawberryfield_flavor_datasource"', + 2 => '{!tag=content_access,content_access_enabled,content_access_grants}(ss_search_api_datasource:"entity:file" 
(+(bs_status:"true" bs_status_2:"true") +(sm_node_grants:"node_access_all:0" sm_node_grants:"node_access__all")))', + 3 => '+index_id:default_solr_index +hash:1evb7z', + 4 => 'ss_search_api_language:("en" "und" "zxx")', + ), + */ + // Another solution would be to make our conditions all together an OR + // But no post processing here is also good, faster and we just want + // to know if its there or not. + $query->setProcessingLevel(QueryInterface::PROCESSING_NONE); + $results = $query->execute(); + + // $solr_response = $results->getExtraData('search_api_solr_response'); + // In case of more than one Index with the same Data Source we accumulate + $count = $count + (int) $results->getResultCount(); + + } + // This is a good one. If i have multiple indexes, but one is missing the i assume + // reprocessing is needed + // But if not, then i return 1, which means we have them all + // FUTURE thinking is the best. + $return = ($count == count($indexes)) ? 1 : 0; + return $return; + // Keys we need in the Search API + // - ss_search_api_id == $key + // A checksum field == Should be configurable? + // Let's start by naming it checksum? If not present we may trigger some Logger/alert? + // Or maybe we can use D8/D9 Status mechanic to let the user know this module + // needs it in the data flavor. + } + } From 2726210523077b4e49e837d15bf3b735959fe281 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 24 Nov 2020 09:14:39 -0500 Subject: [PATCH 13/24] Remove leading 0s from miniOCR dimensions @giancarlobi thanks! 
addressing the code review --- .../StrawberryRunnersPostProcessor/OcrPostProcessor.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php index 7e2ad6d..1c80798 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -335,10 +335,11 @@ protected function hOCRtoMiniOCR($output, $pageid) { // To avoid divisions by 0 $pwidth = (float) $coos[2] ? (float) $coos[2] : 1; $pheight = (float) $coos[3] ? (float) $coos[3] : 1; + // NOTE: floats are in the form of .1 so we need to remove the first 0. if (count($coos)) { $miniocr->startElement("p"); $miniocr->writeAttribute("xml:id", $pageid); - $miniocr->writeAttribute("wh", $pwidth . " " . $pheight); + $miniocr->writeAttribute("wh", ltrim($pwidth, 0) . " " . ltrim($pheight, 0)); $miniocr->startElement("b"); $page->registerXPathNamespace('ns', 'http://www.w3.org/1999/xhtml'); foreach ($page->xpath('.//ns:span[@class="ocr_line"]') as $line) { @@ -356,7 +357,7 @@ protected function hOCRtoMiniOCR($output, $pageid) { $h = round((($y1 - $y0) / $pheight), 3); $text = (string) $word; $miniocr->startElement("w"); - $miniocr->writeAttribute("x", $l . ' ' . $t . ' ' . $w . ' ' . $h); + $miniocr->writeAttribute("x", ltrim($l, '0') . ' ' . ltrim($t, 0) . ' ' . ltrim($w, 0) . ' ' . ltrim($h, 0)); $miniocr->text($text); $miniocr->endElement(); } From eaf0a4730b7d204e26c72221d00022008df4cc2d Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 1 Dec 2020 11:47:43 -0500 Subject: [PATCH 14/24] Chained processors working @giancarlobi there is a lot of not needed code i can optimize here but it works. Will share my config! 
--- .gitignore | 1 + .../StrawberryRunnersPostProcessor.php | 7 +- .../IndexPostProcessorQueueWorker.php | 287 +++++++++++++----- .../JsonFileSequencePostProcessor.php | 162 ++++++++++ .../OcrPostProcessor.php | 15 +- .../SystemBinaryPostProcessor.php | 12 +- 6 files changed, 387 insertions(+), 97 deletions(-) create mode 100644 src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php diff --git a/.gitignore b/.gitignore index 5a8259d..8feb242 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ .idea/modules.xml .idea/misc.xml .idea/codeStyles/codeStyleConfig.xml +src/.DS_Store diff --git a/src/Annotation/StrawberryRunnersPostProcessor.php b/src/Annotation/StrawberryRunnersPostProcessor.php index 3b5a3bf..7a7b119 100644 --- a/src/Annotation/StrawberryRunnersPostProcessor.php +++ b/src/Annotation/StrawberryRunnersPostProcessor.php @@ -21,8 +21,7 @@ class StrawberryRunnersPostProcessor extends Plugin { const PRESAVE = 'preSave'; - const INDEX = 'search_api'; - + const POSTSAVE = 'postSave'; /** * The plugin id. 
@@ -64,11 +63,11 @@ class StrawberryRunnersPostProcessor extends Plugin { public $input_arguments; /** - * Processing stage: can be Entity PreSave or Index time search_api + * Processing stage: can be Entity PreSave or PostSave * * @var string $when; * */ - public $when = StrawberryRunnersPostProcessor::PRESAVE; + public $when = StrawberryRunnersPostProcessor::POSTSAVE; } diff --git a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php index 28c472a..0b9a6e0 100644 --- a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php @@ -159,18 +159,52 @@ protected function getProcessorPlugin($plugin_config_entity_id) { } + /** + * Gets all Children of the currently being processed Processor Plugin + * + * @param string $current_id + * + * @return array + */ + private function getChildProcessorIds(string $plugin_config_entity_id):array { + /* @var $plugin_config_entities \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity[] */ + $plugin_config_entities = $this->entityTypeManager->getListBuilder('strawberry_runners_postprocessor') + ->load(); + $active_plugins = []; + // This kids should be cached; + // We basically want here what type of processor this is and its input_argument and input_options + $plugin_definitions = $this->strawberryRunnerProcessorPluginManager->getDefinitions(); + + error_log('getting child processors'); + foreach ($plugin_config_entities as $plugin_config_entity) { + // Only get first level (no Parents) and Active ones. 
+ if ($plugin_config_entity->isActive() && $plugin_config_entity->getParent() == $plugin_config_entity_id) { + $active_plugins[] = [ + 'config_entity' => $plugin_config_entity, + 'plugin_definition' => $plugin_definitions[$plugin_config_entity->getPluginid()] + ]; + } + } + return $active_plugins; + } + /** * {@inheritdoc} */ public function processItem($data) { $processor_instance = $this->getProcessorPlugin($data->plugin_config_entity_id); + // Read the Input Properties and Input Arguments of the current Processor + + // Now check if there are any child? + error_log('config for this processor'); + $processor_config = $processor_instance->getConfiguration(); if (!isset($data->fid) || $data->fid == NULL || !isset($data->nid) || $data->nid == NULL || !is_array($data->metadata)) { return; } $file = $this->entityTypeManager->getStorage('file')->load($data->fid); - + // 0 byte files have checksum, check what it is! if ($file === NULL || !isset($data->metadata['checksum'])) { error_log('Sorry the file does not exist or has no checksum yet. We really need the checksum'); return; @@ -181,101 +215,157 @@ public function processItem($data) { if ($filelocation === NULL) { return; } + // Means we could pass also a file directly anytime + $data->filelocation = $filelocation; - try { - // Get which indexes have our StrawberryfieldFlavorDatasource enabled! - $indexes = StrawberryfieldFlavorDatasource::getValidIndexes(); + if (isset($processor_config['output_destination']['searchapi']) && $processor_config['output_destination']['searchapi'] == 'searchapi') { + $tobeindexed = TRUE; + } - $keyvalue_collection = 'Strawberryfield_flavor_datasource_temp'; - $key = $keyvalue_collection . ':' . $file->uuid().':'.$data->plugin_config_entity_id; + // Only applies to those that will be indexed + if ($tobeindexed) { + try { + // Get which indexes have our StrawberryfieldFlavorDatasource enabled! 
+ $indexes = StrawberryfieldFlavorDatasource::getValidIndexes(); + $keyvalue_collection = 'Strawberryfield_flavor_datasource_temp'; + // This can repeat/overwrite, lacks the sequence ID. + $key = $keyvalue_collection . ':' . $file->uuid() . ':' . $data->plugin_config_entity_id; - //We only deal with NODES. - $entity = $this->entityTypeManager->getStorage('node') - ->load($data->nid); + //We only deal with NODES. + $entity = $this->entityTypeManager->getStorage('node') + ->load($data->nid); - if(!$entity) { - return; - } - $item_ids = []; - $inindex = 1; - if (is_a($entity, TranslatableInterface::class)) { - $translations = $entity->getTranslationLanguages(); - foreach ($translations as $translation_id => $translation) { - //@TODO here, the number 1 needs to come from the sequence. - $item_id = $entity->id() . ':'.'1' .':'.$translation_id.':'.$file->uuid().':'.$data->plugin_config_entity_id; - // a single 0 as return will force us to reindex. - $inindex = $inindex * $this->flavorInSolrIndex($item_id, $data->metadata['checksum'], $indexes); - $item_ids[] = $item_id; + if (!$entity) { + return; } - } - // Check if we already have this entry in Solr - if ($inindex !== 0) { - error_log('Already in search index, skipping'); - } - // Skip file if element is found in key_value collection. - $processed_data = $this->keyValue->get($keyvalue_collection)->get($key); - //@TODO allow a force in case of corrupted key value? Partial output - // Extragenous weird data? - if ($inindex === 0 || empty($processed_data) || - $data->force == TRUE || - (!isset($processed_data->checksum) || - empty($processed_data->checksum) || - $processed_data->checksum != $data->metadata['checksum'])) { - // Extract file and save it in key_value collection. 
- $io = new \stdClass(); - $input = new \stdClass(); - $input->filepath = $filelocation; - $input->page_number = 1; - // The Node UUID - $input->nuuid = $data->nuuid; - // All the rest of the associated Metadata in an as:structure - $input->metadata = $data->metadata; - $io->input = $input; - $io->output = NULL; - //@TODO implement the TEST and BENCHMARK logic here - // RUN should return exit codes so we can know if something failed - // And totally discard indexing. - $extracted_data = $processor_instance->run($io, StrawberryRunnersPostProcessorPluginInterface::PROCESS); - error_log ('processing just run'); - error_log('writing to keyvalue'); - error_log($key); - $toindex = new \stdClass(); - $toindex->fulltext = $io->output; - $toindex->checksum = $data->metadata['checksum']; - - $this->keyValue->get($keyvalue_collection)->set($key, $toindex); - error_log(var_export($item_ids,true)); - $datasource_id = 'strawberryfield_flavor_datasource'; - foreach ($indexes as $index) { - $index->trackItemsInserted($datasource_id, $item_ids); + $item_ids = []; + $inindex = 1; + $input_property = $processor_instance->getPluginDefinition()['input_property']; + $input_argument = $processor_instance->getPluginDefinition()['input_argument']; + + // @TODO If argument is not here, do we return?? + $data->{$input_argument} = isset($data->{$input_argument}) ? $data->{$input_argument} : 1; + + if (is_a($entity, TranslatableInterface::class)) { + $translations = $entity->getTranslationLanguages(); + foreach ($translations as $translation_id => $translation) { + //@TODO here, the number 1 needs to come from the sequence. + $item_id = $entity->id() . ':' . $data->{$input_argument} . ':' . $translation_id . ':' . $file->uuid() . ':' . $data->plugin_config_entity_id; + // a single 0 as return will force us to reindex. 
+ $inindex = $inindex * $this->flavorInSolrIndex($item_id, $data->metadata['checksum'], $indexes); + $item_ids[] = $item_id; + } } - } - } - catch (\Exception $exception) { - $message_params = [ - '@file_id' => $data->fid, - '@entity_id' => $data->nid, - '@message' => $exception->getMessage(), - ]; - if (!isset($data->extract_attempts)) { - $data->extract_attempts = 0; - $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing failed with message: @message File id @file_id at Node @entity_id.', $message_params); - } - if ($data->extract_attempts < 3) { - $data->extract_attempts++; - \Drupal::queue('strawberryrunners_process_index')->createItem($data); - } - else { + + // Check if we already have this entry in Solr + if ($inindex !== 0) { + error_log('Already in search index, skipping'); + } + // Skip file if element is found in key_value collection. + $processed_data = $this->keyValue->get($keyvalue_collection)->get($key); + //@TODO allow a force in case of corrupted key value? Partial output + // Extragenous weird data? + if ($tobeindexed && ($inindex === 0 || empty($processed_data) || + $data->force == TRUE || + (!isset($processed_data->checksum) || + empty($processed_data->checksum) || + $processed_data->checksum != $data->metadata['checksum']))) { + // Extract file and save it in key_value collection. + $io = $this->invokeProcessor($processor_instance, $data); + error_log('processing just run'); + error_log('writing to keyvalue'); + error_log($key); + // Check if $io->output exists? 
+ $toindex = new \stdClass(); + $toindex->fulltext = $io->output->searchapi; + $toindex->checksum = $data->metadata['checksum']; + + $this->keyValue->get($keyvalue_collection)->set($key, $toindex); + error_log(var_export($item_ids, TRUE)); + $datasource_id = 'strawberryfield_flavor_datasource'; + foreach ($indexes as $index) { + $index->trackItemsInserted($datasource_id, $item_ids); + } + } + } catch (\Exception $exception) { $message_params = [ '@file_id' => $data->fid, '@entity_id' => $data->nid, + '@message' => $exception->getMessage(), ]; - $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing failed after 3 attempts File Id @file_id at Node @entity_id.', $message_params); + if (!isset($data->extract_attempts)) { + $data->extract_attempts = 0; + $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing failed with message: @message File id @file_id at Node @entity_id.', $message_params); + } + if ($data->extract_attempts < 3) { + $data->extract_attempts++; + \Drupal::queue('strawberryrunners_process_index')->createItem($data); + } + else { + $message_params = [ + '@file_id' => $data->fid, + '@entity_id' => $data->nid, + ]; + $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing failed after 3 attempts File Id @file_id at Node @entity_id.', $message_params); + } + } + } + else { + // This will not + $io = $this->invokeProcessor($processor_instance, $data); + error_log('we do not need to index this'); + error_log(var_export($io, true)); + error_log('we do not need to index this'); + } + + if (isset($io->output->plugin) && !empty($io->output->plugin)) { + error_log('Time to check on children'); + error_log($data->plugin_config_entity_id); + $childprocessors = $this->getChildProcessorIds($data->plugin_config_entity_id); + error_log(print_r($childprocessors,true)); + foreach($childprocessors as $plugin_info) { + $childdata = clone $data; // So we do not touch original data + /* @var $strawberry_runners_postprocessor_config 
\Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity */ + $postprocessor_config_entity = $plugin_info['config_entity']; + $postprocessor_plugin_definition = $plugin_info['plugin_definition']; + $input_property = $plugin_info['plugin_definition']['input_property']; + $input_argument = $plugin_info['plugin_definition']['input_argument']; + //@TODO check if this are here and not null! + // $io->ouput will contain whatever the output is + // We will check if the child processor + // contains a property contained in $output + // If so we check if there is a single value or multiple ones + // For each we enqueue a child using that property in its data + + // Possible input properties: + // - Can come from the original Data (most likely) + // - May be overriden by the $io->output, e.g when a processor generates a file that is not part of any node + $input_property_value = isset($io->output->plugin) && isset($io->output->plugin[$input_property]) ? $io->output->plugin[$input_property] : $data->{$input_property}; + // Warning Diego. This may lead to a null + $childdata->{$input_property} = $input_property_value; + $childdata->plugin_config_entity_id = $postprocessor_config_entity->id(); + $input_argument_value = isset($io->output->plugin) && isset($io->output->plugin[$input_argument]) ? $io->output->plugin[$input_argument] : $data->{$input_argument}; + error_log(print_r($input_argument_value,true)); + if (is_array($input_argument_value)) { + foreach ($input_argument_value as $value) { + // Here is the catch. 
+ // Output properties may be many + // Input Properties matching always need to be one + if (!is_array($value)) { + $childdata->{$input_argument} = $value; + error_log("should add to queue {$childdata->plugin_config_entity_id}"); + error_log(var_export($childdata,true)); + \Drupal::queue('strawberryrunners_process_index') + ->createItem($childdata); + } + } + } } } + } /** @@ -313,7 +403,6 @@ private function ensureFileAvailability(FileInterface $file) { ); } - if (!$templocation) { $this->loggerFactory->get('strawberry_runners')->warning( 'Could not adquire a local accessible location for text extraction for file with URL @fileurl', @@ -348,6 +437,39 @@ public function getRealpath($uri) { } } + /** + * This method actually invokes the processor. + * + * @param StrawberryRunnersPostProcessorPluginInterface $processor_instance + * @param \stdClass $data + * + * @return \stdClass + */ + private function invokeProcessor(StrawberryRunnersPostProcessorPluginInterface $processor_instance, \stdClass $data): \stdClass { + + $input_property = $processor_instance->getPluginDefinition()['input_property']; + $input_argument = $processor_instance->getPluginDefinition()['input_argument']; + + $io = new \stdClass(); + $input = new \stdClass(); + + // @NOTE: this is the only place where we just pass filelocation fixed instead of the + // actual property named $input_property. Which may be weird? + $input->{$input_property} = $data->filelocation; + $input->{$input_argument} = isset($data->{$input_argument}) ? $data->{$input_argument} : 1; + // The Node UUID + $input->nuuid = $data->nuuid; + // All the rest of the associated Metadata in an as:structure + $input->metadata = $data->metadata; + $io->input = $input; + $io->output = NULL; + //@TODO implement the TEST and BENCHMARK logic here + // RUN should return exit codes so we can know if something failed + // And totally discard indexing. 
+ $extracted_data = $processor_instance->run($io, StrawberryRunnersPostProcessorPluginInterface::PROCESS); + return $io; + } + /** * Checks Search API indexes for an Document ID and Checksum Match * @@ -425,5 +547,4 @@ public function flavorInSolrIndex(string $key, string $checksum, array $indexes) // Or maybe we can use D8/D9 Status mechanic to let the user know this module // needs it in the data flavor. } - } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php new file mode 100644 index 0000000..d035628 --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php @@ -0,0 +1,162 @@ + 'asstructure', + 'mime_type' => ['application/pdf'], + 'output_type' => 'json', + 'output_destination' => 'plugin', + ] + parent::defaultConfiguration(); + } + + + public function calculateDependencies() { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + // @TODO: Implement calculateDependencies() method. + } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + + $element['source_type'] = [ + '#type' => 'hidden', + '#title' => $this->t('The type of source data this processor works on'), + '#default_value' => $this->getConfiguration()['source_type'], + ]; + + $element['ado_type'] = [ + '#type' => 'textfield', + '#title' => $this->t('ADO type(s) to limit this processor to.'), + '#default_value' => $this->getConfiguration()['ado_type'], + '#description' => $this->t('A single ADO type or a coma delimited list of ado types that qualify to be Processed. 
Leave empty to apply to all ADOs.'), + ]; + + $element['jsonkey'] = [ + '#type' => 'checkboxes', + '#title' => $this->t('The JSON key that contains the desired source files.'), + '#options' => [ + 'as:image' => 'as:image', + 'as:document' => 'as:document', + 'as:audio' => 'as:audio', + 'as:video' => 'as:video', + 'as:text' => 'as:text', + 'as:application' => 'as:application', + ], + '#default_value' => (!empty($this->getConfiguration()['jsonkey']) && is_array($this->getConfiguration()['jsonkey'])) ? $this->getConfiguration()['jsonkey'] : [], + '#required' => TRUE, + ]; + + $element['mime_type'] = [ + '#type' => 'textfield', + '#title' => $this->t('Mimetypes(s) to limit this Processor to.'), + '#default_value' => $this->getConfiguration()['mime_type'], + '#description' => $this->t('A single Mimetype type or a coma separed list of mimetypes that qualify to be Processed. Leave empty to apply any file'), + ]; + + $element['timeout'] = [ + '#type' => 'number', + '#title' => $this->t('Timeout in seconds for this process.'), + '#default_value' => $this->getConfiguration()['timeout'], + '#description' => $this->t('If the process runs out of time it can still be processed again.'), + '#size' => 2, + '#maxlength' => 2, + '#min' => 1, + ]; + $element['weight'] = [ + '#type' => 'number', + '#title' => $this->t('Order or execution in the global chain.'), + '#default_value' => $this->getConfiguration()['weight'], + ]; + + return $element; + } + + + public function onDependencyRemoval(array $dependencies) { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + return parent::onDependencyRemoval( + $dependencies + ); // TODO: Change the autogenerated stub + } + + /** + * Executes the logic of this plugin given a file path and a context. 
+ * + * @param \stdClass $io + * $io->input needs to contain + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments + * $io->output will contain the result of the processor + * @param string $context + */ + public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPluginInterface::PROCESS) { + // Specific input key as defined in the annotation + // In this case it will contain an absolute Path to a File. + // Needed since this executes locally on the server via SHELL. + + $input_property = $this->pluginDefinition['input_property']; + $file_uuid = isset($io->input->metadata['dr:uuid']) ? $io->input->metadata['dr:uuid'] : NULL; + $node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL; + $config = $this->getConfiguration(); + error_log('Get File Sequence'); + $page_number = []; + if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { + // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} + $io->output = $io->input; + // Now check if there is an "flv:identify" and iterate over each one. + if (isset($io->input->metadata['flv:identify']) && count($io->input->metadata['flv:identify']) > 0) { + foreach ($io->input->metadata['flv:identify'] as $key => $sequence) { + $page_number[] = $key; + } + } + // At least give it one page. (Should we?) 
+ if (empty($page_number)) { + $page_numbers[] = 1; + } + + $output = new \stdClass(); + $output->plugin = ['page_number' => $page_number]; + $io->output = $output; + } + else { + \throwException(new \InvalidArgumentException); + } + } +} + + + + diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php index 1c80798..10628ca 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -147,9 +147,8 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '#type' => 'checkboxes', '#title' => $this->t("Where and how the output will be used."), '#options' => [ - 'subkey' => 'In the same Source Metadata, as a child structure of each Processed file', - 'ownkey' => 'In the same Source Metadata but inside its own, top level, "as:flavour" subkey based on the given machine name of the current plugin', 'plugin' => 'As Input for another processor Plugin', + 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)' ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? $this->getConfiguration()['output_destination'] : [], '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), @@ -219,14 +218,18 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug // Support UTF-8 commands. 
// @see http://www.php.net/manual/en/function.shell-exec.php#85095 shell_exec("LANG=en_US.utf-8"); - $output = $this->proc_execute($execstring, $timeout); - if (is_null($output)) { + $proc_output = $this->proc_execute($execstring, $timeout); + if (is_null($proc_output)) { throw new \Exception("Could not execute {$execstring} or timed out"); } - $miniocr = $this->hOCRtoMiniOCR($output, $page_number); + $miniocr = $this->hOCRtoMiniOCR($proc_output, $page_number); error_log($miniocr); - $io->output = $miniocr; + $output = new \stdClass(); + $output->searchapi = $miniocr; + $output->plugin = $miniocr; + $io->output = $output; + } } else { diff --git a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php index 266199e..f3c8e5f 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php @@ -130,6 +130,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { 'subkey' => 'In the same Source Metadata, as a child structure of each Processed file', 'ownkey' => 'In the same Source Metadata but inside its own, top level, "as:flavour" subkey based on the given machine name of the current plugin', 'plugin' => 'As Input for another processor Plugin', + 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)' ], '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination']))? $this->getConfiguration()['output_destination']: [], '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), @@ -196,12 +197,15 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug // Support UTF-8 commands. 
// @see http://www.php.net/manual/en/function.shell-exec.php#85095 shell_exec("LANG=en_US.utf-8"); - //$output = shell_exec($execstring); - $output = $this->proc_execute($execstring, $timeout); - if (is_null($output)) { + $proc_output = $this->proc_execute($execstring, $timeout); + if (is_null($proc_output)) { throw new \Exception("Could not execute {$execstring} or timed out"); } - $io->output = $output; + $output = new \stdClass(); + $output->searchapi = $proc_output; + $output->plugin = $proc_output; + $io->output = $output; + } } else { \throwException(new \InvalidArgumentException); From 5c0d6889cc09df02634ec429a68fdab8788e196b Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 1 Dec 2020 15:36:40 -0500 Subject: [PATCH 15/24] Fixed generic key store key, now all pages are actually different @giancarlobi thanks for finding the BUG! Now it's fixed. I updated SBF ISSUE-112 too to match this. Tested quickly and all pages are different. Running the same again now also works correctly, no reindex. Thanks!! --- .../IndexPostProcessorQueueWorker.php | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php index 0b9a6e0..1d9cb23 100644 --- a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php @@ -228,10 +228,7 @@ public function processItem($data) { try { // Get which indexes have our StrawberryfieldFlavorDatasource enabled! $indexes = StrawberryfieldFlavorDatasource::getValidIndexes(); - $keyvalue_collection = 'Strawberryfield_flavor_datasource_temp'; - // This can repeat/overwrite, lacks the sequence ID. - $key = $keyvalue_collection . ':' . $file->uuid() . ':' . $data->plugin_config_entity_id; //We only deal with NODES.
$entity = $this->entityTypeManager->getStorage('node') @@ -264,29 +261,41 @@ public function processItem($data) { if ($inindex !== 0) { error_log('Already in search index, skipping'); } - // Skip file if element is found in key_value collection. - $processed_data = $this->keyValue->get($keyvalue_collection)->get($key); + $inkeystore = TRUE; + // Skip file if element for every language is found in key_value collection. + foreach($item_ids as $item_id) { + $processed_data = $this->keyValue->get($keyvalue_collection) + ->get($item_id); + if (empty($processed_data) || !isset($processed_data->checksum) || + empty($processed_data->checksum) || + $processed_data->checksum != $data->metadata['checksum']) { + $inkeystore = $inkeystore && FALSE; + } + } //@TODO allow a force in case of corrupted key value? Partial output // Extragenous weird data? - if ($tobeindexed && ($inindex === 0 || empty($processed_data) || - $data->force == TRUE || - (!isset($processed_data->checksum) || - empty($processed_data->checksum) || - $processed_data->checksum != $data->metadata['checksum']))) { + if ($tobeindexed && ($inindex === 0 || $inkeystore === FALSE) || + $data->force == TRUE) { // Extract file and save it in key_value collection. $io = $this->invokeProcessor($processor_instance, $data); - error_log('processing just run'); - error_log('writing to keyvalue'); - error_log($key); + // Check if $io->output exists? $toindex = new \stdClass(); $toindex->fulltext = $io->output->searchapi; $toindex->checksum = $data->metadata['checksum']; - $this->keyValue->get($keyvalue_collection)->set($key, $toindex); - error_log(var_export($item_ids, TRUE)); $datasource_id = 'strawberryfield_flavor_datasource'; foreach ($indexes as $index) { + // For each language we do this + // Eventually we will want to have different outputs per language? + // But maybe not for HOCR. since the doc will be the same. 
+ foreach($item_ids as $item_id) { + error_log('processing just run'); + error_log('writing to keyvalue'); + error_log($item_id); + $this->keyValue->get($keyvalue_collection) + ->set($item_id, $toindex); + } $index->trackItemsInserted($datasource_id, $item_ids); } } From f6247d7f9ac499703a435ebb53f747dbc6a1a742 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 4 Dec 2020 01:04:37 -0500 Subject: [PATCH 16/24] Basically a LOT: For now 2 events subs, share the same code so i may want to... ... make this a service with a method i can invoke and share. Simpler. --- ...ersEventInsertPostProcessingSubscriber.php | 331 ++++++++++++++++++ ...nnersEventSavePostProcessingSubscriber.php | 78 +---- strawberry_runners.services.yml | 10 +- 3 files changed, 359 insertions(+), 60 deletions(-) create mode 100644 src/EventSubscriber/StrawberryRunnersEventInsertPostProcessingSubscriber.php diff --git a/src/EventSubscriber/StrawberryRunnersEventInsertPostProcessingSubscriber.php b/src/EventSubscriber/StrawberryRunnersEventInsertPostProcessingSubscriber.php new file mode 100644 index 0000000..b72a173 --- /dev/null +++ b/src/EventSubscriber/StrawberryRunnersEventInsertPostProcessingSubscriber.php @@ -0,0 +1,331 @@ +stringTranslation = $string_translation; + $this->messenger = $messenger; + $this->loggerFactory = $logger_factory; + $this->configFactory = $config_factory; + $this->streamWrapperManager = $stream_wrapper_manager; + $this->fileSystem = $file_system; + $this->entityTypeManager = $entity_type_manager; + $this->strawberryRunnerProcessorPluginManager = $strawberry_runner_processor_plugin_manager; + $this->account = $account; + } + + /** + * Method called when Event occurs. 
+ * + * @param \Drupal\strawberryfield\Event\StrawberryfieldCrudEvent $event + * + * @throws \Drupal\Component\Plugin\Exception\InvalidPluginDefinitionException + * @throws \Drupal\Component\Plugin\Exception\PluginException + * @throws \Drupal\Component\Plugin\Exception\PluginNotFoundException + */ + public function onEntityInsert(StrawberryfieldCrudEvent $event) { + + /* @var $plugin_config_entities \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity[] */ + $plugin_config_entities = $this->entityTypeManager->getListBuilder('strawberry_runners_postprocessor')->load(); + $active_plugins = []; + foreach($plugin_config_entities as $plugin_config_entity) { + // Only get first level (no Parents) and Active ones. + if ($plugin_config_entity->isActive() && $plugin_config_entity->getParent() == '') { + $entity_id = $plugin_config_entity->id(); + $configuration_options = $plugin_config_entity->getPluginconfig(); + $configuration_options['configEntity'] = $entity_id; + /* @var \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface $plugin_instance */ + $plugin_instance = $this->strawberryRunnerProcessorPluginManager->createInstance( + $plugin_config_entity->getPluginid(), + $configuration_options + ); + $plugin_definition = $plugin_instance->getPluginDefinition(); + // We don't use the key here to preserve the original weight given order + // Classify by input type + + $active_plugins[$plugin_definition['input_type']][$entity_id] = $plugin_instance->getConfiguration(); + } + } + + // We will fetch all files and then see if each file can be processed by one + // or more plugin. + // Slower option would be to traverse every file per processor. 
+ + $entity = $event->getEntity(); + $sbf_fields = $event->getFields(); + + // First pass: for files, all the as:structures we want for, keyed by content type + /* check your config + "source_type" => "asstructure" + "ado_type" => "Document" + "jsonkey" => array:6 [â–¼ + "as:document" => "as:document" + "as:image" => 0 + "as:audio" => 0 + "as:video" => 0 + "as:text" => 0 + "as:application" => 0 + ] + "mime_type" => "application/pdf" + "path" => "/usr/bin/pdftotext" + "arguments" => "%file" + "output_type" => "json" + "output_destination" => array:3 [â–¼ + "plugin" => "plugin" + "subkey" => 0 + "ownkey" => 0 + ] + "timeout" => "10" + "weight" => "0" + "configEntity" => "test" + ]*/ + + if (isset($active_plugins['entity:file'])) { + foreach($active_plugins['entity:file'] as $activePluginId => $config) { + if ($config['source_type'] == 'asstructure') { + $askeys = array_filter($config['jsonkey']); + foreach($askeys as $key => $value) { + $askeymap[$key][$activePluginId] = $config; + } + } + } + } + + foreach ($sbf_fields as $field_name) { + /* @var $field \Drupal\Core\Field\FieldItemInterface */ + $field = $entity->get($field_name); + if (!$field->isEmpty()) { + $entity = $field->getEntity(); + $entity_type_id = $entity->getEntityTypeId(); + /** @var $field \Drupal\Core\Field\FieldItemList */ + foreach ($field->getIterator() as $delta => $itemfield) { + // Note: we are not touching the metadata here. 
+ /** @var $itemfield \Drupal\strawberryfield\Plugin\Field\FieldType\StrawberryFieldItem */ + $flatvalues = (array) $itemfield->provideFlatten(); + // Run first on entity:files + $sbf_type = []; + if (isset($flatvalues['type'])) { + $sbf_type = (array) $flatvalues['type']; + } + foreach ($askeymap as $jsonkey => $activePlugins) { + if (isset($flatvalues[$jsonkey])) { + foreach ($flatvalues[$jsonkey] as $uniqueid => $asstructure) { + if (isset($asstructure['dr:fid']) && is_numeric($asstructure['dr:fid'])) { + foreach ($activePlugins as $activePluginId => $config) { + // Never ever run a processor over its own creation + if ($asstructure["dr:for"] == 'flv:'.$activePluginId) { + error_log('skipping '. $asstructure['dr:fid']); + continue; + } + + $valid_mimes = []; + //@TODO also split $config['ado_type'] so we can check + $valid_ado_type = []; + $valid_ado_type = explode(',', $config['ado_type']); + if (empty($config['ado_type']) || count(array_intersect($valid_ado_type, $sbf_type)) > 0) { + $valid_mimes = explode(',', $config['mime_type']); + if ( + (!isset($asstructure['flv:' . $activePluginId]) || empty($asstructure['flv:' . $activePluginId])) && + ( + empty($valid_mimes) || + (isset($asstructure["dr:mimetype"]) && in_array($asstructure["dr:mimetype"], $valid_mimes)) + ) + ) { + $data = new \stdClass(); + $data->fid = $asstructure['dr:fid']; + $data->nid = $entity->id(); + $data->asstructure_uniqueid = $uniqueid; + $data->asstructure_key = $jsonkey; + $data->nuuid = $entity->uuid(); + $data->field_name = $field_name; + $data->field_delta = $delta; + + // We are passing also the full file metadata. + // This gives us an advantage so we can reuse + // Sequence IDs, PDF pages, etc and act on them + // @TODO. We may want to have also Kill switches in the + // main metadata to act on this + // E.g flv:processor[$activePluginId] = FALSE? + // Also. Do we want to act on metadata and mark + // Files as already send for processing by a certain + // $activePluginId? 
That would allow us to skip reprocessing + // Easier? + $data->metadata = $asstructure; + + // @TODO how to force? + // Can be a state key, valuekey, or a JSON passed property. + // Issue with JSON passed property is that we can no longer + // Here modify it (Entity is saved) + // So we should really better have a non Metadata method for this + // Or/ we can have a preSave Subscriber that reads the prop, + // sets the state and then removes if before saving + + $data->force = FALSE; + $data->plugin_config_entity_id = $activePluginId; + // See https://github.com/esmero/strawberry_runners/issues/10 + // Since the destination Queue can be a modal thing + // And really what defines is the type of worker we want + // But all at the end will eventually feed the ::run() method + // We want to make this a full blown service. + \Drupal::queue('strawberryrunners_process_index') + ->createItem($data); + } + } + } + } + } + } + } + } + } + } + $current_class = get_called_class(); + $event->setProcessedBy($current_class, TRUE); + if ($this->account->hasPermission('display strawberry messages')) { + $this->messenger->addStatus(t('Post processor was invoked')); + } + + } + + /** + * Make sure no HTML or Javascript will be passed around. + * + * @param string $string + * A value returned by a processor + * + * @return string + * The value sanitized. + */ + private function sanitizeValue($string) { + if (!Unicode::validateUtf8($string)) { + $string = Html::escape(utf8_encode($string)); + } + return $string; + } +} diff --git a/src/EventSubscriber/StrawberryRunnersEventSavePostProcessingSubscriber.php b/src/EventSubscriber/StrawberryRunnersEventSavePostProcessingSubscriber.php index baa2962..a517eb1 100644 --- a/src/EventSubscriber/StrawberryRunnersEventSavePostProcessingSubscriber.php +++ b/src/EventSubscriber/StrawberryRunnersEventSavePostProcessingSubscriber.php @@ -177,11 +177,9 @@ public function onEntitySave(StrawberryfieldCrudEvent $event) { // or more plugin. 
// Slower option would be to traverse every file per processor. - $entity = $event->getEntity(); $sbf_fields = $event->getFields(); - // First pass: for files, all the as:structures we want for, keyed by content type /* check your config "source_type" => "asstructure" @@ -239,19 +237,35 @@ public function onEntitySave(StrawberryfieldCrudEvent $event) { if (isset($flatvalues[$jsonkey])) { foreach ($flatvalues[$jsonkey] as $uniqueid => $asstructure) { if (isset($asstructure['dr:fid']) && is_numeric($asstructure['dr:fid'])) { + foreach ($activePlugins as $activePluginId => $config) { + // Never ever run a processor over its own creation + if ($asstructure["dr:for"] == 'flv:'.$activePluginId) { + error_log('skipping '. $asstructure['dr:fid']); + continue; + } - foreach($activePlugins as $activePluginId => $config) { $valid_mimes = []; //@TODO also split $config['ado_type'] so we can check $valid_ado_type = []; $valid_ado_type = explode(',', $config['ado_type']); - if (empty($config['ado_type']) || count(array_intersect($valid_ado_type , $sbf_type)) > 0) { + if (empty($config['ado_type']) || count(array_intersect($valid_ado_type, $sbf_type)) > 0) { $valid_mimes = explode(',', $config['mime_type']); - if (empty($valid_mimes) || (isset($asstructure["dr:mimetype"]) && in_array($asstructure["dr:mimetype"], $valid_mimes))) { + if ( + (!isset($asstructure['flv:' . $activePluginId]) || empty($asstructure['flv:' . $activePluginId])) && + ( + empty($valid_mimes) || + (isset($asstructure["dr:mimetype"]) && in_array($asstructure["dr:mimetype"], $valid_mimes)) + ) + ) { $data = new \stdClass(); $data->fid = $asstructure['dr:fid']; $data->nid = $entity->id(); + $data->asstructure_uniqueid = $uniqueid; + $data->asstructure_key = $jsonkey; $data->nuuid = $entity->uuid(); + $data->field_name = $field_name; + $data->field_delta = $delta; + // We are passing also the full file metadata. 
// This gives us an advantage so we can reuse // Sequence IDs, PDF pages, etc and act on them @@ -299,7 +313,6 @@ public function onEntitySave(StrawberryfieldCrudEvent $event) { } - /** * Make sure no HTML or Javascript will be passed around. * @@ -315,57 +328,4 @@ private function sanitizeValue($string) { } return $string; } - - /** - * Adds File usage to DB for temp files used by SB Runners. - * - * This differs from how we count managed files in other places like SBF. - * Every Post Processor that needs the file will add a count - * Once done, will remove one. File will become unused when everyone releases it. - * - * - * @param \Drupal\file\FileInterface $file - * @param int $nodeid - */ - protected function add_file_usage(FileInterface $file, int $nodeid, string $entity_type_id = 'node') { - if (!$file || !$this->moduleHandler->moduleExists('file')) { - return; - } - /** @var \Drupal\file\FileUsage\FileUsageInterface $file_usage */ - - if ($file) { - $this->fileUsage->add($file, 'strawberry_runners', $entity_type_id, $nodeid); - } - } - - /** - * Deletes File usage from DB for temp files used by SB Runners. - * - * @param \Drupal\file\FileInterface $file - * @param int $nodeid - * @param int $count - * If count is 0 it will remove all references. 
- */ - protected function remove_file_usage( - FileInterface $file, - int $nodeid, - string $entity_type_id = 'node', - $count = 1 - ) { - if (!$file || !$this->moduleHandler->moduleExists('file')) { - return; - } - /** @var \Drupal\file\FileUsage\FileUsageInterface $file_usage */ - - if ($file) { - $this->fileUsage->delete( - $file, - 'strawberry_runners', - $entity_type_id, - $nodeid, - $count - ); - } - } - } diff --git a/strawberry_runners.services.yml b/strawberry_runners.services.yml index 7a7b337..1ff4303 100644 --- a/strawberry_runners.services.yml +++ b/strawberry_runners.services.yml @@ -2,12 +2,20 @@ services: strawberry_runner.processor_manager: class: Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginManager parent: default_plugin_manager - strawberry_runner.postprocessing_subscriber: + + strawberry_runner.postprocessing_save_subscriber: class: Drupal\strawberry_runners\EventSubscriber\StrawberryRunnersEventSavePostProcessingSubscriber tags: - {name: event_subscriber} arguments: ['@string_translation', '@messenger', '@logger.factory', '@config.factory', '@stream_wrapper_manager', '@file_system', '@entity_type.manager', '@strawberry_runner.processor_manager', '@current_user'] + strawberry_runner.postprocessing_insert_subscriber: + class: Drupal\strawberry_runners\EventSubscriber\StrawberryRunnersEventInsertPostProcessingSubscriber + tags: + - {name: event_subscriber} + arguments: ['@string_translation', '@messenger', '@logger.factory', '@config.factory', '@stream_wrapper_manager', '@file_system', '@entity_type.manager', '@strawberry_runner.processor_manager', '@current_user'] + + strawberry_runner.loop: class: Drupal\strawberry_runners\StrawberryRunnersLoopService arguments: ['@file_system', '@entity_type.manager', '@config.factory', '@module_handler', '@lock', '@queue', '@state', '@account_switcher', '@logger.factory', '@plugin.manager.queue_worker', '@datetime.time'] From 47aa34960e5be101109a6eaa2ffd21b42764d624 Mon Sep 17 00:00:00 
2001 From: Diego Pino Navarro Date: Fri, 4 Dec 2020 01:06:09 -0500 Subject: [PATCH 17/24] Created an abstract class and the queue worker. Simpler Added file update capabilities. Processor generates a File entity, the worker adds it to the Node. @giancarlobi need to document and explain how we avoid double processing here. We pass more data around, but it saves us from overprocessing so that is good? --- .../AbstractPostProcessorQueueWorker.php | 631 ++++++++++++++++++ .../IndexPostProcessorQueueWorker.php | 523 +-------------- 2 files changed, 632 insertions(+), 522 deletions(-) create mode 100644 src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php new file mode 100644 index 0000000..20b8513 --- /dev/null +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -0,0 +1,631 @@ +entityTypeManager = $entity_type_manager; + $this->strawberryRunnerProcessorPluginManager = $strawberry_runner_processor_plugin_manager; + $this->fileSystem = $file_system; + $this->streamWrapperManager = $stream_wrapper_manager; + $this->keyValue = $key_value; + $this->logger = $logger; + $this->parseModeManager = $parse_mode_manager; + } + + /** + * Implementation of the container interface to allow dependency injection. + * + * @param \Symfony\Component\DependencyInjection\ContainerInterface $container + * @param array $configuration + * @param string $plugin_id + * @param mixed $plugin_definition + * + * @return static + */ + public static function create(ContainerInterface $container, array $configuration, $plugin_id, $plugin_definition) { + return new static( + empty($configuration) ?
[] : $configuration, + $plugin_id, + $plugin_definition, + $container->get('entity_type.manager'), + $container->get('strawberry_runner.processor_manager'), + $container->get('file_system'), + $container->get('stream_wrapper_manager'), + $container->get('keyvalue'), + $container->get('logger.channel.strawberry_runners'), + $container->get('plugin.manager.search_api.parse_mode') + ); + } + + /** + * Get the extractor plugin. + * + * @return object + * The plugin. + * + * @throws \Drupal\Component\Plugin\Exception\PluginException + */ + protected function getProcessorPlugin($plugin_config_entity_id) { + // Get extractor configuration. + /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntityInterface */ + $plugin_config_entity = $this->entityTypeManager->getStorage( + 'strawberry_runners_postprocessor' + )->load($plugin_config_entity_id); + + if ($plugin_config_entity->isActive()) { + $entity_id = $plugin_config_entity->id(); + $configuration_options = $plugin_config_entity->getPluginconfig(); + $configuration_options['configEntity'] = $entity_id; + /* @var \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface $plugin_instance */ + $plugin_instance = $this->strawberryRunnerProcessorPluginManager->createInstance( + $plugin_config_entity->getPluginid(), + $configuration_options + ); + return $plugin_instance; + } + } + + + /** + * Gets all Children of the currently being processed Processor Plugin + * + * @param string $current_id + * + * @return array + */ + private function getChildProcessorIds(string $plugin_config_entity_id):array { + /* @var $plugin_config_entities \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity[] */ + $plugin_config_entities = $this->entityTypeManager->getListBuilder('strawberry_runners_postprocessor') + ->load(); + $active_plugins = []; + // This kids should be cached; + // We basically want here what type of processor this is and its input_argument and 
input_options + $plugin_definitions = $this->strawberryRunnerProcessorPluginManager->getDefinitions(); + + error_log('getting child processors'); + foreach ($plugin_config_entities as $plugin_config_entity) { + // Only get first level (no Parents) and Active ones. + if ($plugin_config_entity->isActive() && $plugin_config_entity->getParent() == $plugin_config_entity_id) { + $active_plugins[] = [ + 'config_entity' => $plugin_config_entity, + 'plugin_definition' => $plugin_definitions[$plugin_config_entity->getPluginid()] + ]; + } + } + return $active_plugins; + } + + /** + * {@inheritdoc} + */ + public function processItem($data) { + + $processor_instance = $this->getProcessorPlugin($data->plugin_config_entity_id); + + $processor_config = $processor_instance->getConfiguration(); + + if (!isset($data->fid) || $data->fid == NULL || !isset($data->nid) || $data->nid == NULL || !is_array($data->metadata)) { + return; + } + $file = $this->entityTypeManager->getStorage('file')->load($data->fid); + // 0 byte files have checksum, check what it is! + if ($file === NULL || !isset($data->metadata['checksum'])) { + error_log('Sorry the file does not exist or has no checksum yet. We really need the checksum'); + return; + } + + //We only deal with NODES. + $entity = $this->entityTypeManager->getStorage('node') + ->load($data->nid); + + if (!$entity) { + error_log('Sorry the Node ID passed to this processor does not exist.'); + } + + //@TODO should we wrap this around a try catch? 
+ $filelocation = $this->ensureFileAvailability($file); + + if ($filelocation === NULL) { + return; + } + // Means we could pass also a file directly anytime + $data->filelocation = $filelocation; + $tobeindexed = FALSE; + error_log(in_array('searchapi',$processor_config['output_destination'])); + // make all this options constants + if (array_key_exists('searchapi', $processor_config['output_destination']) && $processor_config['output_destination']['searchapi'] == 'searchapi') { + error_log("processor says this goes into Solr"); + $tobeindexed = TRUE; + } + + // Only applies to those that will be indexed + if ($tobeindexed) { + try { + // Get which indexes have our StrawberryfieldFlavorDatasource enabled! + $indexes = StrawberryfieldFlavorDatasource::getValidIndexes(); + $keyvalue_collection = 'Strawberryfield_flavor_datasource_temp'; + $item_ids = []; + $inindex = 1; + $input_property = $processor_instance->getPluginDefinition()['input_property']; + $input_argument = $processor_instance->getPluginDefinition()['input_argument']; + + // @TODO If argument is not here, do we return?? + $data->{$input_argument} = isset($data->{$input_argument}) ? $data->{$input_argument} : 1; + + if (is_a($entity, TranslatableInterface::class)) { + $translations = $entity->getTranslationLanguages(); + foreach ($translations as $translation_id => $translation) { + //@TODO here, the number 1 needs to come from the sequence. + $item_id = $entity->id() . ':' . $data->{$input_argument} . ':' . $translation_id . ':' . $file->uuid() . ':' . $data->plugin_config_entity_id; + // a single 0 as return will force us to reindex. + $inindex = $inindex * $this->flavorInSolrIndex($item_id, $data->metadata['checksum'], $indexes); + $item_ids[] = $item_id; + } + } + + // Check if we already have this entry in Solr + if ($inindex !== 0) { + error_log('Already in search index, skipping'); + } + $inkeystore = TRUE; + // Skip file if element for every language is found in key_value collection. 
+ foreach($item_ids as $item_id) { + $processed_data = $this->keyValue->get($keyvalue_collection) + ->get($item_id); + if (empty($processed_data) || !isset($processed_data->checksum) || + empty($processed_data->checksum) || + $processed_data->checksum != $data->metadata['checksum']) { + $inkeystore = $inkeystore && FALSE; + } + } + //@TODO allow a force in case of corrupted key value? Partial output + // Extragenoxus weird data? + if ($tobeindexed && ($inindex === 0 || $inkeystore === FALSE) || + $data->force == TRUE) { + // Extract file and save it in key_value collection. + $io = $this->invokeProcessor($processor_instance, $data); + + // Check if $io->output exists? + $toindex = new \stdClass(); + $toindex->fulltext = $io->output->searchapi; + $toindex->checksum = $data->metadata['checksum']; + + $datasource_id = 'strawberryfield_flavor_datasource'; + foreach ($indexes as $index) { + // For each language we do this + // Eventually we will want to have different outputs per language? + // But maybe not for HOCR. since the doc will be the same. 
+ foreach($item_ids as $item_id) { + error_log('processing just run'); + error_log('writing to keyvalue'); + error_log($item_id); + $this->keyValue->get($keyvalue_collection) + ->set($item_id, $toindex); + } + $index->trackItemsInserted($datasource_id, $item_ids); + } + } + } catch (\Exception $exception) { + $message_params = [ + '@file_id' => $data->fid, + '@entity_id' => $data->nid, + '@message' => $exception->getMessage(), + ]; + if (!isset($data->extract_attempts)) { + $data->extract_attempts = 0; + $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing failed with message: @message File id @file_id at Node @entity_id.', $message_params); + } + if ($data->extract_attempts < 3) { + $data->extract_attempts++; + \Drupal::queue('strawberryrunners_process_index')->createItem($data); + } + else { + $message_params = [ + '@file_id' => $data->fid, + '@entity_id' => $data->nid, + ]; + $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing failed after 3 attempts File Id @file_id at Node @entity_id.', $message_params); + } + } + } + else { + // This will not + $io = $this->invokeProcessor($processor_instance, $data); + error_log('we do not need to index this'); + error_log(var_export($io, true)); + error_log('we do not need to index this'); + } + // Means we got a file back from the processor + if (isset($io->output->file) && !empty($io->output->file)) { + $this->updateNode($entity, $data, $io); + error_log('we got a file'); + } + // Chains a new Processor into the QUEUE, if there are any children + if (isset($io->output->plugin) && !empty($io->output->plugin)) { + error_log('Time to check on children'); + error_log($data->plugin_config_entity_id); + $childprocessors = $this->getChildProcessorIds($data->plugin_config_entity_id); + error_log(print_r($childprocessors,true)); + foreach($childprocessors as $plugin_info) { + $childdata = clone $data; // So we do not touch original data + /* @var $strawberry_runners_postprocessor_config 
\Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity */ + $postprocessor_config_entity = $plugin_info['config_entity']; + $postprocessor_plugin_definition = $plugin_info['plugin_definition']; + $input_property = $plugin_info['plugin_definition']['input_property']; + $input_argument = $plugin_info['plugin_definition']['input_argument']; + //@TODO check if this are here and not null! + // $io->ouput will contain whatever the output is + // We will check if the child processor + // contains a property contained in $output + // If so we check if there is a single value or multiple ones + // For each we enqueue a child using that property in its data + + // Possible input properties: + // - Can come from the original Data (most likely) + // - May be overriden by the $io->output, e.g when a processor generates a file that is not part of any node + $input_property_value = isset($io->output->plugin) && isset($io->output->plugin[$input_property]) ? $io->output->plugin[$input_property] : $data->{$input_property}; + // Warning Diego. This may lead to a null + $childdata->{$input_property} = $input_property_value; + $childdata->plugin_config_entity_id = $postprocessor_config_entity->id(); + $input_argument_value = isset($io->output->plugin) && isset($io->output->plugin[$input_argument]) ? $io->output->plugin[$input_argument] : $data->{$input_argument}; + error_log(print_r($input_argument_value,true)); + if (is_array($input_argument_value)) { + foreach ($input_argument_value as $value) { + // Here is the catch. + // Output properties may be many + // Input Properties matching always need to be one + if (!is_array($value)) { + $childdata->{$input_argument} = $value; + error_log("should add to queue {$childdata->plugin_config_entity_id}"); + error_log(var_export($childdata,true)); + \Drupal::queue('strawberryrunners_process_index') + ->createItem($childdata); + } + } + } + } + } + + } + + /** + * Move file to local to if needed process. 
+ * + * @param \Drupal\file\FileInterface $file + * The File URI to look at. + * + * @return array + * Output of processing chain for a particular file. + */ + private function ensureFileAvailability(FileInterface $file) { + $uri = $file->getFileUri(); + // Local stream. + $cache_key = md5($uri); + // Check first if the file is already around in temp? + // @TODO can be sure its the same one? Ideas? + if (is_readable( + $this->fileSystem->realpath( + 'temporary://sbr_' . $cache_key . '_' . basename($uri) + ) + )) { + $templocation = $this->fileSystem->realpath( + 'temporary://sbr_' . $cache_key . '_' . basename($uri) + ); + } + else { + $templocation = $this->fileSystem->copy( + $uri, + 'temporary://sbr_' . $cache_key . '_' . basename($uri), + FileSystemInterface::EXISTS_REPLACE + ); + $templocation = $this->fileSystem->realpath( + $templocation + ); + } + + if (!$templocation) { + $this->loggerFactory->get('strawberry_runners')->warning( + 'Could not adquire a local accessible location for text extraction for file with URL @fileurl', + [ + '@fileurl' => $file->getFileUri(), + ] + ); + return FALSE; + } else { + return $templocation; + } + } + + /** + * Helper method to get the real path from an uri. + * + * @param string $uri + * The URI of the file, e.g. public://directory/file.jpg. + * + * @return mixed + * The real path to the file if it is a local file. An URL otherwise. + */ + public function getRealpath($uri) { + $wrapper = $this->streamWrapperManager->getViaUri($uri); + $scheme = $this->streamWrapperManager->getScheme($uri); + $local_wrappers = $this->streamWrapperManager->getWrappers(StreamWrapperInterface::LOCAL); + if (in_array($scheme, array_keys($local_wrappers))) { + return $wrapper->realpath(); + } + else { + return $wrapper->getExternalUrl(); + } + } + + /** + * This method actually invokes the processor. 
+ * + * @param StrawberryRunnersPostProcessorPluginInterface $processor_instance + * @param \stdClass $data + * + * @return \stdClass + */ + private function invokeProcessor(StrawberryRunnersPostProcessorPluginInterface $processor_instance, \stdClass $data): \stdClass { + + $input_property = $processor_instance->getPluginDefinition()['input_property']; + $input_argument = $processor_instance->getPluginDefinition()['input_argument']; + + $io = new \stdClass(); + $input = new \stdClass(); + + // @NOTE: this is the only place where we just pass filelocation fixed instead of the + // actual property named $input_property. Which may be weird? + $input->{$input_property} = $data->filelocation; + $input->{$input_argument} = isset($data->{$input_argument}) ? $data->{$input_argument} : 1; + // The Node UUID + $input->nuuid = $data->nuuid; + // All the rest of the associated Metadata in an as:structure + $input->metadata = $data->metadata; + $io->input = $input; + $io->output = NULL; + //@TODO implement the TEST and BENCHMARK logic here + // RUN should return exit codes so we can know if something failed + // And totally discard indexing. + $extracted_data = $processor_instance->run($io, StrawberryRunnersPostProcessorPluginInterface::PROCESS); + return $io; + } + + /** + * Checks Search API indexes for an Document ID and Checksum Match + * + * @param string $key + * @param string $checksum + * @param array $indexes + * + * @throws \Drupal\Component\Plugin\Exception\PluginException + * @throws \Drupal\search_api\SearchApiException + */ + public function flavorInSolrIndex(string $key, string $checksum, array $indexes): int { + /* @var \Drupal\search_api\IndexInterface[] $indexes */ + + $count = 0; + foreach ($indexes as $search_api_index) { + + // Create the query. 
+ $query = $search_api_index->query([ + 'limit' => 1, + 'offset' => 0, + ]); + + /*$query->setFulltextFields([ + 'title', + 'body', + 'filename', + 'saa_field_file_document', + 'saa_field_file_news', + 'saa_field_file_page' + ]);*/ + //$parse_mode = $this->parseModeManager->createInstance('direct'); + $parse_mode = $this->parseModeManager->createInstance('terms'); + $query->setParseMode($parse_mode); + // $parse_mode->setConjunction('OR'); + // $query->keys($search); + $query->sort('search_api_relevance', 'DESC'); + + $query->addCondition('search_api_id', 'strawberryfield_flavor_datasource/'.$key) + ->addCondition('search_api_datasource', 'strawberryfield_flavor_datasource') + ->addCondition('checksum', $checksum); + //$query = $query->addCondition('ss_checksum', $checksum); + // If we allow processing here Drupal adds Content Access Check + // That does not match our Data Source \Drupal\search_api\Plugin\search_api\processor\ContentAccess + // we get this filter (see 2nd) + /* + * array ( + 0 => 'ss_search_api_id:"strawberryfield_flavor_datasource/2006:1:en:3dccdb09-f79f-478e-81c5-0bb680c3984e:ocr"', + 1 => 'ss_search_api_datasource:"strawberryfield_flavor_datasource"', + 2 => '{!tag=content_access,content_access_enabled,content_access_grants}(ss_search_api_datasource:"entity:file" (+(bs_status:"true" bs_status_2:"true") +(sm_node_grants:"node_access_all:0" sm_node_grants:"node_access__all")))', + 3 => '+index_id:default_solr_index +hash:1evb7z', + 4 => 'ss_search_api_language:("en" "und" "zxx")', + ), + */ + // Another solution would be to make our conditions all together an OR + // But no post processing here is also good, faster and we just want + // to know if its there or not. 
+ $query->setProcessingLevel(QueryInterface::PROCESSING_NONE); + $results = $query->execute(); + + // $solr_response = $results->getExtraData('search_api_solr_response'); + // In case of more than one Index with the same Data Source we accumulate + $count = $count + (int) $results->getResultCount(); + + } + // This is a good one. If i have multiple indexes, but one is missing the i assume + // reprocessing is needed + // But if not, then i return 1, which means we have them all + // FUTURE thinking is the best. + $return = ($count == count($indexes)) ? 1 : 0; + return $return; + // Keys we need in the Search API + // - ss_search_api_id == $key + // A checksum field == Should be configurable? + // Let's start by naming it checksum? If not present we may trigger some Logger/alert? + // Or maybe we can use D8/D9 Status mechanic to let the user know this module + // needs it in the data flavor. + } + + /** + * Updates a node with data passed from a processors io and original data + * + * @param \Drupal\Core\Entity\ContentEntityInterface $entity + * @param \stdClass $data + * @param \stdClass $io + * + * @throws \Drupal\Core\TypedData\Exception\MissingDataException + */ + public function updateNode(ContentEntityInterface $entity, \stdClass $data, \stdClass $io) { + error_log(print_r($data,true)); + error_log(print_r($io,true)); + /** @var $itemfield \Drupal\strawberryfield\Plugin\Field\FieldType\StrawberryFieldItem */ + + $itemfield = $entity->get($data->field_name)->get($data->field_delta); + $field_content = $itemfield->provideDecoded(TRUE); + if (!isset($field_content['ap:entitymapping']['entity:file']) || + !in_array('flv:'.$data->plugin_config_entity_id, $field_content['ap:entitymapping']['entity:file'])) { + $field_content['ap:entitymapping']['entity:file'][] = 'flv:'.$data->plugin_config_entity_id; + } + + //$oldfiles = $this->entityTypeManager->getStorage('file')->loadByProperties(['uri' => $io->output->file]); + //$newfile = 
$this->entityTypeManager->getStorage('file')->delete($oldfiles); + + $newfile = $this->entityTypeManager->getStorage('file')->create([ + 'uri' => $io->output->file, + 'status' => 0, + ]); + $uniqueid = $data->asstructure_uniqueid; + $jsonkey = $data->asstructure_key; + try { + $newfile->save(); + $newfile->id(); + $field_content['flv:'.$data->plugin_config_entity_id][] = (int) $newfile->id(); + $field_content['flv:'.$data->plugin_config_entity_id] = array_unique($field_content['flv:'.$data->plugin_config_entity_id]); + $field_content[$jsonkey][$uniqueid]['flv:'.$data->plugin_config_entity_id] = $this->addActivityStream($data->plugin_config_entity_id); + $itemfield->setMainValueFromArray($field_content); + // Should we check decide on this? Safer is a new revision, but also an overhead + // $entity->setNewRevision(FALSE); + $entity->save(); + } + catch (\Exception $exception) { + $message_params = [ + '@file_id' => $data->fid, + '@entity_id' => $data->nid, + '@newfile_path' => $io->output->file, + '@message' => $exception->getMessage(), + ]; + $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing failed to update Node and add @newfile_path with message: @message File id @file_id at Node @entity_id.', $message_params); + } + + } + protected function addActivityStream($name = NULL) { + + // We use this to keep track of the webform used to create/update the field's json + $eventBody = [ + 'summary' => 'Generator', + 'endTime' => date('c'), + ]; + + $actor_properties = [ + 'name' => $name ?: 'NaW', + ]; + $event_type = ActivityStream::ASTYPES['Create']; + + $activitystream = new ActivityStream($event_type, $eventBody); + + $activitystream->addActor(ActivityStream::ACTORTYPES['Service'], $actor_properties); + return $activitystream->getAsBody()?:[]; + + } + + + +} diff --git a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php index 1d9cb23..eb2e126 100644 --- 
a/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php @@ -12,8 +12,6 @@ use Drupal\Core\Entity\TranslatableInterface; use Drupal\Core\File\FileSystemInterface; use Drupal\Core\KeyValueStore\KeyValueFactoryInterface; -use Drupal\Core\Plugin\ContainerFactoryPluginInterface; -use Drupal\Core\Queue\QueueWorkerBase; use Drupal\Core\StreamWrapper\StreamWrapperInterface; use Drupal\Core\StreamWrapper\StreamWrapperManagerInterface; use Drupal\file\FileInterface; @@ -36,524 +34,5 @@ * cron = {"time" = 180} * ) */ -class IndexPostProcessorQueueWorker extends QueueWorkerBase implements ContainerFactoryPluginInterface { - - /** - * Drupal\Core\Entity\EntityTypeManager definition. - * - * @var \Drupal\Core\Entity\EntityTypeManager - */ - protected $entityTypeManager; - - /** - * @var \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginManager - */ - private $strawberryRunnerProcessorPluginManager; - - /** - * The file system service. - * - * @var \Drupal\Core\File\FileSystemInterface - */ - protected $fileSystem; - - /** - * The stream wrapper manager. - * - * @var \Drupal\Core\StreamWrapper\StreamWrapperManagerInterface - */ - protected $streamWrapperManager; - - /** - * Key value service. - * - * @var \Drupal\Core\KeyValueStore\KeyValueFactoryInterface - */ - protected $keyValue; - - /** - * The logger service. - * - * @var \Psr\Log\LoggerInterface - */ - protected $logger; - - /** - * The parse mode manager. - * - * @var \Drupal\search_api\ParseMode\ParseModePluginManager - */ - protected $parseModeManager; - - /** - * Constructor. 
- * - * @param array $configuration - * @param string $plugin_id - * @param mixed $plugin_definition - * @param \Drupal\Core\Entity\EntityTypeManager $entity_field_manager - * @param \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginManager $strawberry_runner_processor_plugin_manager - */ - public function __construct(array $configuration, $plugin_id, $plugin_definition, EntityTypeManagerInterface $entity_type_manager, StrawberryRunnersPostProcessorPluginManager $strawberry_runner_processor_plugin_manager, FileSystemInterface $file_system, StreamWrapperManagerInterface $stream_wrapper_manager, KeyValueFactoryInterface $key_value, LoggerInterface $logger, ParseModePluginManager $parse_mode_manager) { - parent::__construct($configuration, $plugin_id, $plugin_definition); - $this->entityTypeManager = $entity_type_manager; - $this->strawberryRunnerProcessorPluginManager = $strawberry_runner_processor_plugin_manager; - $this->fileSystem = $file_system; - $this->streamWrapperManager = $stream_wrapper_manager; - $this->keyValue = $key_value; - $this->logger = $logger; - $this->parseModeManager = $parse_mode_manager; - } - - /** - * Implementation of the container interface to allow dependency injection. - * - * @param \Symfony\Component\DependencyInjection\ContainerInterface $container - * @param array $configuration - * @param string $plugin_id - * @param mixed $plugin_definition - * - * @return static - */ - public static function create(ContainerInterface $container, array $configuration, $plugin_id, $plugin_definition) { - return new static( - empty($configuration) ? 
[] : $configuration, - $plugin_id, - $plugin_definition, - $container->get('entity_type.manager'), - $container->get('strawberry_runner.processor_manager'), - $container->get('file_system'), - $container->get('stream_wrapper_manager'), - $container->get('keyvalue'), - $container->get('logger.channel.strawberry_runners'), - $container->get('plugin.manager.search_api.parse_mode') - ); - } - - /** - * Get the extractor plugin. - * - * @return object - * The plugin. - * - * @throws \Drupal\Component\Plugin\Exception\PluginException - */ - protected function getProcessorPlugin($plugin_config_entity_id) { - // Get extractor configuration. - /* @var $plugin_config_entity \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntityInterface */ - $plugin_config_entity = $this->entityTypeManager->getStorage( - 'strawberry_runners_postprocessor' - )->load($plugin_config_entity_id); - - if ($plugin_config_entity->isActive()) { - $entity_id = $plugin_config_entity->id(); - $configuration_options = $plugin_config_entity->getPluginconfig(); - $configuration_options['configEntity'] = $entity_id; - /* @var \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface $plugin_instance */ - $plugin_instance = $this->strawberryRunnerProcessorPluginManager->createInstance( - $plugin_config_entity->getPluginid(), - $configuration_options - ); - return $plugin_instance; - } - } - - - /** - * Gets all Children of the currently being processed Processor Plugin - * - * @param string $current_id - * - * @return array - */ - private function getChildProcessorIds(string $plugin_config_entity_id):array { - /* @var $plugin_config_entities \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity[] */ - $plugin_config_entities = $this->entityTypeManager->getListBuilder('strawberry_runners_postprocessor') - ->load(); - $active_plugins = []; - // This kids should be cached; - // We basically want here what type of processor this is and its input_argument and 
input_options - $plugin_definitions = $this->strawberryRunnerProcessorPluginManager->getDefinitions(); - - error_log('getting child processors'); - foreach ($plugin_config_entities as $plugin_config_entity) { - // Only get first level (no Parents) and Active ones. - if ($plugin_config_entity->isActive() && $plugin_config_entity->getParent() == $plugin_config_entity_id) { - $active_plugins[] = [ - 'config_entity' => $plugin_config_entity, - 'plugin_definition' => $plugin_definitions[$plugin_config_entity->getPluginid()] - ]; - } - } - return $active_plugins; - } - - /** - * {@inheritdoc} - */ - public function processItem($data) { - - $processor_instance = $this->getProcessorPlugin($data->plugin_config_entity_id); - // Read the Input Properties and Input Arguments of the current Processor - - // Now check if there are any child? - error_log('config for this processor'); - $processor_config = $processor_instance->getConfiguration(); - - if (!isset($data->fid) || $data->fid == NULL || !isset($data->nid) || $data->nid == NULL || !is_array($data->metadata)) { - return; - } - $file = $this->entityTypeManager->getStorage('file')->load($data->fid); - // 0 byte files have checksum, check what it is! - if ($file === NULL || !isset($data->metadata['checksum'])) { - error_log('Sorry the file does not exist or has no checksum yet. We really need the checksum'); - return; - } - //@TODO should we wrap this around a try catch? - $filelocation = $this->ensureFileAvailability($file); - - if ($filelocation === NULL) { - return; - } - // Means we could pass also a file directly anytime - $data->filelocation = $filelocation; - - if (isset($processor_config['output_destination']['searchapi']) && $processor_config['output_destination']['searchapi'] == 'searchapi') { - $tobeindexed = TRUE; - } - - - // Only applies to those that will be indexed - if ($tobeindexed) { - try { - // Get which indexes have our StrawberryfieldFlavorDatasource enabled! 
- $indexes = StrawberryfieldFlavorDatasource::getValidIndexes(); - $keyvalue_collection = 'Strawberryfield_flavor_datasource_temp'; - - //We only deal with NODES. - $entity = $this->entityTypeManager->getStorage('node') - ->load($data->nid); - - if (!$entity) { - return; - } - - $item_ids = []; - $inindex = 1; - $input_property = $processor_instance->getPluginDefinition()['input_property']; - $input_argument = $processor_instance->getPluginDefinition()['input_argument']; - - // @TODO If argument is not here, do we return?? - $data->{$input_argument} = isset($data->{$input_argument}) ? $data->{$input_argument} : 1; - - if (is_a($entity, TranslatableInterface::class)) { - $translations = $entity->getTranslationLanguages(); - foreach ($translations as $translation_id => $translation) { - //@TODO here, the number 1 needs to come from the sequence. - $item_id = $entity->id() . ':' . $data->{$input_argument} . ':' . $translation_id . ':' . $file->uuid() . ':' . $data->plugin_config_entity_id; - // a single 0 as return will force us to reindex. - $inindex = $inindex * $this->flavorInSolrIndex($item_id, $data->metadata['checksum'], $indexes); - $item_ids[] = $item_id; - } - } - - // Check if we already have this entry in Solr - if ($inindex !== 0) { - error_log('Already in search index, skipping'); - } - $inkeystore = TRUE; - // Skip file if element for every language is found in key_value collection. - foreach($item_ids as $item_id) { - $processed_data = $this->keyValue->get($keyvalue_collection) - ->get($item_id); - if (empty($processed_data) || !isset($processed_data->checksum) || - empty($processed_data->checksum) || - $processed_data->checksum != $data->metadata['checksum']) { - $inkeystore = $inkeystore && FALSE; - } - } - //@TODO allow a force in case of corrupted key value? Partial output - // Extragenous weird data? - if ($tobeindexed && ($inindex === 0 || $inkeystore === FALSE) || - $data->force == TRUE) { - // Extract file and save it in key_value collection. 
- $io = $this->invokeProcessor($processor_instance, $data); - - // Check if $io->output exists? - $toindex = new \stdClass(); - $toindex->fulltext = $io->output->searchapi; - $toindex->checksum = $data->metadata['checksum']; - - $datasource_id = 'strawberryfield_flavor_datasource'; - foreach ($indexes as $index) { - // For each language we do this - // Eventually we will want to have different outputs per language? - // But maybe not for HOCR. since the doc will be the same. - foreach($item_ids as $item_id) { - error_log('processing just run'); - error_log('writing to keyvalue'); - error_log($item_id); - $this->keyValue->get($keyvalue_collection) - ->set($item_id, $toindex); - } - $index->trackItemsInserted($datasource_id, $item_ids); - } - } - } catch (\Exception $exception) { - $message_params = [ - '@file_id' => $data->fid, - '@entity_id' => $data->nid, - '@message' => $exception->getMessage(), - ]; - if (!isset($data->extract_attempts)) { - $data->extract_attempts = 0; - $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing failed with message: @message File id @file_id at Node @entity_id.', $message_params); - } - if ($data->extract_attempts < 3) { - $data->extract_attempts++; - \Drupal::queue('strawberryrunners_process_index')->createItem($data); - } - else { - $message_params = [ - '@file_id' => $data->fid, - '@entity_id' => $data->nid, - ]; - $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing failed after 3 attempts File Id @file_id at Node @entity_id.', $message_params); - } - } - } - else { - // This will not - $io = $this->invokeProcessor($processor_instance, $data); - error_log('we do not need to index this'); - error_log(var_export($io, true)); - error_log('we do not need to index this'); - } - - if (isset($io->output->plugin) && !empty($io->output->plugin)) { - error_log('Time to check on children'); - error_log($data->plugin_config_entity_id); - $childprocessors = 
$this->getChildProcessorIds($data->plugin_config_entity_id); - error_log(print_r($childprocessors,true)); - foreach($childprocessors as $plugin_info) { - $childdata = clone $data; // So we do not touch original data - /* @var $strawberry_runners_postprocessor_config \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity */ - $postprocessor_config_entity = $plugin_info['config_entity']; - $postprocessor_plugin_definition = $plugin_info['plugin_definition']; - $input_property = $plugin_info['plugin_definition']['input_property']; - $input_argument = $plugin_info['plugin_definition']['input_argument']; - //@TODO check if this are here and not null! - // $io->ouput will contain whatever the output is - // We will check if the child processor - // contains a property contained in $output - // If so we check if there is a single value or multiple ones - // For each we enqueue a child using that property in its data - - // Possible input properties: - // - Can come from the original Data (most likely) - // - May be overriden by the $io->output, e.g when a processor generates a file that is not part of any node - $input_property_value = isset($io->output->plugin) && isset($io->output->plugin[$input_property]) ? $io->output->plugin[$input_property] : $data->{$input_property}; - // Warning Diego. This may lead to a null - $childdata->{$input_property} = $input_property_value; - $childdata->plugin_config_entity_id = $postprocessor_config_entity->id(); - $input_argument_value = isset($io->output->plugin) && isset($io->output->plugin[$input_argument]) ? $io->output->plugin[$input_argument] : $data->{$input_argument}; - error_log(print_r($input_argument_value,true)); - if (is_array($input_argument_value)) { - foreach ($input_argument_value as $value) { - // Here is the catch. 
- // Output properties may be many - // Input Properties matching always need to be one - if (!is_array($value)) { - $childdata->{$input_argument} = $value; - error_log("should add to queue {$childdata->plugin_config_entity_id}"); - error_log(var_export($childdata,true)); - \Drupal::queue('strawberryrunners_process_index') - ->createItem($childdata); - } - } - } - } - } - - } - - /** - * Move file to local to if needed process. - * - * @param \Drupal\file\FileInterface $file - * The File URI to look at. - * - * @return array - * Output of processing chain for a particular file. - */ - private function ensureFileAvailability(FileInterface $file) { - $uri = $file->getFileUri(); - // Local stream. - $cache_key = md5($uri); - // Check first if the file is already around in temp? - // @TODO can be sure its the same one? Ideas? - if (is_readable( - $this->fileSystem->realpath( - 'temporary://sbr_' . $cache_key . '_' . basename($uri) - ) - )) { - $templocation = $this->fileSystem->realpath( - 'temporary://sbr_' . $cache_key . '_' . basename($uri) - ); - } - else { - $templocation = $this->fileSystem->copy( - $uri, - 'temporary://sbr_' . $cache_key . '_' . basename($uri), - FileSystemInterface::EXISTS_REPLACE - ); - $templocation = $this->fileSystem->realpath( - $templocation - ); - } - - if (!$templocation) { - $this->loggerFactory->get('strawberry_runners')->warning( - 'Could not adquire a local accessible location for text extraction for file with URL @fileurl', - [ - '@fileurl' => $file->getFileUri(), - ] - ); - return FALSE; - } else { - return $templocation; - } - } - - /** - * Helper method to get the real path from an uri. - * - * @param string $uri - * The URI of the file, e.g. public://directory/file.jpg. - * - * @return mixed - * The real path to the file if it is a local file. An URL otherwise. 
- */ - public function getRealpath($uri) { - $wrapper = $this->streamWrapperManager->getViaUri($uri); - $scheme = $this->streamWrapperManager->getScheme($uri); - $local_wrappers = $this->streamWrapperManager->getWrappers(StreamWrapperInterface::LOCAL); - if (in_array($scheme, array_keys($local_wrappers))) { - return $wrapper->realpath(); - } - else { - return $wrapper->getExternalUrl(); - } - } - - /** - * This method actually invokes the processor. - * - * @param StrawberryRunnersPostProcessorPluginInterface $processor_instance - * @param \stdClass $data - * - * @return \stdClass - */ - private function invokeProcessor(StrawberryRunnersPostProcessorPluginInterface $processor_instance, \stdClass $data): \stdClass { - - $input_property = $processor_instance->getPluginDefinition()['input_property']; - $input_argument = $processor_instance->getPluginDefinition()['input_argument']; - - $io = new \stdClass(); - $input = new \stdClass(); - - // @NOTE: this is the only place where we just pass filelocation fixed instead of the - // actual property named $input_property. Which may be weird? - $input->{$input_property} = $data->filelocation; - $input->{$input_argument} = isset($data->{$input_argument}) ? $data->{$input_argument} : 1; - // The Node UUID - $input->nuuid = $data->nuuid; - // All the rest of the associated Metadata in an as:structure - $input->metadata = $data->metadata; - $io->input = $input; - $io->output = NULL; - //@TODO implement the TEST and BENCHMARK logic here - // RUN should return exit codes so we can know if something failed - // And totally discard indexing. 
- $extracted_data = $processor_instance->run($io, StrawberryRunnersPostProcessorPluginInterface::PROCESS); - return $io; - } - - /** - * Checks Search API indexes for an Document ID and Checksum Match - * - * @param string $key - * @param string $checksum - * @param array $indexes - * - * @throws \Drupal\Component\Plugin\Exception\PluginException - * @throws \Drupal\search_api\SearchApiException - */ - public function flavorInSolrIndex(string $key, string $checksum, array $indexes): int { - /* @var \Drupal\search_api\IndexInterface[] $indexes */ - - $count = 0; - foreach ($indexes as $search_api_index) { - - // Create the query. - $query = $search_api_index->query([ - 'limit' => 1, - 'offset' => 0, - ]); - - /*$query->setFulltextFields([ - 'title', - 'body', - 'filename', - 'saa_field_file_document', - 'saa_field_file_news', - 'saa_field_file_page' - ]);*/ - //$parse_mode = $this->parseModeManager->createInstance('direct'); - $parse_mode = $this->parseModeManager->createInstance('terms'); - $query->setParseMode($parse_mode); - // $parse_mode->setConjunction('OR'); - // $query->keys($search); - $query->sort('search_api_relevance', 'DESC'); - - $query->addCondition('search_api_id', 'strawberryfield_flavor_datasource/'.$key) - ->addCondition('search_api_datasource', 'strawberryfield_flavor_datasource') - ->addCondition('checksum', $checksum); - //$query = $query->addCondition('ss_checksum', $checksum); - // If we allow processing here Drupal adds Content Access Check - // That does not match our Data Source \Drupal\search_api\Plugin\search_api\processor\ContentAccess - // we get this filter (see 2nd) - /* - * array ( - 0 => 'ss_search_api_id:"strawberryfield_flavor_datasource/2006:1:en:3dccdb09-f79f-478e-81c5-0bb680c3984e:ocr"', - 1 => 'ss_search_api_datasource:"strawberryfield_flavor_datasource"', - 2 => '{!tag=content_access,content_access_enabled,content_access_grants}(ss_search_api_datasource:"entity:file" (+(bs_status:"true" bs_status_2:"true") 
+(sm_node_grants:"node_access_all:0" sm_node_grants:"node_access__all")))', - 3 => '+index_id:default_solr_index +hash:1evb7z', - 4 => 'ss_search_api_language:("en" "und" "zxx")', - ), - */ - // Another solution would be to make our conditions all together an OR - // But no post processing here is also good, faster and we just want - // to know if its there or not. - $query->setProcessingLevel(QueryInterface::PROCESSING_NONE); - $results = $query->execute(); - - // $solr_response = $results->getExtraData('search_api_solr_response'); - // In case of more than one Index with the same Data Source we accumulate - $count = $count + (int) $results->getResultCount(); - - } - // This is a good one. If i have multiple indexes, but one is missing the i assume - // reprocessing is needed - // But if not, then i return 1, which means we have them all - // FUTURE thinking is the best. - $return = ($count == count($indexes)) ? 1 : 0; - return $return; - // Keys we need in the Search API - // - ss_search_api_id == $key - // A checksum field == Should be configurable? - // Let's start by naming it checksum? If not present we may trigger some Logger/alert? - // Or maybe we can use D8/D9 Status mechanic to let the user know this module - // needs it in the data flavor. 
- } +class IndexPostProcessorQueueWorker extends AbstractPostProcessorQueueWorker { } From 10e8631c293ab50d387ce94a4ce0cd395b541c23 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 4 Dec 2020 01:06:45 -0500 Subject: [PATCH 18/24] Update StrawberryRunnersPostProcessorPluginBase.php Gets some constructor assignments so we can get the temp:// folder defined by the system --- ...rawberryRunnersPostProcessorPluginBase.php | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php index 13c6fe7..dff13b9 100644 --- a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php +++ b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php @@ -8,9 +8,11 @@ namespace Drupal\strawberry_runners\Plugin; +use Drupal\Core\Config\ConfigFactoryInterface; use Drupal\Core\Entity\EntityFieldManagerInterface; use Drupal\Core\Form\FormStateInterface; use Drupal\Core\Plugin\ContainerFactoryPluginInterface; +use Drupal\Core\File\Exception\FileException; use Drupal\Component\Plugin\Exception\PluginException; use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface; use Drupal\Core\Plugin\PluginBase; @@ -26,6 +28,12 @@ abstract class StrawberryRunnersPostProcessorPluginBase extends PluginBase imple use PluginWithFormsTrait; + /** + * Temporary directory setup to be used by Drupal + * @var string + */ + protected $temporary_directory; + /** * @var \Drupal\Core\Entity\EntityTypeManagerInterface; */ @@ -48,13 +56,22 @@ public function __construct( $plugin_definition, EntityTypeManagerInterface $entityTypeManager, EntityTypeBundleInfoInterface $entityTypeBundleInfo, - Client $httpClient + Client $httpClient, + ConfigFactoryInterface $config_factory ) { parent::__construct($configuration, $plugin_id, $plugin_definition); $this->entityTypeBundleInfo = $entityTypeBundleInfo; $this->entityTypeManager = $entityTypeManager; 
$this->setConfiguration($configuration); $this->httpClient = $httpClient; + // For files being processed by a binary, the Queue worker will have made sure + // they are made local + // \Drupal\strawberry_runners\Plugin\QueueWorker\IndexPostProcessorQueueWorker::ensureFileAvailability + $this->temporary_directory = $config_factory->get('system.file') + ->get('path.temporary'); + + + } public static function create(ContainerInterface $container, array $configuration, $plugin_id, $plugin_definition) { @@ -65,7 +82,8 @@ public static function create(ContainerInterface $container, array $configuratio $plugin_definition, $container->get('entity_type.manager'), $container->get('entity_type.bundle.info'), - $container->get('http_client') + $container->get('http_client'), + $container->get('config.factory') ); } From 45ca51496565ec79451a99f98ad1083c5439694f Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 4 Dec 2020 01:07:52 -0500 Subject: [PATCH 19/24] This is the largest change If output as a file is desired a new %output file can be given. A lot of string manipulation to get the file names right back and forth and we may want to clean/generalized and also make some long code into methods we can reuse. But it actually works! WARC into WACZ without any issues! 
--- .../SystemBinaryPostProcessor.php | 70 ++++++++++++++++--- 1 file changed, 62 insertions(+), 8 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php index f3c8e5f..5cd5347 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php @@ -8,6 +8,7 @@ namespace Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessor; +use Drupal\Core\File\Exception\FileNotExistsException; use Drupal\Core\Form\FormStateInterface; use Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor; use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginBase; @@ -28,6 +29,13 @@ */ class SystemBinaryPostProcessor extends StrawberryRunnersPostProcessorPluginBase{ + /** + * Keeps track of a the expected output file in case there is any of course. + * + * @var string|null + */ + protected $out_file_path = NULL; + /** * {@inheritdoc} */ @@ -108,7 +116,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '#type' => 'textfield', '#title' => $this->t('Any additional argument your executable binary requires.'), '#default_value' => !empty($this->getConfiguration()['arguments']) ? $this->getConfiguration()['arguments'] : '%file', - '#description' => t('Any arguments your binary requires to run. Use %file as replacement for the file if the executable requires the filename to be passed under a specific argument.'), + '#description' => t('Any arguments your binary requires to run. Use %file as replacement for the file if the executable requires the filename to be passed under a specific argument. Use %outfile if the binary is intended to generate a new file and the output is going to be a file entity. 
If you know the extension please add it in the form of %outfile.extension'), '#required' => TRUE, ]; @@ -129,6 +137,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '#options' => [ 'subkey' => 'In the same Source Metadata, as a child structure of each Processed file', 'ownkey' => 'In the same Source Metadata but inside its own, top level, "as:flavour" subkey based on the given machine name of the current plugin', + 'file' => 'A new file to be attached to the source ADO', 'plugin' => 'As Input for another processor Plugin', 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)' ], @@ -142,8 +151,8 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '#title' => $this->t('Timeout in seconds for this process.'), '#default_value' => $this->getConfiguration()['timeout'], '#description' => $this->t('If the process runs out of time it can still be processed again.'), - '#size' => 2, - '#maxlength' => 2, + '#size' => 3, + '#maxlength' => 3, '#min' => 1, ]; $element['weight'] = [ @@ -180,9 +189,11 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug // In this case it will contain an absolute Path to a File. // Needed since this executes locally on the server via SHELL. $input_property = $this->pluginDefinition['input_property']; - $input_argument = $this->pluginDefinition['input_arguments']; + $input_argument = $this->pluginDefinition['input_argument']; // NOT user here? $config = $this->getConfiguration(); + $output_type = $config['output_type']; + $output_destination = $config['output_destination']; $timeout = $config['timeout']; // in seconds // TODO how do we map $input_argument to the callable executable binary? 
error_log('run system binary'); @@ -202,13 +213,27 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug throw new \Exception("Could not execute {$execstring} or timed out"); } $output = new \stdClass(); - $output->searchapi = $proc_output; - $output->plugin = $proc_output; + + // If this should generate + if (($output_type == 'entity:file') && in_array('file', $output_destination)) { + error_log($this->out_file_path); + if (!file_exists($this->out_file_path)) { + throw new FileNotExistsException('The output file for this processor failed to be generated and was required'); + } + $output->file = "temporary://".substr($this->out_file_path,strlen($this->temporary_directory)+1); + $output->searchapi = NULL; + $output->plugin = $this->out_file_path; + } else { + $output->file = NULL; + $output->searchapi = $proc_output; + $output->plugin = $proc_output; + } $io->output = $output; + } } else { - \throwException(new \InvalidArgumentException); + throw new \InvalidArgumentException('Invalid/misisng arguments for this processor'); } } @@ -223,12 +248,35 @@ public function buildExecutableCommand(\stdClass $io) { $config = $this->getConfiguration(); $execpath = $config['path']; $arguments = $config['arguments']; + $output_type = $config['output_type']; + $output_destination = $config['output_destination']; $command = ''; $input_property = $this->pluginDefinition['input_property']; $input_argument = $this->pluginDefinition['input_argument']; // Sets the default page to 1 if not passed. $file_path = isset($io->input->{$input_property}) ? 
$io->input->{$input_property} : NULL; + $out_file_path = NULL; + //@ todo all this should be base methods, each method should check and validate what is needed + // So one of the output types and output destinations can operate or not + if (($file_path) && ($output_type == 'entity:file') && in_array('file', $output_destination)) { + $extension = ''; + $pos = strpos(utf8_encode($arguments), utf8_encode('%outfile')); + if ( $pos === FALSE) { + error_log('ups, no input?'); + return NULL; + } + //Ok, let's try to get the an extension if there is one + // will be 4 characters after + $extension = substr($arguments, (int) $pos+8,5); + error_log('the extension:'.$extension); + $extension = trim($extension); + $extension = (strpos($extension,'.') === 0) && strlen($extension) >= 4 ? $extension : ''; + + $out_file_path = $this->temporary_directory.'/'.pathinfo($file_path, PATHINFO_FILENAME); + $this->out_file_path = $out_file_path.$extension; + } + error_log('verify!'.(int) \Drupal::service('strawberryfield.utility')->verifyCommand($execpath)); if (empty($file_path)) { return NULL; @@ -238,7 +286,13 @@ public function buildExecutableCommand(\stdClass $io) { error_log('its a command, well well'); $arguments = str_replace('%s','', $arguments); $arguments = str_replace_first('%file','%s', $arguments); - $arguments = sprintf($arguments, $file_path); + $arguments = str_replace_first('%outfile','%s', $arguments); + if ($out_file_path) { + // WE need the original one without extension here. + $arguments = sprintf($arguments, $file_path, $out_file_path); + } else { + $arguments = sprintf($arguments, $file_path); + } error_log($arguments); $command = escapeshellcmd($execpath.' 
'.$arguments); error_log($command); From 04e5f11bebc9703a6a5dccdb83b8994fc5688798 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 4 Dec 2020 13:55:46 -0500 Subject: [PATCH 20/24] remove deprecated D9 for temp storage Nice catch @giancarlobi --- ...rawberryRunnersPostProcessorPluginBase.php | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php index dff13b9..9b92c7a 100644 --- a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php +++ b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php @@ -10,6 +10,7 @@ use Drupal\Core\Config\ConfigFactoryInterface; use Drupal\Core\Entity\EntityFieldManagerInterface; +use Drupal\Core\File\FileSystemInterface; use Drupal\Core\Form\FormStateInterface; use Drupal\Core\Plugin\ContainerFactoryPluginInterface; use Drupal\Core\File\Exception\FileException; @@ -46,10 +47,17 @@ abstract class StrawberryRunnersPostProcessorPluginBase extends PluginBase imple /** * @var \Drupal\Core\Entity\EntityTypeBundleInfoInterface + * */ - protected $entityTypeBundleInfo; + /** + * The file system. 
+ * + * @var \Drupal\Core\File\FileSystemInterface + */ + protected $fileSystem; + public function __construct( array $configuration, string $plugin_id, @@ -57,7 +65,8 @@ public function __construct( EntityTypeManagerInterface $entityTypeManager, EntityTypeBundleInfoInterface $entityTypeBundleInfo, Client $httpClient, - ConfigFactoryInterface $config_factory + ConfigFactoryInterface $config_factory, + FileSystemInterface $file_system ) { parent::__construct($configuration, $plugin_id, $plugin_definition); $this->entityTypeBundleInfo = $entityTypeBundleInfo; @@ -67,10 +76,8 @@ public function __construct( // For files being processed by a binary, the Queue worker will have made sure // they are made local // \Drupal\strawberry_runners\Plugin\QueueWorker\IndexPostProcessorQueueWorker::ensureFileAvailability - $this->temporary_directory = $config_factory->get('system.file') - ->get('path.temporary'); - - + $this->fileSystem = $file_system; + $this->temporary_directory = $this->fileSystem->getTempDirectory(); } @@ -83,7 +90,8 @@ public static function create(ContainerInterface $container, array $configuratio $container->get('entity_type.manager'), $container->get('entity_type.bundle.info'), $container->get('http_client'), - $container->get('config.factory') + $container->get('config.factory'), + $container->get('file_system') ); } From 122e77848f271b1c3eb5058d9d4c959765574834 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 6 Dec 2020 15:33:29 -0500 Subject: [PATCH 21/24] Address @giancarlobi review (comparison operation) and does some general DCS cleanup Still need to remove all the error_logs and make the whole reporting on issues/missing properties more consistent. Tested and this is working on searchapi indexing, File adding and processor chaining. 
--- config/schema/strawberry_runners.schema.yml | 32 +++++ src/Plugin/.DS_Store | Bin 0 -> 6148 bytes .../AbstractPostProcessorQueueWorker.php | 125 +++++++++++------- .../JsonFileSequencePostProcessor.php | 11 +- .../OcrPostProcessor.php | 2 +- .../SystemBinaryPostProcessor.php | 2 +- .../WarcExtractionPostProcessor.php | 4 +- ...rawberryRunnersPostProcessorPluginBase.php | 2 +- ...rryRunnersPostProcessorPluginInterface.php | 13 +- 9 files changed, 133 insertions(+), 58 deletions(-) create mode 100644 src/Plugin/.DS_Store diff --git a/config/schema/strawberry_runners.schema.yml b/config/schema/strawberry_runners.schema.yml index 074595b..4c101fe 100644 --- a/config/schema/strawberry_runners.schema.yml +++ b/config/schema/strawberry_runners.schema.yml @@ -110,3 +110,35 @@ strawberryfield_runners.strawberry_runners_postprocessor.ocr: weight: type: integer label: 'Order or execution in the global chain' +strawberryfield_runners.strawberry_runners_postprocessor.filesequence: + type: config_object + label: 'Strawberry Runners Post Processor Config Entity JSON sequence specific config' + mapping: + source_type: + type: string + label: 'The type of Source Data this Processor works on' + ado_type: + type: string + label: 'DO type(s) to limit this Processor to' + jsonkey: + type: sequence + label: 'The JSON key(s) containing the desired Source File(s)' + sequence: + - type: string + mime_type: + type: string + label: 'Mimetypes(s) to limit this Processor to' + output_type: + type: string + label: 'The expected and desired output of this processor' + output_destination: + type: sequence + label: 'Where and how the output will be used' + sequence: + - type: string + timeout: + type: integer + label: 'Timeout in seconds for this process' + weight: + type: integer + label: 'Order or execution in the global chain' diff --git a/src/Plugin/.DS_Store b/src/Plugin/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..cee7ebadc1db3050ac7e0e6a872973c3a65a6580 
GIT binary patch literal 6148 zcmeHKJx{|x41IdMO zS`OWBhkJCidt%SomfjslhyJ7K+xM=0^LMy>NP3fjWFQ$x29kkf;KYE|UdsH&F>NxC z3?u_52K0U?RK*%NINH@gV=Vx2Mz;y))=LnJI*2uJaHNMKo=Wsoi55dVo&FMWHE?kB zbVxKG5+_S^C}O8`|6<{g>XEmN^?yVCrT-r%*V9K))wj`^+yentityTypeManager = $entity_type_manager; $this->strawberryRunnerProcessorPluginManager = $strawberry_runner_processor_plugin_manager; @@ -126,7 +128,7 @@ public static function create(ContainerInterface $container, array $configuratio /** * Get the extractor plugin. * - * @return object + * @return StrawberryRunnersPostProcessorPluginInterface|NULL * The plugin. * * @throws \Drupal\Component\Plugin\Exception\PluginException @@ -149,6 +151,7 @@ protected function getProcessorPlugin($plugin_config_entity_id) { ); return $plugin_instance; } + return NULL; } @@ -159,7 +162,7 @@ protected function getProcessorPlugin($plugin_config_entity_id) { * * @return array */ - private function getChildProcessorIds(string $plugin_config_entity_id):array { + private function getChildProcessorIds(string $plugin_config_entity_id): array { /* @var $plugin_config_entities \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity[] */ $plugin_config_entities = $this->entityTypeManager->getListBuilder('strawberry_runners_postprocessor') ->load(); @@ -174,8 +177,8 @@ private function getChildProcessorIds(string $plugin_config_entity_id):array { if ($plugin_config_entity->isActive() && $plugin_config_entity->getParent() == $plugin_config_entity_id) { $active_plugins[] = [ 'config_entity' => $plugin_config_entity, - 'plugin_definition' => $plugin_definitions[$plugin_config_entity->getPluginid()] - ]; + 'plugin_definition' => $plugin_definitions[$plugin_config_entity->getPluginid()], + ]; } } return $active_plugins; @@ -187,7 +190,10 @@ private function getChildProcessorIds(string $plugin_config_entity_id):array { public function processItem($data) { $processor_instance = $this->getProcessorPlugin($data->plugin_config_entity_id); - + if 
(!$processor_instance) { + $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing aborted because the @processor may be inactive', ['@processor' => $processor_instance->label()]); + return; + } $processor_config = $processor_instance->getConfiguration(); if (!isset($data->fid) || $data->fid == NULL || !isset($data->nid) || $data->nid == NULL || !is_array($data->metadata)) { @@ -216,13 +222,34 @@ public function processItem($data) { } // Means we could pass also a file directly anytime $data->filelocation = $filelocation; - $tobeindexed = FALSE; - error_log(in_array('searchapi',$processor_config['output_destination'])); + + + if (!isset($processor_config['output_destination']) || !is_array($processor_config['output_destination'])) { + $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing aborted because there is no output destination setup for @processor', ['@processor' => $processor_instance->label()]); + return; + } + + $enabled_processor_output_types = array_intersect_assoc(StrawberryRunnersPostProcessorPluginInterface::OUTPUT_TYPE, $processor_config['output_destination']); + // make all this options constants - if (array_key_exists('searchapi', $processor_config['output_destination']) && $processor_config['output_destination']['searchapi'] == 'searchapi') { + + $tobeindexed = FALSE; + $tobeupdated = FALSE; + $tobechained = FALSE; + error_log(print_r($enabled_processor_output_types, true)); + if (array_key_exists('searchapi', $enabled_processor_output_types) && $enabled_processor_output_types['searchapi'] === 'searchapi') { error_log("processor says this goes into Solr"); $tobeindexed = TRUE; } + if (array_key_exists('file', $enabled_processor_output_types) && $enabled_processor_output_types['file'] === 'file') { + error_log("processor says this goes into Solr"); + $tobeupdated = TRUE; + } + if (array_key_exists('plugin', $enabled_processor_output_types) && $enabled_processor_output_types['plugin'] === 'plugin') { + error_log("processor 
says this goes into Solr"); + $tobechained = TRUE; + } + // Only applies to those that will be indexed if ($tobeindexed) { @@ -255,7 +282,7 @@ public function processItem($data) { } $inkeystore = TRUE; // Skip file if element for every language is found in key_value collection. - foreach($item_ids as $item_id) { + foreach ($item_ids as $item_id) { $processed_data = $this->keyValue->get($keyvalue_collection) ->get($item_id); if (empty($processed_data) || !isset($processed_data->checksum) || @@ -266,13 +293,13 @@ public function processItem($data) { } //@TODO allow a force in case of corrupted key value? Partial output // Extragenoxus weird data? - if ($tobeindexed && ($inindex === 0 || $inkeystore === FALSE) || + if (($inindex === 0 || $inkeystore === FALSE) || $data->force == TRUE) { // Extract file and save it in key_value collection. $io = $this->invokeProcessor($processor_instance, $data); // Check if $io->output exists? - $toindex = new \stdClass(); + $toindex = new stdClass(); $toindex->fulltext = $io->output->searchapi; $toindex->checksum = $data->metadata['checksum']; @@ -281,7 +308,7 @@ public function processItem($data) { // For each language we do this // Eventually we will want to have different outputs per language? // But maybe not for HOCR. since the doc will be the same. 
- foreach($item_ids as $item_id) { + foreach ($item_ids as $item_id) { error_log('processing just run'); error_log('writing to keyvalue'); error_log($item_id); @@ -291,7 +318,7 @@ public function processItem($data) { $index->trackItemsInserted($datasource_id, $item_ids); } } - } catch (\Exception $exception) { + } catch (Exception $exception) { $message_params = [ '@file_id' => $data->fid, '@entity_id' => $data->nid, @@ -303,7 +330,7 @@ public function processItem($data) { } if ($data->extract_attempts < 3) { $data->extract_attempts++; - \Drupal::queue('strawberryrunners_process_index')->createItem($data); + Drupal::queue('strawberryrunners_process_index')->createItem($data); } else { $message_params = [ @@ -318,25 +345,24 @@ public function processItem($data) { // This will not $io = $this->invokeProcessor($processor_instance, $data); error_log('we do not need to index this'); - error_log(var_export($io, true)); + error_log(var_export($io, TRUE)); error_log('we do not need to index this'); } // Means we got a file back from the processor - if (isset($io->output->file) && !empty($io->output->file)) { + if ($tobeupdated && isset($io->output->file) && !empty($io->output->file)) { $this->updateNode($entity, $data, $io); error_log('we got a file'); } // Chains a new Processor into the QUEUE, if there are any children - if (isset($io->output->plugin) && !empty($io->output->plugin)) { + if ($tobechained && isset($io->output->plugin) && !empty($io->output->plugin)) { error_log('Time to check on children'); error_log($data->plugin_config_entity_id); $childprocessors = $this->getChildProcessorIds($data->plugin_config_entity_id); - error_log(print_r($childprocessors,true)); - foreach($childprocessors as $plugin_info) { + error_log(print_r($childprocessors, TRUE)); + foreach ($childprocessors as $plugin_info) { $childdata = clone $data; // So we do not touch original data /* @var $strawberry_runners_postprocessor_config 
\Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity */ $postprocessor_config_entity = $plugin_info['config_entity']; - $postprocessor_plugin_definition = $plugin_info['plugin_definition']; $input_property = $plugin_info['plugin_definition']['input_property']; $input_argument = $plugin_info['plugin_definition']['input_argument']; //@TODO check if this are here and not null! @@ -354,7 +380,7 @@ public function processItem($data) { $childdata->{$input_property} = $input_property_value; $childdata->plugin_config_entity_id = $postprocessor_config_entity->id(); $input_argument_value = isset($io->output->plugin) && isset($io->output->plugin[$input_argument]) ? $io->output->plugin[$input_argument] : $data->{$input_argument}; - error_log(print_r($input_argument_value,true)); + error_log(print_r($input_argument_value, TRUE)); if (is_array($input_argument_value)) { foreach ($input_argument_value as $value) { // Here is the catch. @@ -363,15 +389,14 @@ public function processItem($data) { if (!is_array($value)) { $childdata->{$input_argument} = $value; error_log("should add to queue {$childdata->plugin_config_entity_id}"); - error_log(var_export($childdata,true)); - \Drupal::queue('strawberryrunners_process_index') + error_log(var_export($childdata, TRUE)); + Drupal::queue('strawberryrunners_process_index') ->createItem($childdata); } } } } } - } /** @@ -417,7 +442,8 @@ private function ensureFileAvailability(FileInterface $file) { ] ); return FALSE; - } else { + } + else { return $templocation; } } @@ -451,13 +477,13 @@ public function getRealpath($uri) { * * @return \stdClass */ - private function invokeProcessor(StrawberryRunnersPostProcessorPluginInterface $processor_instance, \stdClass $data): \stdClass { + private function invokeProcessor(StrawberryRunnersPostProcessorPluginInterface $processor_instance, stdClass $data): stdClass { $input_property = $processor_instance->getPluginDefinition()['input_property']; $input_argument = 
$processor_instance->getPluginDefinition()['input_argument']; - $io = new \stdClass(); - $input = new \stdClass(); + $io = new stdClass(); + $input = new stdClass(); // @NOTE: this is the only place where we just pass filelocation fixed instead of the // actual property named $input_property. Which may be weird? @@ -510,10 +536,10 @@ public function flavorInSolrIndex(string $key, string $checksum, array $indexes) $parse_mode = $this->parseModeManager->createInstance('terms'); $query->setParseMode($parse_mode); // $parse_mode->setConjunction('OR'); - // $query->keys($search); + // $query->keys($search); $query->sort('search_api_relevance', 'DESC'); - $query->addCondition('search_api_id', 'strawberryfield_flavor_datasource/'.$key) + $query->addCondition('search_api_id', 'strawberryfield_flavor_datasource/' . $key) ->addCondition('search_api_datasource', 'strawberryfield_flavor_datasource') ->addCondition('checksum', $checksum); //$query = $query->addCondition('ss_checksum', $checksum); @@ -563,17 +589,17 @@ public function flavorInSolrIndex(string $key, string $checksum, array $indexes) * * @throws \Drupal\Core\TypedData\Exception\MissingDataException */ - public function updateNode(ContentEntityInterface $entity, \stdClass $data, \stdClass $io) { - error_log(print_r($data,true)); - error_log(print_r($io,true)); + public function updateNode(ContentEntityInterface $entity, stdClass $data, stdClass $io) { + error_log(print_r($data, TRUE)); + error_log(print_r($io, TRUE)); /** @var $itemfield \Drupal\strawberryfield\Plugin\Field\FieldType\StrawberryFieldItem */ - $itemfield = $entity->get($data->field_name)->get($data->field_delta); - $field_content = $itemfield->provideDecoded(TRUE); - if (!isset($field_content['ap:entitymapping']['entity:file']) || - !in_array('flv:'.$data->plugin_config_entity_id, $field_content['ap:entitymapping']['entity:file'])) { - $field_content['ap:entitymapping']['entity:file'][] = 'flv:'.$data->plugin_config_entity_id; - } + $itemfield = 
$entity->get($data->field_name)->get($data->field_delta); + $field_content = $itemfield->provideDecoded(TRUE); + if (!isset($field_content['ap:entitymapping']['entity:file']) || + !in_array('flv:' . $data->plugin_config_entity_id, $field_content['ap:entitymapping']['entity:file'])) { + $field_content['ap:entitymapping']['entity:file'][] = 'flv:' . $data->plugin_config_entity_id; + } //$oldfiles = $this->entityTypeManager->getStorage('file')->loadByProperties(['uri' => $io->output->file]); //$newfile = $this->entityTypeManager->getStorage('file')->delete($oldfiles); @@ -587,25 +613,25 @@ public function updateNode(ContentEntityInterface $entity, \stdClass $data, \std try { $newfile->save(); $newfile->id(); - $field_content['flv:'.$data->plugin_config_entity_id][] = (int) $newfile->id(); - $field_content['flv:'.$data->plugin_config_entity_id] = array_unique($field_content['flv:'.$data->plugin_config_entity_id]); - $field_content[$jsonkey][$uniqueid]['flv:'.$data->plugin_config_entity_id] = $this->addActivityStream($data->plugin_config_entity_id); + $field_content['flv:' . $data->plugin_config_entity_id][] = (int) $newfile->id(); + $field_content['flv:' . $data->plugin_config_entity_id] = array_unique($field_content['flv:' . $data->plugin_config_entity_id]); + $field_content[$jsonkey][$uniqueid]['flv:' . $data->plugin_config_entity_id] = $this->addActivityStream($data->plugin_config_entity_id); $itemfield->setMainValueFromArray($field_content); // Should we check decide on this? 
Safer is a new revision, but also an overhead // $entity->setNewRevision(FALSE); $entity->save(); - } - catch (\Exception $exception) { + } catch (Exception $exception) { $message_params = [ '@file_id' => $data->fid, '@entity_id' => $data->nid, '@newfile_path' => $io->output->file, '@message' => $exception->getMessage(), ]; - $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing failed to update Node and add @newfile_path with message: @message File id @file_id at Node @entity_id.', $message_params); + $this->logger->log(LogLevel::ERROR, 'Strawberry Runners Processing failed to update Node and add @newfile_path with message: @message File id @file_id at Node @entity_id.', $message_params); } } + protected function addActivityStream($name = NULL) { // We use this to keep track of the webform used to create/update the field's json @@ -617,15 +643,14 @@ protected function addActivityStream($name = NULL) { $actor_properties = [ 'name' => $name ?: 'NaW', ]; - $event_type = ActivityStream::ASTYPES['Create']; + $event_type = ActivityStream::ASTYPES['Create']; $activitystream = new ActivityStream($event_type, $eventBody); $activitystream->addActor(ActivityStream::ACTORTYPES['Service'], $actor_properties); - return $activitystream->getAsBody()?:[]; + return $activitystream->getAsBody() ?: []; } - } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php index d035628..69e09ac 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/JsonFileSequencePostProcessor.php @@ -37,7 +37,7 @@ public function defaultConfiguration() { 'source_type' => 'asstructure', 'mime_type' => ['application/pdf'], 'output_type' => 'json', - 'output_destination' => 'plugin', + 'output_destination' => ['plugin' =>'plugin'], ] + parent::defaultConfiguration(); } @@ -78,6 +78,13 @@ public function 
settingsForm(array $parents, FormStateInterface $form_state) { '#required' => TRUE, ]; + // Because we are using the default entity Form, we want to ensure the + // Settings for contains all the values + $element['output_destination'] = [ + '#type' => 'value', + '#default_value' => $this->defaultConfiguration()['output_destination'] + ]; + $element['mime_type'] = [ '#type' => 'textfield', '#title' => $this->t('Mimetypes(s) to limit this Processor to.'), @@ -152,7 +159,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $io->output = $output; } else { - \throwException(new \InvalidArgumentException); + throw new \InvalidArgumentException(\sprintf("Invalid arguments passed to %s",$this->getPluginId())); } } } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php index 10628ca..b46f03d 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -41,7 +41,7 @@ public function defaultConfiguration() { 'arguments' => '', 'arguments_tesseract' => '', 'output_type' => 'json', - 'output_destination' => 'subkey', + 'output_destination' => 'searchapi', ] + parent::defaultConfiguration(); } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php index 5cd5347..f2c959c 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php @@ -233,7 +233,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug } } else { - throw new \InvalidArgumentException('Invalid/misisng arguments for this processor'); + throw new \InvalidArgumentException(\sprintf("Invalid arguments passed to %s",$this->getPluginId())); } } diff --git 
a/src/Plugin/StrawberryRunnersPostProcessor/WarcExtractionPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WarcExtractionPostProcessor.php index b64bc1d..eb8b567 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/WarcExtractionPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/WarcExtractionPostProcessor.php @@ -174,7 +174,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug } else { - \throwException(new \InvalidArgumentException); + throw new \InvalidArgumentException(\sprintf("Invalid arguments passed to %s",$this->getPluginId())); } } -} \ No newline at end of file +} diff --git a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php index 9b92c7a..8bca1b1 100644 --- a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php +++ b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php @@ -102,6 +102,7 @@ public function defaultConfiguration() { return [ 'jsonkey' => ['as:image'], 'ado_type' => ['Book'], + 'output_destination' => ['plugin' => 'plugin'], // Max time to run in seconds per item. 
'timeout' => 10, // Order in which this processor is executed in the chain @@ -134,7 +135,6 @@ public function label() { * {@inheritdoc} */ public function setConfiguration(array $configuration) { - $this->configuration = $configuration + $this->defaultConfiguration(); } diff --git a/src/Plugin/StrawberryRunnersPostProcessorPluginInterface.php b/src/Plugin/StrawberryRunnersPostProcessorPluginInterface.php index 5e1090a..228ee77 100644 --- a/src/Plugin/StrawberryRunnersPostProcessorPluginInterface.php +++ b/src/Plugin/StrawberryRunnersPostProcessorPluginInterface.php @@ -37,6 +37,17 @@ interface StrawberryRunnersPostProcessorPluginInterface extends PluginInspection */ const BENCHMARK = 2; + /** + * Different Types of Outputs a processor can have + */ + const OUTPUT_TYPE = [ + 'subkey' => 'subkey', + 'ownkey' => 'ownkey', + 'file' => 'file', + 'plugin' => 'plugin', + 'searchapi' => 'searchapi' + ]; + /** * Provides a list of Post Processor Plugins * @@ -69,4 +80,4 @@ public function onDependencyRemoval(array $dependencies); */ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPluginInterface::PROCESS); -} \ No newline at end of file +} From 02abe542b57a6c2f591298e66b0673030b3d558d Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 6 Dec 2020 22:45:58 -0500 Subject: [PATCH 22/24] Address Code review from @giancarlobi --- .../StrawberryRunnersEventInsertPostProcessingSubscriber.php | 2 +- .../StrawberryRunnersEventSavePostProcessingSubscriber.php | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/EventSubscriber/StrawberryRunnersEventInsertPostProcessingSubscriber.php b/src/EventSubscriber/StrawberryRunnersEventInsertPostProcessingSubscriber.php index b72a173..b367560 100644 --- a/src/EventSubscriber/StrawberryRunnersEventInsertPostProcessingSubscriber.php +++ b/src/EventSubscriber/StrawberryRunnersEventInsertPostProcessingSubscriber.php @@ -106,7 +106,7 @@ class 
StrawberryRunnersEventInsertPostProcessingSubscriber extends Strawberryfie protected $account; /** - * StrawberryRunnersEventPreSavePostProcessingSubscriber constructor. + * StrawberryRunnersEventInsertPostProcessingSubscriber constructor. * * @param \Drupal\Core\StringTranslation\TranslationInterface $string_translation * @param \Drupal\Core\Messenger\MessengerInterface $messenger diff --git a/src/EventSubscriber/StrawberryRunnersEventSavePostProcessingSubscriber.php b/src/EventSubscriber/StrawberryRunnersEventSavePostProcessingSubscriber.php index a517eb1..580af83 100644 --- a/src/EventSubscriber/StrawberryRunnersEventSavePostProcessingSubscriber.php +++ b/src/EventSubscriber/StrawberryRunnersEventSavePostProcessingSubscriber.php @@ -106,7 +106,7 @@ class StrawberryRunnersEventSavePostProcessingSubscriber extends Strawberryfield protected $account; /** - * StrawberryRunnersEventPreSavePostProcessingSubscriber constructor. + * StrawberryRunnersEventSavePostProcessingSubscriber constructor. 
* * @param \Drupal\Core\StringTranslation\TranslationInterface $string_translation * @param \Drupal\Core\Messenger\MessengerInterface $messenger From 4c54c940a89cff98df809f1a40149afcc4d995ed Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 7 Dec 2020 08:56:20 -0500 Subject: [PATCH 23/24] Drupal 9 in the .info @giancarlobi as requested --- composer.json | 3 ++- strawberry_runners.info.yml | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/composer.json b/composer.json index b9a9b0a..3603f4e 100644 --- a/composer.json +++ b/composer.json @@ -1,6 +1,6 @@ { "name": "strawberryfield/strawberry_runners", - "description": "Strawberryfield post processing module for Drupal 8 using PHP React", + "description": "Strawberryfield post processing module for Drupal 8/9 using PHP React", "type": "drupal-module", "license": "GPL-2.0+", "homepage": "https://github.com/esmero/strawberry_runners", @@ -19,6 +19,7 @@ } ], "require": { + "drupal/core": "^8.9 || ^9", "ml/json-ld": "^1.0", "drupal/search_api_solr": "~4.1", "mtdowling/jmespath.php": "^2.4", diff --git a/strawberry_runners.info.yml b/strawberry_runners.info.yml index 929aaaa..c416fc1 100644 --- a/strawberry_runners.info.yml +++ b/strawberry_runners.info.yml @@ -2,9 +2,9 @@ name: Strawberry Runners description: Provides Post processing integration based on Strawberryfield JSON data. 
package: Archipelago type: module -core: 8.x +core_version_requirement: ^8.9 || ^9 php: 7.1 dependencies: - - 'drupal:system (>= 8.8)' - - 'drupal:user' + - 'drupal:views' - 'strawberryfield' + - 'search_api_solr' From 91ccc23c25a2ae1ce9dd49d92077f7bdbe825b98 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 15 Dec 2020 09:35:25 -0500 Subject: [PATCH 24/24] Update hook for missing entity @giancarlobi for you friend --- .../AbstractPostProcessorQueueWorker.php | 7 ++++++- .../OcrPostProcessor.php | 2 +- strawberry_runners.install | 16 +++++++++++++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php index 3b5b89b..a149081 100644 --- a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -86,8 +86,13 @@ abstract class AbstractPostProcessorQueueWorker extends QueueWorkerBase implemen * @param array $configuration * @param string $plugin_id * @param mixed $plugin_definition - * @param \Drupal\Core\Entity\EntityTypeManager $entity_field_manager + * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entity_type_manager * @param \Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginManager $strawberry_runner_processor_plugin_manager + * @param \Drupal\Core\File\FileSystemInterface $file_system + * @param \Drupal\Core\StreamWrapper\StreamWrapperManagerInterface $stream_wrapper_manager + * @param \Drupal\Core\KeyValueStore\KeyValueFactoryInterface $key_value + * @param \Psr\Log\LoggerInterface $logger + * @param \Drupal\search_api\ParseMode\ParseModePluginManager $parse_mode_manager */ public function __construct(array $configuration, $plugin_id, $plugin_definition, EntityTypeManagerInterface $entity_type_manager, StrawberryRunnersPostProcessorPluginManager $strawberry_runner_processor_plugin_manager, FileSystemInterface $file_system, 
StreamWrapperManagerInterface $stream_wrapper_manager, KeyValueFactoryInterface $key_value, LoggerInterface $logger, ParseModePluginManager $parse_mode_manager) { parent::__construct($configuration, $plugin_id, $plugin_definition); diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php index b46f03d..20ad58e 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -341,7 +341,7 @@ protected function hOCRtoMiniOCR($output, $pageid) { // NOTE: floats are in the form of .1 so we need to remove the first 0. if (count($coos)) { $miniocr->startElement("p"); - $miniocr->writeAttribute("xml:id", $pageid); + $miniocr->writeAttribute("xml:id", 'sequence_'.$pageid); $miniocr->writeAttribute("wh", ltrim($pwidth, 0) . " " . ltrim($pheight, 0)); $miniocr->startElement("b"); $page->registerXPathNamespace('ns', 'http://www.w3.org/1999/xhtml'); diff --git a/strawberry_runners.install b/strawberry_runners.install index 2dd7421..de03244 100644 --- a/strawberry_runners.install +++ b/strawberry_runners.install @@ -8,7 +8,7 @@ use Drupal\Core\StringTranslation\TranslatableMarkup; * * @see 'https://www.drupal.org/node/3034742' */ -function strawberry_runners_update_8100() { +function strawberry_runners_update_8001() { \Drupal::entityDefinitionUpdateManager()->installEntityType(new ConfigEntityType([ 'id' => 'strawberry_runners_postprocessor', 'label' => new TranslatableMarkup('Strawberry Runners Post Processor Configuration'), @@ -34,3 +34,17 @@ function strawberry_runners_update_8100() { ], ])); } +/** + * Update 8002 - Create AMI Set entity. 
+ */
+function strawberry_runners_update_8002() {
+  // Installs runneritem_entity. db_table_exists() is gone in Drupal 9, so ask the schema API.
+  if (\Drupal::database()->schema()->tableExists('runneritem_entity')) {
+    return 'runner item entity already exists';
+  }
+  $entity_type_manager = \Drupal::entityTypeManager();
+  $entity_type_manager->clearCachedDefinitions();
+  \Drupal::entityDefinitionUpdateManager()
+    ->installEntityType($entity_type_manager->getDefinition('runneritem_entity'));
+  // No return value: the update is reported as applied without an extra message.
+}