From 1f80f527ac47b454ac7c9502260ff49ab92b61bb Mon Sep 17 00:00:00 2001 From: Matias De lellis Date: Tue, 22 Aug 2023 20:09:40 -0300 Subject: [PATCH] Implement the Chinese Whispers clustering algorithm in native PHP To the happiness of many (Issue #690, #688, #687, #685, #649, #632, extension, but it goes without saying that its use is still highly recommended. You will understand that it is slower; however, I must admit that with JIT enabled it is quite acceptable, and this is the only reason why I decided to publish it. It is still experimental, and it works, but it has problems: for example, it does not seem to converge to stable clusters. When I can fix this, it will probably be even slower. --- appinfo/info.xml | 6 +- .../Tasks/CheckRequirementsTask.php | 16 +- .../Tasks/CreateClustersTask.php | 32 +++- lib/Clusterer/ChineseWhispers.php | 155 ++++++++++++++++++ lib/Model/ExternalModel/ExternalModel.php | 4 - 5 files changed, 187 insertions(+), 26 deletions(-) create mode 100644 lib/Clusterer/ChineseWhispers.php diff --git a/appinfo/info.xml b/appinfo/info.xml index 7cffd91a..b59e75bd 100644 --- a/appinfo/info.xml +++ b/appinfo/info.xml @@ -16,7 +16,7 @@ - **🚀 Build your own thing:** FaceRecognition app is just a basic building block. Through FaceRecognition API, you can build your advanced scenarios - automatically add tags to images, connect contacts and persons, share images from specific person… We want to hear your ideas! 
]]> - 0.9.20 + 0.9.30 agpl Matias De lellis Branko Kokanovic @@ -34,9 +34,7 @@ https://matiasdelellis.github.io/img/facerecognition/facerecognition-assign-initial-name.jpeg - pdlib - bz2 - + diff --git a/lib/BackgroundJob/Tasks/CheckRequirementsTask.php b/lib/BackgroundJob/Tasks/CheckRequirementsTask.php index cdd048da..9c442b30 100644 --- a/lib/BackgroundJob/Tasks/CheckRequirementsTask.php +++ b/lib/BackgroundJob/Tasks/CheckRequirementsTask.php @@ -90,29 +90,23 @@ public function execute(FaceRecognitionContext $context) { $phpMemory = MemoryLimits::getPhpMemory(); $this->logDebug("PHP Memory Limit: " . ($phpMemory > 0 ? $phpMemory : "Unknown")); + $this->logDebug("Clustering backend: " . (Requirements::pdlibLoaded() ? "pdlib" : "PHP (Not recommended)")); + if ($this->imaginaryHelper->isEnabled()) { - $this->logDebug("Backend of images: Imaginary"); + $this->logDebug("Image Backend: Imaginary"); $version = $this->imaginaryHelper->getVersion(); if ($version) { $this->logDebug("Imaginary version: " . $version); } else { $imaginaryUrl = $this->imaginaryHelper->getUrl(); $error_message = - "An Imaginary service (" . $imaginaryUrl . ") was configured to manage temporary images, but it is inaccessible." . + "An Imaginary service (" . $imaginaryUrl . ") was configured to manage temporary images, but it is inaccessible. " . "Check out the service, or set the 'preview_imaginary_url' key appropriately."; $this->logInfo($error_message); return false; } } else { - $this->logDebug("Backend of images: Imagick"); - } - - if (!Requirements::pdlibLoaded()) { - $error_message = - "The PDlib PHP extension is not loaded. Cannot continue without it." . 
- "Please read the documentation again about how to install the application: https://github.com/matiasdelellis/facerecognition/wiki/Installation"; - $this->logInfo($error_message); - return false; + $this->logDebug("Image Backend: Imagick"); } if (!Requirements::hasEnoughMemory()) { diff --git a/lib/BackgroundJob/Tasks/CreateClustersTask.php b/lib/BackgroundJob/Tasks/CreateClustersTask.php index 8eee41f1..dc04c14e 100644 --- a/lib/BackgroundJob/Tasks/CreateClustersTask.php +++ b/lib/BackgroundJob/Tasks/CreateClustersTask.php @@ -1,6 +1,6 @@ + * @copyright Copyright (c) 2017-2023 Matias De lellis * @copyright Copyright (c) 2018, Branko Kokanovic * * @author Branko Kokanovic @@ -33,6 +33,9 @@ use OCA\FaceRecognition\Db\PersonMapper; use OCA\FaceRecognition\Helper\Euclidean; +use OCA\FaceRecognition\Helper\Requirements; + +use OCA\FaceRecognition\Clusterer\ChineseWhispers; use OCA\FaceRecognition\Service\SettingsService; /** @@ -282,10 +285,9 @@ private function getNewClusters(array $faces): array { // Clustering parameters $sensitivity = $this->settingsService->getSensitivity(); - // Create edges for chinese whispers - $edges = array(); - - if (version_compare(phpversion('pdlib'), '1.0.2', '>=')) { + if (Requirements::pdlibLoaded()) { + // pdlib is loaded: create edges (neighbors) for the native dlib Chinese Whispers + $edges = array(); $faces_count = count($faces); for ($i = 0; $i < $faces_count; $i++) { $face1 = $faces[$i]; @@ -304,8 +306,14 @@ private function getNewClusters(array $faces): array { } } } + + // Given the edges get the list of labels (found clusters) for each face. + $newChineseClustersByIndex = dlib_chinese_whispers($edges); } else { + // pdlib is not loaded: create edges (neighbors) for the PHP Chinese Whispers fallback + $edges = array(); $faces_count = count($faces); + for ($i = 0; $i < $faces_count; $i++) { $face1 = $faces[$i]; if (!isset($face1->descriptor)) { @@ -323,9 +331,20 @@ private function getNewClusters(array $faces): array { } } } + + // The clustering algorithm actually expects ordered lists. 
+ $oedges = []; + ChineseWhispers::convert_unordered_to_ordered($edges, $oedges); + usort($oedges, function($a, $b) { + if ($a[0] === $b[0]) return $a[1] - $b[1]; + return $a[0] - $b[0]; + }); + + // Given the edges get the list of labels (found clusters) for each face. + $newChineseClustersByIndex = []; + ChineseWhispers::predict($oedges, $newChineseClustersByIndex); } - $newChineseClustersByIndex = dlib_chinese_whispers($edges); $newClusters = array(); for ($i = 0, $c = count($newChineseClustersByIndex); $i < $c; $i++) { if (!isset($newClusters[$newChineseClustersByIndex[$i]])) { @@ -333,7 +352,6 @@ private function getNewClusters(array $faces): array { } $newClusters[$newChineseClustersByIndex[$i]][] = $faces[$i]->id; } - return $newClusters; } diff --git a/lib/Clusterer/ChineseWhispers.php b/lib/Clusterer/ChineseWhispers.php new file mode 100644 index 00000000..700b584b --- /dev/null +++ b/lib/Clusterer/ChineseWhispers.php @@ -0,0 +1,155 @@ + * + * @license AGPL-3.0-or-later + * + * This code is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License, version 3, + * along with this program. If not, see + * + */ + +namespace OCA\FaceRecognition\Clusterer; + + +/** + * This class implements the graph clustering algorithm described in the + * paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its + * Application to Natural Language Processing Problems by Chris Biemann. + * + * In particular, it tries to be a shameless copy of the original dlib + * implementation. 
+ * - https://github.com/davisking/dlib/blob/master/dlib/clustering/chinese_whispers.h + */ +class ChineseWhispers { + + /** + * Cluster the dataset by assigning a label to each sample from the edges: $edges must be ordered (sorted by first index), $labels is overwritten by reference with one cluster id per node, and the number of clusters found is returned (0 when $edges is empty). + */ + static public function predict(array &$edges, array &$labels, int $num_iterations = 100) + { + $labels = []; + if (count($edges) == 0) + return 0; + + $neighbors = []; + self::find_neighbor_ranges($edges, $neighbors); + + // Initialize the labels, each node gets a different label. + for ($i = 0; $i < count($neighbors); ++$i) + $labels[$i] = $i; + + for ($iter = 0; $iter < count($neighbors)*$num_iterations; ++$iter) + { + // Pick a random node. + $idx = random_int(0, count($neighbors) - 1); + + // Count how many times each label happens amongst our neighbors. + $labels_to_counts = []; + $end = $neighbors[$idx][1]; + + for ($i = $neighbors[$idx][0]; $i != $end; ++$i) + { + $iLabelFirst = $edges[$i][1]; + $iLabel = $labels[$iLabelFirst]; + if (isset($labels_to_counts[$iLabel])) + $labels_to_counts[$iLabel]++; + else + $labels_to_counts[$iLabel] = 1; + } + + // find the most common label + // the strict comparison below keeps the first label counted when several labels tie + $best_score = PHP_INT_MIN; + $best_label = $labels[$idx]; + foreach ($labels_to_counts as $key => $value) + { + if ($value > $best_score) + { + $best_score = $value; + $best_label = $key; + } + } + + $labels[$idx] = $best_label; + } + + // Remap the labels into a contiguous range. First we find the + // mapping. + $label_remap = []; + for ($i = 0; $i < count($labels); ++$i) + { + $next_id = count($label_remap); + if (!isset($label_remap[$labels[$i]])) + $label_remap[$labels[$i]] = $next_id; + } + // now apply the mapping to all the labels. 
+ for ($i = 0; $i < count($labels); ++$i) + { + $labels[$i] = $label_remap[$labels[$i]]; + } + + return count($label_remap); + } + + static function find_neighbor_ranges (&$edges, &$neighbors) { + // setup neighbors so that [neighbors[i].first, neighbors[i].second) is the range + // within edges that contains all node i's edges (edges must already be sorted by their first index). + $num_nodes = self::max_index_plus_one($edges); + for ($i = 0; $i < $num_nodes; ++$i) $neighbors[$i] = [0, 0]; + $cur_node = 0; + $start_idx = 0; + for ($i = 0; $i < count($edges); ++$i) + { + if ($edges[$i][0] != $cur_node) + { + $neighbors[$cur_node] = [$start_idx, $i]; + $start_idx = $i; + $cur_node = $edges[$i][0]; + } + } + if (count($neighbors) !== 0) + $neighbors[$cur_node] = [$start_idx, count($edges)]; + } + + static function max_index_plus_one ($pairs): int { + if (count($pairs) === 0) + { + return 0; + } + else { + $max_idx = 0; + for ($i = 0; $i < count($pairs); ++$i) + { + if ($pairs[$i][0] > $max_idx) + $max_idx = $pairs[$i][0]; + if ($pairs[$i][1] > $max_idx) + $max_idx = $pairs[$i][1]; + } + return $max_idx + 1; + } + } + + static function convert_unordered_to_ordered (&$edges, &$out_edges) + { + $out_edges = []; + for ($i = 0; $i < count($edges); ++$i) + { + $out_edges[] = [$edges[$i][0], $edges[$i][1]]; + if ($edges[$i][0] != $edges[$i][1]) + $out_edges[] = [$edges[$i][1], $edges[$i][0]]; + } + } +} diff --git a/lib/Model/ExternalModel/ExternalModel.php b/lib/Model/ExternalModel/ExternalModel.php index b5c44df7..8b52633e 100644 --- a/lib/Model/ExternalModel/ExternalModel.php +++ b/lib/Model/ExternalModel/ExternalModel.php @@ -87,10 +87,6 @@ public function isInstalled(): bool { } public function meetDependencies(string &$error_message): bool { - if (!extension_loaded('pdlib')) { - $error_message = "The PDlib PHP extension is not loaded."; - return false; - } if (is_null($this->settingsService->getExternalModelUrl())) { $error_message = "You still need to configure the URL of the service running the model."; return false;