diff --git a/classes/robot/crawler.php b/classes/robot/crawler.php
index f051604e..a7b329d5 100644
--- a/classes/robot/crawler.php
+++ b/classes/robot/crawler.php
@@ -306,12 +306,20 @@ public function get_queue_size() {
* @return object|boolean The node record if the resource pointed to by the URL can and should be considered; or `false` if the
* URL is invalid or excluded.
*/
- public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT) {
+ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT,
+ $level = TOOL_CRAWLER_NODE_LEVEL_PARENT) {
global $DB, $CFG;
$url = $this->absolute_url($baseurl, $url);
+ // Strip priority from indirect child nodes. Only parent and direct children
+ // of parent nodes have priority applied to avoid recursively applying priority
+ // to all ancestors of a parent node.
+ if ($level == TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD) {
+ $priority = TOOL_CRAWLER_PRIORITY_DEFAULT;
+ }
+
// Filter out non http protocols like mailto:cqulibrary@cqu.edu.au etc.
$bits = parse_url($url);
if (array_key_exists('scheme', $bits)
@@ -420,6 +428,7 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO
$node->external = self::is_external($url);
$node->needscrawl = time();
$node->priority = $priority;
+ $node->level = $level;
if (isset($courseid)) {
$node->courseid = $courseid;
@@ -438,6 +447,11 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO
$node->priority = $priority;
$needsupdating = true;
}
+ if ($node->level != $level) {
+ // Set the level again, in case this node has been seen again at a different
+ // level, to avoid reprocessing.
+ $node->level = $level;
+ }
if (isset($courseid)) {
$node->courseid = $courseid;
$needsupdating = true;
@@ -901,8 +915,18 @@ private function link_from_node_to_url($from, $url, $text, $idattr) {
global $DB;
+ // Ascertain the correct node level based on parent node level.
+ if (!empty($from->level) && $from->level == TOOL_CRAWLER_NODE_LEVEL_PARENT) {
+ $level = TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD;
+ } else {
+ $level = TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD;
+ }
+
+ $priority = isset($from->priority) ? $from->priority : TOOL_CRAWLER_PRIORITY_DEFAULT;
+ $courseid = isset($from->courseid) ? $from->courseid : null;
+
// Add the node URL to the queue.
- $to = $this->mark_for_crawl($from->url, $url);
+ $to = $this->mark_for_crawl($from->url, $url, $courseid, $priority, $level);
if ($to === false) {
return false;
}
diff --git a/constants.php b/constants.php
index 860819e9..b0649738 100644
--- a/constants.php
+++ b/constants.php
@@ -93,3 +93,12 @@
define('TOOL_CRAWLER_PRIORITY_DEFAULT', 0);
define('TOOL_CRAWLER_PRIORITY_NORMAL', 50);
define('TOOL_CRAWLER_PRIORITY_HIGH', 100);
+
+/**
+ * Node level assigned to each node based on whether it is the parent node, or
+ * a child node discovered within a parent when crawling, or any child of a child
+ * node (or even further removed).
+ */
+define('TOOL_CRAWLER_NODE_LEVEL_PARENT', 2);
+define('TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD', 1);
+define('TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD', 0);
diff --git a/db/install.xml b/db/install.xml
index d2510ded..fb249872 100644
--- a/db/install.xml
+++ b/db/install.xml
@@ -27,6 +27,7 @@
+
diff --git a/db/upgrade.php b/db/upgrade.php
index 3138ffa5..6466e153 100644
--- a/db/upgrade.php
+++ b/db/upgrade.php
@@ -94,5 +94,20 @@ function xmldb_tool_crawler_upgrade($oldversion) {
upgrade_plugin_savepoint(true, 2019100300, 'tool', 'crawler');
}
+ if ($oldversion < 2020012300) {
+
+ // Define field level to be added to tool_crawler_url.
+ $table = new xmldb_table('tool_crawler_url');
+ $field = new xmldb_field('level', XMLDB_TYPE_INTEGER, '1', null, null, null, '2', 'priority');
+
+ // Conditionally launch add field level.
+ if (!$dbman->field_exists($table, $field)) {
+ $dbman->add_field($table, $field);
+ }
+
+ // Crawler savepoint reached.
+ upgrade_plugin_savepoint(true, 2020012300, 'tool', 'crawler');
+ }
+
return true;
}
diff --git a/tests/phpunit/robot_crawler_test.php b/tests/phpunit/robot_crawler_test.php
index 84a2dc46..29f29b30 100644
--- a/tests/phpunit/robot_crawler_test.php
+++ b/tests/phpunit/robot_crawler_test.php
@@ -27,6 +27,7 @@
defined('MOODLE_INTERNAL') || die('Direct access to this script is forbidden');
require_once(__DIR__ . '/../../locallib.php');
+require_once(__DIR__ . '/../../constants.php');
/**
* Unit tests for link crawler robot
@@ -295,6 +296,7 @@ public function test_should_be_excluded() {
$node->contents = $page . $linktoexclude;
$node->url = $url;
$node->id = $insertid;
+ $node->level = TOOL_CRAWLER_NODE_LEVEL_PARENT;
$this->resetAfterTest(true);
@@ -310,6 +312,81 @@ public function test_should_be_excluded() {
self::assertFalse($found);
}
-}
-
+ /**
+ * Priority provider.
+ *
+ * @return array of potential crawler priority codes.
+ */
+ public function priority_provider() {
+ return [
+ ['high' => TOOL_CRAWLER_PRIORITY_HIGH],
+ ['normal' => TOOL_CRAWLER_PRIORITY_NORMAL],
+ ['default' => TOOL_CRAWLER_PRIORITY_DEFAULT]
+ ];
+ }
+ /**
+ * @dataProvider priority_provider
+ *
+ * Test for issue #108 - passing node crawl priority to child nodes when parsing html.
+ */
+ public function test_parse_html_priority_inheritance($parentpriority) {
+ global $CFG, $DB;
+
+ $parentlocalurl = 'course/view.php?id=1§ion=2';
+ $directchildlocalurl = 'mod/book/view.php?id=7';
+ $indirectchildexternalurl = 'http://someexternalsite.net.au';
+
+ // Internal parent node.
+ $node = $this->robot->mark_for_crawl($CFG->wwwroot, $parentlocalurl, 1, $parentpriority);
+ $node->httpcode = 200;
+ $node->mimetype = 'text/html';
+ $node->external = 0;
+ $node->contents = <<
+
+
+
+ Test title
+
+
+ Direct child node
+
+
+HTML;
+ // Parse the parent node, to create the direct child node.
+ $parentnode = $this->robot->parse_html($node, $node->external);
+
+ // Internal node direct child.
+ $url = new moodle_url('/' . $directchildlocalurl);
+ $node = $DB->get_record('tool_crawler_url', array('url' => $url->raw_out()) );
+ $node->url = $CFG->wwwroot.'/'.$directchildlocalurl;
+ $node->httpcode = 200;
+ $node->mimetype = 'text/html';
+ $node->external = 0;
+ $node->contents = <<
+
+
+
+ Test title
+
+
+ Indirect child node
+
+
+HTML;
+ // Parse the direct child, to create the indirect child node.
+ $directchildnode = $this->robot->parse_html($node, $node->external);
+ $indirectchildnode = $DB->get_record('tool_crawler_url', ['url' => $indirectchildexternalurl]);
+
+ // Direct child nodes should inherit priority from parent node (super node).
+ $this->assertEquals($parentnode->priority, $directchildnode->priority);
+ // Indirect child nodes should not inherit priority from parent node (super node).
+ $this->assertGreaterThanOrEqual($indirectchildnode->priority, $parentnode->priority);
+ // Indirect child nodes should not inherit priority from direct child node.
+ $this->assertGreaterThanOrEqual($indirectchildnode->priority, $directchildnode->priority);
+ // Indirect child nodes should not be able to have a high priority.
+ $this->assertLessThan(TOOL_CRAWLER_PRIORITY_HIGH, $indirectchildnode->priority);
+ }
+}
diff --git a/version.php b/version.php
index fc112523..0e6dfe3b 100644
--- a/version.php
+++ b/version.php
@@ -27,8 +27,8 @@
defined('MOODLE_INTERNAL') || die();
-$plugin->version = 2020010600; // The current plugin version (Date: YYYYMMDDXX)
-$plugin->release = 2020010600; // The current plugin version (Date: YYYYMMDDXX)
+$plugin->version = 2020012300; // The current plugin version (Date: YYYYMMDDXX)
+$plugin->release = 2020012300; // The current plugin version (Date: YYYYMMDDXX)
$plugin->requires = 2016021800; // Requires this Moodle version.
$plugin->component = 'tool_crawler'; // To check on upgrade, that module sits in correct place.
$plugin->maturity = MATURITY_STABLE;