diff --git a/CHANGELOG.md b/CHANGELOG.md
index 58678c0..91da21b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,8 @@
 ### 1.1.17
 * no longer needs the me-tools package;
 * no longer uses the `Exceptionist` class;
-* provides its own `BodyParser` and `Entity` classes and no longer uses the ones provided by `php-tools`;
+* provides its own `BodyParser` and `Entity` classes and no longer uses the ones provided by `php-tools`. The `BodyParser`
+  provides its own `urlToAbsolute()` protected method;
 * ready for `php-tools` 1.8;
 * added tests for PHP 8.2.
 
diff --git a/src/Utility/BodyParser.php b/src/Utility/BodyParser.php
index 0f1909a..d21935f 100644
--- a/src/Utility/BodyParser.php
+++ b/src/Utility/BodyParser.php
@@ -14,6 +14,7 @@
  */
 namespace LinkScanner\Utility;
 
+use phpUri;
 use Symfony\Component\DomCrawler\Crawler;
 
 /**
@@ -74,6 +75,20 @@ public function __construct($body, string $url)
         $this->url = $url;
     }
 
+    /**
+     * Builds an absolute url, joining `$relative` onto `$base` once any trailing `name.ext` segment is dropped
+     * @param string $relative Relative url to join
+     * @param string $base Base url against which `$relative` is resolved; passed through `clean_url()` first
+     * @return string Value returned by phpUri's `join()` for the prepared base (base plus a trailing slash)
+     */
+    protected function urlToAbsolute(string $relative, string $base): string
+    {
+        $base = clean_url($base, false, true);
+        $base = preg_match('/^(\w+:\/\/.+)\/[^.\/]+\.[^.\/]+$/', $base, $matches) ? $matches[1] : $base;
+
+        return phpUri::parse($base . '/')->join($relative);
+    }
+
     /**
      * Extracts links from body
      * @return array Array of links
@@ -90,15 +105,15 @@ public function extractLinks(): array
 
         $crawler = new Crawler($this->body);
 
-        $links = [];
+        $extractedLinks = [];
         foreach (self::TAGS as $tag => $attribute) {
             foreach ($crawler->filterXPath('//' . $tag)->extract([$attribute]) as $link) {
                 if ($link) {
-                    $links[] = clean_url(url_to_absolute($this->url, $link), true, true);
+                    $extractedLinks[] = clean_url($this->urlToAbsolute($link, $this->url), true, true);
                 }
             }
         }
 
-        return $this->extractedLinks = array_unique($links);
+        return $this->extractedLinks = array_unique($extractedLinks);
     }
 }