Skip to content

Commit

Permalink
The BodyParser
Browse files Browse the repository at this point in the history
  provides its own `urlToAbsolute()` protected method
  • Loading branch information
mirko-pagliai committed Jan 6, 2024
1 parent 21cb7f5 commit 7c96c05
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
### 1.1.17
* no longer needs the me-tools package;
* no longer uses the `Exceptionist` class;
* provides its own `BodyParser` and `Entity` classes and no longer uses the ones provided by `php-tools`;
* provides its own `BodyParser` and `Entity` classes and no longer uses the ones provided by `php-tools`. The `BodyParser`
provides its own `urlToAbsolute()` protected method;
* ready for `php-tools` 1.8;
* added tests for PHP 8.2.

Expand Down
21 changes: 18 additions & 3 deletions src/Utility/BodyParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
*/
namespace LinkScanner\Utility;

use phpUri;
use Symfony\Component\DomCrawler\Crawler;

/**
Expand Down Expand Up @@ -74,6 +75,20 @@ public function __construct($body, string $url)
$this->url = $url;
}

/**
 * Internal method that turns a relative url into an absolute one, resolving it against a base url.
 *
 * The base is first passed through `clean_url()`. If the result ends with a single `name.ext`-like
 * segment (a last path segment containing exactly one dot), that segment is dropped so that the
 * relative url is joined against the part preceding it. A trailing slash is then appended to the
 * base before handing it to `phpUri`.
 * @param string $relative Relative url to join
 * @param string $base Base path, on which to construct the absolute url
 * @return string The url produced by joining `$relative` onto the prepared base
 */
protected function urlToAbsolute(string $relative, string $base): string
{
    $root = clean_url($base, false, true);

    // `scheme://<something>/<name>.<ext>`: keep only what comes before the last slash
    if (preg_match('/^(\w+:\/\/.+)\/[^.\/]+\.[^.\/]+$/', $root, $matches)) {
        $root = $matches[1];
    }

    $parsedBase = phpUri::parse($root . '/');

    return $parsedBase->join($relative);
}

/**
* Extracts links from body
* @return array<string> Array of links
Expand All @@ -90,15 +105,15 @@ public function extractLinks(): array

$crawler = new Crawler($this->body);

$links = [];
$extractedLinks = [];
foreach (self::TAGS as $tag => $attribute) {
foreach ($crawler->filterXPath('//' . $tag)->extract([$attribute]) as $link) {
if ($link) {
$links[] = clean_url(url_to_absolute($this->url, $link), true, true);
$extractedLinks[] = clean_url($this->urlToAbsolute($link, $this->url), true, true);
}
}
}

return $this->extractedLinks = array_unique($links);
return $this->extractedLinks = array_unique($extractedLinks);
}
}

0 comments on commit 7c96c05

Please sign in to comment.