Commit 34e51dc

Check original URL against depth tree when visited link is a redirect (#467)

* Check original URL against depth tree when visited link is a redirect
* Fix link to $originalUrl
* Fix link to $originalUrl in LinkUrlParser
* Add max depth + redirects test
* Move new parameter to end of signature
* Fix method calls

---------

Co-authored-by: David Racovan <[email protected]>
superpenguin612 and David Racovan authored Jul 16, 2024
1 parent b1498b3 commit 34e51dc
Showing 7 changed files with 28 additions and 11 deletions.
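In short: when a queued URL responds with a redirect, links are discovered on the post-redirect page, but the depth tree only knows the URL as it was originally queued. With a maximum depth configured, `addToDepthTree()` therefore found no matching parent node and silently dropped every link behind the redirect. This commit threads the originally queued URL through the URL parsers so that depth-tree matching succeeds on either the post-redirect URL or the original one.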
6 changes: 3 additions & 3 deletions src/Crawler.php

```diff
@@ -447,7 +447,7 @@ public function startCrawling(UriInterface|string $baseUrl)
         }
     }
 
-    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null): ?Node
+    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null, ?UriInterface $originalUrl = null): ?Node
     {
         if (is_null($this->maximumDepth)) {
             return new Node((string) $url);
@@ -457,7 +457,7 @@ public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node
 
         $returnNode = null;
 
-        if ($node->getValue() === (string) $parentUrl) {
+        if ($node->getValue() === (string) $parentUrl || $node->getValue() === (string) $originalUrl) {
             $newNode = new Node((string) $url);
 
             $node->addChild($newNode);
@@ -466,7 +466,7 @@ public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node
         }
 
         foreach ($node->getChildren() as $currentNode) {
-            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);
+            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode, $originalUrl);
 
             if (! is_null($returnNode)) {
                 break;
```
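For illustration, a minimal sketch of the call that previously failed. This is a hypothetical snippet, not part of the commit: `$crawler` stands for a configured `Spatie\Crawler\Crawler` instance with a maximum depth set, `GuzzleHttp\Psr7\Uri` is used as a concrete `UriInterface` implementation, and the URLs come from the test added below.

```php
use GuzzleHttp\Psr7\Uri;

// Before this change, only $parentUrl was compared against node values.
// The tree is rooted at the queued URL ("/redirect-home/"), while links
// are found on the post-redirect URL ("/"), so nothing matched, the
// method returned null, and the link was filtered out.
$node = $crawler->addToDepthTree(
    new Uri('http://localhost:8080/link1'),          // newly discovered link
    new Uri('http://localhost:8080/'),               // $parentUrl: page it was found on
    null,                                            // $node: start matching at the root
    new Uri('http://localhost:8080/redirect-home/'), // $originalUrl: URL as queued
);
// With the fix, the root node also matches $originalUrl, so $node is non-null.
```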
3 changes: 2 additions & 1 deletion src/Handlers/CrawlRequestFulfilled.php

```diff
@@ -62,8 +62,9 @@ public function __invoke(ResponseInterface $response, $index)
         }
 
         $baseUrl = $this->getBaseUrl($response, $crawlUrl);
+        $originalUrl = $crawlUrl->url;
 
-        $this->urlParser->addFromHtml($body, $baseUrl);
+        $this->urlParser->addFromHtml($body, $baseUrl, $originalUrl);
 
         usleep($this->crawler->getDelayBetweenRequests());
     }
```
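Here `$crawlUrl->url` is the URL exactly as it was queued, whereas `getBaseUrl()` resolves the effective URL after any redirects. As a rough sketch of that idea (an assumption about the mechanism, not the package's verbatim implementation): when Guzzle's `track_redirects` option is enabled, each hop is recorded in the `X-Guzzle-Redirect-History` response header, and the last entry is the final destination.

```php
use Psr\Http\Message\ResponseInterface;

// Sketch: derive the post-redirect URL from Guzzle's redirect history.
// Requires RequestOptions::ALLOW_REDIRECTS => ['track_redirects' => true].
function effectiveUrl(ResponseInterface $response, string $queuedUrl): string
{
    $history = $response->getHeader('X-Guzzle-Redirect-History');

    // No entries means no redirect happened; fall back to the queued URL.
    return $history === [] ? $queuedUrl : end($history);
}
```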
6 changes: 3 additions & 3 deletions src/UrlParsers/LinkUrlParser.php

```diff
@@ -21,15 +21,15 @@ public function __construct(Crawler $crawler)
         $this->crawler = $crawler;
     }
 
-    public function addFromHtml(string $html, UriInterface $foundOnUrl): void
+    public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void
     {
         $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);
 
         collect($allLinks)
             ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
             ->map(fn (Url $url) => $this->normalizeUrl($url))
-            ->filter(function (Url $url) use ($foundOnUrl) {
-                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) {
+            ->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
+                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, null, $originalUrl)) {
                     return false;
                 }
```
6 changes: 3 additions & 3 deletions src/UrlParsers/SitemapUrlParser.php

```diff
@@ -20,15 +20,15 @@ public function __construct(Crawler $crawler)
         $this->crawler = $crawler;
     }
 
-    public function addFromHtml(string $html, UriInterface $foundOnUrl): void
+    public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void
     {
         $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);
 
         collect($allLinks)
             ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
             ->map(fn (Url $url) => $this->normalizeUrl($url))
-            ->filter(function (Url $url) use ($foundOnUrl) {
-                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) {
+            ->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
+                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, null, $originalUrl)) {
                     return false;
                 }
```
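Both parsers receive the identical mechanical change: `$originalUrl` is captured by the filter closure and forwarded to `addToDepthTree()`, with `null` passed for `$node` so that matching starts at the root of the depth tree.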
2 changes: 1 addition & 1 deletion src/UrlParsers/UrlParser.php

```diff
@@ -9,5 +9,5 @@ interface UrlParser
 {
     public function __construct(Crawler $crawler);
 
-    public function addFromHtml(string $html, UriInterface $foundOnUrl): void;
+    public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void;
 }
```
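Since the interface now declares the extra parameter, custom parsers must adopt a compatible signature (PHP raises a fatal error when an implementing method omits a parameter the interface declares). A hypothetical minimal implementation against the updated contract:

```php
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\UrlParsers\UrlParser;

// Hypothetical example, not part of this commit: a no-op parser showing
// the signature that user-land implementations now need.
class NoopUrlParser implements UrlParser
{
    public function __construct(protected Crawler $crawler)
    {
    }

    public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void
    {
        // Deliberately ignores every link.
    }
}
```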
12 changes: 12 additions & 0 deletions tests/CrawlerTest.php

```diff
@@ -364,6 +364,18 @@ public function shouldCrawl(UriInterface $url): bool
     assertCrawledUrlCount(3);
 });
 
+it('should handle redirects correctly when max depth is specified', function () {
+    createCrawler([
+        RequestOptions::ALLOW_REDIRECTS => [
+            'track_redirects' => true,
+        ],
+    ])
+        ->setMaximumDepth(5)
+        ->startCrawling('http://localhost:8080/redirect-home/');
+
+    expect(['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/'])->toBeCrawledOnce();
+});
+
 it('respects the requested delay between requests', function () {
     $baseUrl = 'http://localhost:8080';
```
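The test exercises the regression end to end: the crawl starts at a URL whose first response is a 301 redirect to the site root, and the expectation only holds if links discovered on the root still make it into the depth tree. Before this fix, the same crawl with `setMaximumDepth()` would have dropped every link found on the redirect target.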
4 changes: 4 additions & 0 deletions tests/server/server.js

```diff
@@ -70,6 +70,10 @@ app.get('/meta-nofollow', function (request, response) {
     response.end('<html><head>\n<meta name="robots" content="index, nofollow">\n</head><body><a href="/meta-nofollow-target">no follow it</a></body></html>');
 });
 
+app.get('/redirect-home/', function (request, response) {
+    response.redirect(301, '/');
+});
+
 app.get('/dir1/internal-redirect-entry/', function (request, response) {
     response.end('<a href="../loop-generator/internal-redirect/trapped/">trapped</a> <a href="../../dir1/internal-redirect/trap/">trap-start</a>');
 });
```
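The new fixture endpoint gives the test server an entry URL whose very first response is a redirect: `/redirect-home/` answers with a permanent redirect to `/`, where the test above expects `/link1` to be discovered.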
