From 0de53bc3f2ec830ac1c46e4b41a7152f872aae36 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 24 May 2018 00:42:44 +0200 Subject: [PATCH 01/28] Replace arrays with lists --- pom.xml | 10 +++--- .../serritor/api/HttpHeadResponse.java | 34 +++++-------------- 2 files changed, 13 insertions(+), 31 deletions(-) diff --git a/pom.xml b/pom.xml index 7467534..85de756 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 com.github.peterbencze serritor - 1.3.1 + 1.3.2 jar Serritor @@ -54,17 +54,17 @@ org.seleniumhq.selenium selenium-java - 3.11.0 + 3.12.0 org.seleniumhq.selenium htmlunit-driver - 2.29.3 + 2.30.0 com.google.guava guava - 24.1-jre + 25.0-jre junit @@ -75,7 +75,7 @@ org.mockito mockito-core - 2.18.0 + 2.18.3 test diff --git a/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java index 93f2aed..d1cd6f7 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java @@ -15,9 +15,10 @@ */ package com.github.peterbencze.serritor.api; +import java.util.Arrays; +import java.util.List; import java.util.Locale; import org.apache.http.Header; -import org.apache.http.HeaderIterator; import org.apache.http.HttpResponse; import org.apache.http.ProtocolVersion; import org.apache.http.StatusLine; @@ -48,10 +49,10 @@ public boolean containsHeader(final String name) { /** * Returns all the headers of this response. * - * @return The array of headers + * @return The list of all the headers */ - public Header[] getAllHeaders() { - return response.getAllHeaders(); + public List
<Header> getAllHeaders() { + return Arrays.asList(response.getAllHeaders()); } /** @@ -68,10 +69,10 @@ public Header getFirstHeader(final String name) { * Returns all the headers with a specified name of this response. * * @param name The name of the headers - * @return The array of headers + * @return The list of headers with a specified name */ - public Header[] getHeaders(final String name) { - return response.getHeaders(name); + public List<Header>
getHeaders(final String name) { + return Arrays.asList(response.getHeaders(name)); } /** @@ -93,25 +94,6 @@ public ProtocolVersion getProtocolVersion() { return response.getProtocolVersion(); } - /** - * Returns an iterator of all the headers. - * - * @return An iterator of all the headers - */ - public HeaderIterator headerIterator() { - return response.headerIterator(); - } - - /** - * Returns an iterator of the headers with a given name. - * - * @param name The name of the headers - * @return An iterator of the headers with a given name - */ - public HeaderIterator headerIterator(final String name) { - return response.headerIterator(name); - } - /** * Obtains the locale of this response. * From 56945858ecf5bf8884976e0cba37e6b0ee430093 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 27 May 2018 11:35:36 +0200 Subject: [PATCH 02/28] Add cookie store update mechanism for the HTTP client --- .../peterbencze/serritor/api/BaseCrawler.java | 53 +++++++++++++++---- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index a35f72e..d004e2a 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -32,13 +32,17 @@ import java.util.List; import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.SerializationUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.Validate; import org.apache.http.Header; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpHead; import org.apache.http.client.protocol.HttpClientContext; +import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.http.impl.cookie.BasicClientCookie; +import org.openqa.selenium.Cookie; import org.openqa.selenium.JavascriptExecutor; import org.openqa.selenium.TimeoutException; import org.openqa.selenium.WebDriver; @@ -54,19 +58,12 @@ public abstract class BaseCrawler { private final CrawlerConfiguration config; - // Indicates if the crawler is currently running or not private boolean isStopped; - - // Indicates if the crawling should be stopped (used for cancelling the loop in the run method) private boolean stopCrawling; - - // Used for sending HTTP HEAD requests and receiving associate responses + private BasicCookieStore cookieStore; private HttpClient httpClient; - private WebDriver webDriver; - private CrawlFrontier crawlFrontier; - private CrawlDelayMechanism crawlDelayMechanism; protected BaseCrawler(final CrawlerConfiguration config) { @@ -105,7 +102,10 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { Validate.validState(isStopped, "The crawler is already started."); isStopped = false; - httpClient = HttpClientBuilder.create().build(); + cookieStore = new BasicCookieStore(); + httpClient = HttpClientBuilder.create() + .setDefaultCookieStore(cookieStore) + .build(); webDriver = Validate.notNull(driver, "The webdriver cannot be null."); crawlFrontier = frontierToUse; crawlDelayMechanism = createCrawlDelayMechanism(); @@ -263,6 +263,9 @@ private void run() { } else { onResponseTimeout(htmlResponse); } + + // Update the client's cookie store, so it will have the same state as the browser. 
+ updateClientCookieStore(); } else { // URLs that point to non-HTML content should not be opened in the browser @@ -339,6 +342,38 @@ private void performDelay() { } } + /** + * Adds all the browser cookies for the current domain to the HTTP client's + * cookie store, replacing any existing equivalent ones. + */ + private void updateClientCookieStore() { + webDriver.manage() + .getCookies() + .stream() + .map(BaseCrawler::convertBrowserCookie) + .forEach(cookieStore::addCookie); + } + + /** + * Converts a browser cookie to a HTTP client one. + * + * @param browserCookie The browser cookie to be converted + * @return The converted HTTP client cookie + */ + private static BasicClientCookie convertBrowserCookie(final Cookie browserCookie) { + BasicClientCookie clientCookie = new BasicClientCookie(browserCookie.getName(), browserCookie.getValue()); + clientCookie.setDomain(browserCookie.getDomain()); + clientCookie.setPath(browserCookie.getPath()); + clientCookie.setExpiryDate(browserCookie.getExpiry()); + clientCookie.setSecure(browserCookie.isSecure()); + + if (browserCookie.isHttpOnly()) { + clientCookie.setAttribute("httponly", StringUtils.EMPTY); + } + + return clientCookie; + } + /** * Called when the crawler is about to begin its operation. */ From 72486419a229f150b87086f8b185ba0ff18b3ac3 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 27 May 2018 13:21:28 +0200 Subject: [PATCH 03/28] Modify functional interface of validator --- .../serritor/api/helper/UrlFinder.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java index 24ca816..4015764 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -24,7 +24,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import java.util.function.Function; +import java.util.function.Predicate; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -44,7 +44,7 @@ public final class UrlFinder { private final Set urlPatterns; private final Set locatingMechanisms; private final Set attributes; - private final Function validator; + private final Predicate validator; private UrlFinder(final UrlFinderBuilder builder) { urlPatterns = builder.urlPatterns; @@ -97,7 +97,7 @@ private List findUrlsInAttributeValue(final String attributeValue) { while (urlPatternMatcher.find()) { String foundUrl = urlPatternMatcher.group().trim(); - if (validator.apply(foundUrl)) { + if (validator.test(foundUrl)) { foundUrls.add(foundUrl); } } @@ -110,13 +110,13 @@ public static final class UrlFinderBuilder { private static final Set DEFAULT_LOCATING_MECHANISMS = Sets.newHashSet(By.tagName("a")); private static final Set DEFAULT_ATTRIBUTES = Sets.newHashSet("href"); - private static final Function DEFAULT_VALIDATOR = UrlFinderBuilder::isValidUrl; + private static final Predicate DEFAULT_VALIDATOR = UrlFinderBuilder::isValidUrl; private final Set urlPatterns; private Set locatingMechanisms; private Set attributes; - private Function validator; + private Predicate validator; /** * Constructs a UrlFinderBuilder instance that can be used @@ -195,12 +195,12 @@ public UrlFinderBuilder setAttribute(final String attribute) { } /** - * Sets a function to be used for validating found URLs. 
+ * Sets a predicate to be used for validating found URLs. * - * @param validator The validator function + * @param validator The validator predicate * @return The UrlFinderBuilder instance */ - public UrlFinderBuilder setValidator(final Function validator) { + public UrlFinderBuilder setValidator(final Predicate validator) { Validate.notNull(validator, "The validator function cannot be null."); this.validator = validator; From cf77bb81b8680ae2eca0143ddd88b84e8c6d3a5f Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 27 May 2018 20:41:41 +0200 Subject: [PATCH 04/28] Refactor run method --- .../peterbencze/serritor/api/BaseCrawler.java | 94 +++++++++---------- 1 file changed, 42 insertions(+), 52 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index d004e2a..c31bf81 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -206,75 +206,65 @@ private void run() { CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate(); URI currentCandidateUrl = currentCandidate.getCandidateUrl(); - String currentRequestUrlAsString = currentCandidateUrl.toString(); - - HttpHeadResponse httpHeadResponse; URI responseUrl = currentCandidateUrl; + HttpClientContext context = HttpClientContext.create(); - try { - HttpClientContext context = HttpClientContext.create(); + // Update the client's cookie store, so it will have the same state as the browser. + updateClientCookieStore(); + try { // Send an HTTP HEAD request to the current URL to determine its availability and content type - httpHeadResponse = getHttpHeadResponse(currentCandidateUrl, context); + HttpHeadResponse httpHeadResponse = getHttpHeadResponse(currentCandidateUrl, context); // If the request has been redirected, get the final URL List redirectLocations = context.getRedirectLocations(); if (redirectLocations != null) { responseUrl = redirectLocations.get(redirectLocations.size() - 1); } - } catch (IOException ex) { - UnsuccessfulRequest unsuccessfulRequest = new UnsuccessfulRequestBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(), - currentCandidate.getCrawlRequest()) - .setException(ex) - .build(); - - onUnsuccessfulRequest(unsuccessfulRequest); - continue; - } - - // If the request has been redirected, a new crawl request should be created for the redirected URL - if (!responseUrl.toString().equals(currentRequestUrlAsString)) { - CrawlRequest redirectedCrawlRequest = new CrawlRequestBuilder(responseUrl).setPriority(currentCandidate.getPriority()).build(); - crawlFrontier.feedRequest(redirectedCrawlRequest, false); - - continue; - } - - // Check if the content of the response is HTML - if (isContentHtml(httpHeadResponse)) { - boolean timedOut = false; - - try { - // Open the URL in the browser - webDriver.get(currentRequestUrlAsString); - } catch (TimeoutException ex) { - timedOut = true; - } - HtmlResponse htmlResponse = new HtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(), - currentCandidate.getCrawlRequest()) - .setHttpHeadResponse(httpHeadResponse) - .setWebDriver(webDriver) - .build(); - - // Check if the request has timed out - if (!timedOut) { - onResponseComplete(htmlResponse); + if (!responseUrl.equals(currentCandidateUrl)) { + // If the request has been redirected, a new crawl request should be created for the redirected URL + + CrawlRequest 
redirectedCrawlRequest = new CrawlRequestBuilder(responseUrl).setPriority(currentCandidate.getPriority()).build(); + crawlFrontier.feedRequest(redirectedCrawlRequest, false); + } else if (isContentHtml(httpHeadResponse)) { + boolean isTimedOut = false; + + try { + // Open the URL in the browser + webDriver.get(currentCandidateUrl.toString()); + } catch (TimeoutException exception) { + isTimedOut = true; + } + + HtmlResponse htmlResponse = new HtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(), + currentCandidate.getCrawlRequest()) + .setHttpHeadResponse(httpHeadResponse) + .setWebDriver(webDriver) + .build(); + + if (!isTimedOut) { + onResponseComplete(htmlResponse); + } else { + onResponseTimeout(htmlResponse); + } } else { - onResponseTimeout(htmlResponse); - } + // URLs that point to non-HTML content should not be opened in the browser - // Update the client's cookie store, so it will have the same state as the browser. - updateClientCookieStore(); - } else { - // URLs that point to non-HTML content should not be opened in the browser + NonHtmlResponse nonHtmlResponse = new NonHtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(), + currentCandidate.getCrawlRequest()) + .setHttpHeadResponse(httpHeadResponse) + .build(); - NonHtmlResponse nonHtmlResponse = new NonHtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(), + onNonHtmlResponse(nonHtmlResponse); + } + } catch (IOException exception) { + UnsuccessfulRequest unsuccessfulRequest = new UnsuccessfulRequestBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(), currentCandidate.getCrawlRequest()) - .setHttpHeadResponse(httpHeadResponse) + .setException(exception) .build(); - onNonHtmlResponse(nonHtmlResponse); + onUnsuccessfulRequest(unsuccessfulRequest); } performDelay(); From 808990285ba7e0bb589344ac6bf24ab53b7c3854 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 27 May 2018 22:43:35 +0200 Subject: [PATCH 05/28] Remove http head response from callback parameters, refact --- .../peterbencze/serritor/api/BaseCrawler.java | 71 +++++------ .../serritor/api/HtmlResponse.java | 48 ++------ .../serritor/api/HttpHeadResponse.java | 114 ------------------ .../serritor/api/NonHtmlResponse.java | 39 ++---- .../serritor/api/UnsuccessfulRequest.java | 34 ++---- .../serritor/internal/CallbackParameter.java | 30 ++--- .../serritor/internal/CrawlCandidate.java | 10 ++ 7 files changed, 77 insertions(+), 269 deletions(-) delete mode 100644 src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index c31bf81..2e12f06 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -16,9 +16,6 @@ package com.github.peterbencze.serritor.api; import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; -import com.github.peterbencze.serritor.api.HtmlResponse.HtmlResponseBuilder; -import com.github.peterbencze.serritor.api.NonHtmlResponse.NonHtmlResponseBuilder; -import com.github.peterbencze.serritor.api.UnsuccessfulRequest.UnsuccessfulRequestBuilder; import com.github.peterbencze.serritor.internal.AdaptiveCrawlDelayMechanism; import com.github.peterbencze.serritor.internal.CrawlCandidate; import com.github.peterbencze.serritor.internal.CrawlDelayMechanism; @@ 
-202,69 +199,56 @@ private void run() { onBegin(); while (!stopCrawling && crawlFrontier.hasNextCandidate()) { - // Get the next crawl candidate from the queue CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate(); - - URI currentCandidateUrl = currentCandidate.getCandidateUrl(); - URI responseUrl = currentCandidateUrl; + URI candidateUrl = currentCandidate.getCandidateUrl(); + URI refererUrl = currentCandidate.getRefererUrl(); + int crawlDepth = currentCandidate.getCrawlDepth(); + CrawlRequest crawlRequest = currentCandidate.getCrawlRequest(); + URI responseUrl = candidateUrl; HttpClientContext context = HttpClientContext.create(); + HttpResponse httpHeadResponse = null; + boolean isUnsuccessfulRequest = false; // Update the client's cookie store, so it will have the same state as the browser. updateClientCookieStore(); try { // Send an HTTP HEAD request to the current URL to determine its availability and content type - HttpHeadResponse httpHeadResponse = getHttpHeadResponse(currentCandidateUrl, context); + httpHeadResponse = getHttpHeadResponse(candidateUrl, context); + } catch (IOException exception) { + onUnsuccessfulRequest(new UnsuccessfulRequest(refererUrl, crawlDepth, crawlRequest, exception)); + isUnsuccessfulRequest = true; + } - // If the request has been redirected, get the final URL + if (!isUnsuccessfulRequest) { List redirectLocations = context.getRedirectLocations(); if (redirectLocations != null) { + // If the request has been redirected, get the final URL responseUrl = redirectLocations.get(redirectLocations.size() - 1); } - if (!responseUrl.equals(currentCandidateUrl)) { + if (!responseUrl.equals(candidateUrl)) { // If the request has been redirected, a new crawl request should be created for the redirected URL + + CrawlRequestBuilder builder = new CrawlRequestBuilder(responseUrl).setPriority(currentCandidate.getPriority()); + currentCandidate.getMetadata().ifPresent(builder::setMetadata); - CrawlRequest redirectedCrawlRequest = new CrawlRequestBuilder(responseUrl).setPriority(currentCandidate.getPriority()).build(); - crawlFrontier.feedRequest(redirectedCrawlRequest, false); + crawlFrontier.feedRequest(builder.build(), false); } else if (isContentHtml(httpHeadResponse)) { - boolean isTimedOut = false; + HtmlResponse response = new HtmlResponse(refererUrl, crawlDepth, crawlRequest, webDriver); try { // Open the URL in the browser - webDriver.get(currentCandidateUrl.toString()); + webDriver.get(candidateUrl.toString()); } catch (TimeoutException exception) { - isTimedOut = true; + onResponseTimeout(response); } - HtmlResponse htmlResponse = new HtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(), - currentCandidate.getCrawlRequest()) - .setHttpHeadResponse(httpHeadResponse) - .setWebDriver(webDriver) - .build(); - - if (!isTimedOut) { - onResponseComplete(htmlResponse); - } else { - onResponseTimeout(htmlResponse); - } + onResponseComplete(response); } else { // URLs that point to non-HTML content should not be opened in the browser - - NonHtmlResponse nonHtmlResponse = new NonHtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(), - currentCandidate.getCrawlRequest()) - .setHttpHeadResponse(httpHeadResponse) - .build(); - - onNonHtmlResponse(nonHtmlResponse); + onNonHtmlResponse(new NonHtmlResponse(refererUrl, crawlDepth, crawlRequest)); } - } catch (IOException exception) { - UnsuccessfulRequest unsuccessfulRequest = new UnsuccessfulRequestBuilder(currentCandidate.getRefererUrl(), 
currentCandidate.getCrawlDepth(), - currentCandidate.getCrawlRequest()) - .setException(exception) - .build(); - - onUnsuccessfulRequest(unsuccessfulRequest); } performDelay(); @@ -279,10 +263,9 @@ private void run() { * @param destinationUrl The URL to crawl * @return The HTTP HEAD response */ - private HttpHeadResponse getHttpHeadResponse(final URI destinationUrl, final HttpClientContext context) throws IOException { + private HttpResponse getHttpHeadResponse(final URI destinationUrl, final HttpClientContext context) throws IOException { HttpHead headRequest = new HttpHead(destinationUrl.toString()); - HttpResponse response = httpClient.execute(headRequest, context); - return new HttpHeadResponse(response); + return httpClient.execute(headRequest, context); } /** @@ -292,7 +275,7 @@ private HttpHeadResponse getHttpHeadResponse(final URI destinationUrl, final Htt * @return true if the content is HTML, false * otherwise */ - private static boolean isContentHtml(final HttpHeadResponse httpHeadResponse) { + private static boolean isContentHtml(final HttpResponse httpHeadResponse) { Header contentTypeHeader = httpHeadResponse.getFirstHeader("Content-Type"); return contentTypeHeader != null && contentTypeHeader.getValue().contains("text/html"); } diff --git a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java index 442d493..563fa83 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java @@ -26,23 +26,20 @@ */ public final class HtmlResponse extends CallbackParameter { - private final HttpHeadResponse httpHeadResponse; private final WebDriver webDriver; - private HtmlResponse(final HtmlResponseBuilder builder) { - super(builder); - - httpHeadResponse = builder.httpHeadResponse; - webDriver = builder.webDriver; - } - /** - * Returns the HTTP HEAD response. - * - * @return The HTTP HEAD response + * Constructs a HtmlResponse instance. 
+ * + * @param refererUrl The referer URL + * @param crawlDepth The current crawl depth + * @param crawlRequest The processed crawl request + * @param webDriver The WebDriver instance */ - public HttpHeadResponse getHttpHeadResponse() { - return httpHeadResponse; + public HtmlResponse(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest, final WebDriver webDriver) { + super(refererUrl, crawlDepth, crawlRequest); + + this.webDriver = webDriver; } /** @@ -53,29 +50,4 @@ public HttpHeadResponse getHttpHeadResponse() { public WebDriver getWebDriver() { return webDriver; } - - public static final class HtmlResponseBuilder extends CallbackParameterBuilder { - - private HttpHeadResponse httpHeadResponse; - private WebDriver webDriver; - - public HtmlResponseBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { - super(refererUrl, crawlDepth, crawlRequest); - } - - public HtmlResponseBuilder setHttpHeadResponse(final HttpHeadResponse httpHeadResponse) { - this.httpHeadResponse = httpHeadResponse; - return this; - } - - public HtmlResponseBuilder setWebDriver(final WebDriver webDriver) { - this.webDriver = webDriver; - return this; - } - - @Override - public HtmlResponse build() { - return new HtmlResponse(this); - } - } } diff --git a/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java deleted file mode 100644 index d1cd6f7..0000000 --- a/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright 2017 Peter Bencze. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.github.peterbencze.serritor.api; - -import java.util.Arrays; -import java.util.List; -import java.util.Locale; -import org.apache.http.Header; -import org.apache.http.HttpResponse; -import org.apache.http.ProtocolVersion; -import org.apache.http.StatusLine; - -/** - * Represents a response of a HTTP HEAD request. - * - * @author Peter Bencze - */ -public final class HttpHeadResponse { - - private final HttpResponse response; - - public HttpHeadResponse(final HttpResponse response) { - this.response = response; - } - - /** - * Checks if a certain header is present in this message. - * - * @param name The name of the header - * @return true if present, false otherwise - */ - public boolean containsHeader(final String name) { - return response.containsHeader(name); - } - - /** - * Returns all the headers of this response. - * - * @return The list of all the headers - */ - public List
<Header> getAllHeaders() { - return Arrays.asList(response.getAllHeaders()); - } - - /** - * Returns the first header with a specified name of this response. - * - * @param name The name of the header - * @return The first header with the specified name - */ - public Header getFirstHeader(final String name) { - return response.getFirstHeader(name); - } - - /** - * Returns all the headers with a specified name of this response. - * - * @param name The name of the headers - * @return The list of headers with a specified name - */ - public List<Header>
getHeaders(final String name) { - return Arrays.asList(response.getHeaders(name)); - } - - /** - * Returns the last header with a specified name of this response. - * - * @param name The name of the header - * @return The last header with a specified name - */ - public Header getLastHeader(final String name) { - return response.getLastHeader(name); - } - - /** - * Returns the protocol version this response is compatible with. - * - * @return The compatible protocol version - */ - public ProtocolVersion getProtocolVersion() { - return response.getProtocolVersion(); - } - - /** - * Obtains the locale of this response. - * - * @return The locale of this response - */ - public Locale getLocale() { - return response.getLocale(); - } - - /** - * Obtains the status line of this response. - * - * @return The status line of this response - */ - public StatusLine getStatusLine() { - return response.getStatusLine(); - } -} diff --git a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java index 0d3e6cf..7de2862 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java @@ -28,21 +28,15 @@ */ public final class NonHtmlResponse extends CallbackParameter { - private final HttpHeadResponse httpHeadResponse; - - private NonHtmlResponse(final NonHtmlResponseBuilder builder) { - super(builder); - - httpHeadResponse = builder.httpHeadResponse; - } - /** - * Returns the HTTP HEAD response. - * - * @return The HTTP HEAD response + * Constructs a NonHtmlResponse instance. + * + * @param refererUrl The referer URL + * @param crawlDepth The current crawl depth + * @param crawlRequest The processed crawl request */ - public HttpHeadResponse getHttpHeadResponse() { - return httpHeadResponse; + public NonHtmlResponse(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { + super(refererUrl, crawlDepth, crawlRequest); } /** @@ -54,23 +48,4 @@ public HttpHeadResponse getHttpHeadResponse() { public void downloadFile(final File destination) throws IOException { FileUtils.copyURLToFile(getCrawlRequest().getRequestUrl().toURL(), destination); } - - public static final class NonHtmlResponseBuilder extends CallbackParameterBuilder { - - private HttpHeadResponse httpHeadResponse; - - public NonHtmlResponseBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { - super(refererUrl, crawlDepth, crawlRequest); - } - - public NonHtmlResponseBuilder setHttpHeadResponse(final HttpHeadResponse httpHeadResponse) { - this.httpHeadResponse = httpHeadResponse; - return this; - } - - @Override - public NonHtmlResponse build() { - return new NonHtmlResponse(this); - } - } } diff --git a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java index 7d379d5..97ca2cb 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java @@ -28,10 +28,19 @@ public final class UnsuccessfulRequest extends CallbackParameter { private final IOException exception; - private UnsuccessfulRequest(final UnsuccessfulRequestBuilder builder) { - super(builder); + /** + * Constructs a UnsuccessfulRequest instance. 
+ * + * @param refererUrl The referer URL + * @param crawlDepth The current crawl depth + * @param crawlRequest The processed crawl request + * @param exception The exception that was thrown while trying to fulfill + * the request + */ + public UnsuccessfulRequest(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest, final IOException exception) { + super(refererUrl, crawlDepth, crawlRequest); - exception = builder.exception; + this.exception = exception; } /** @@ -43,23 +52,4 @@ private UnsuccessfulRequest(final UnsuccessfulRequestBuilder builder) { public IOException getException() { return exception; } - - public static final class UnsuccessfulRequestBuilder extends CallbackParameterBuilder { - - private IOException exception; - - public UnsuccessfulRequestBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { - super(refererUrl, crawlDepth, crawlRequest); - } - - public UnsuccessfulRequestBuilder setException(final IOException exception) { - this.exception = exception; - return this; - } - - @Override - public UnsuccessfulRequest build() { - return new UnsuccessfulRequest(this); - } - } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java index cb6ae0b..61f47df 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java @@ -30,10 +30,17 @@ public abstract class CallbackParameter { private final URI refererUrl; private final CrawlRequest crawlRequest; - protected CallbackParameter(final CallbackParameterBuilder builder) { - crawlDepth = builder.crawlDepth; - refererUrl = builder.refererUrl; - crawlRequest = builder.crawlRequest; + /** + * Base constructor for the callback parameters. 
+ * + * @param refererUrl The referer URL + * @param crawlDepth The current crawl depth + * @param crawlRequest The processed crawl request + */ + protected CallbackParameter(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { + this.refererUrl = refererUrl; + this.crawlDepth = crawlDepth; + this.crawlRequest = crawlRequest; } /** @@ -62,19 +69,4 @@ public final int getCrawlDepth() { public final CrawlRequest getCrawlRequest() { return crawlRequest; } - - public static abstract class CallbackParameterBuilder { - - private final URI refererUrl; - private final int crawlDepth; - private final CrawlRequest crawlRequest; - - public CallbackParameterBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { - this.refererUrl = refererUrl; - this.crawlDepth = crawlDepth; - this.crawlRequest = crawlRequest; - } - - public abstract CallbackParameter build(); - } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java index b5041b9..1639570 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java @@ -19,6 +19,7 @@ import com.google.common.net.InternetDomainName; import java.io.Serializable; import java.net.URI; +import java.util.Optional; /** * Represents a candidate for crawling that will be surely processed by the @@ -82,6 +83,15 @@ public int getCrawlDepth() { public int getPriority() { return crawlRequest.getPriority(); } + + /** + * Returns metadata associated with the request. + * + * @return The request's metadata + */ + public Optional getMetadata() { + return crawlRequest.getMetadata(); + } /** * Returns the crawl request from which this candidate was constructed. 
From 4f615bd9ef51d7385d542e3ef429c3c9d7a5b39a Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 27 May 2018 22:51:22 +0200 Subject: [PATCH 06/28] Add null check for metadata setter --- .../java/com/github/peterbencze/serritor/api/CrawlRequest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java index 4188a54..8036ac8 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java @@ -21,6 +21,7 @@ import java.io.Serializable; import java.net.URI; import java.util.Optional; +import org.apache.commons.lang3.Validate; /** * Represents a crawl request that might be processed by the crawler in the @@ -138,7 +139,7 @@ public CrawlRequestBuilder setPriority(final int priority) { * @return The CrawlRequestBuilder instance */ public CrawlRequestBuilder setMetadata(final Serializable metadata) { - this.metadata = metadata; + this.metadata = Validate.notNull(metadata, "The metadata cannot be null."); return this; } From 5b99e06dc5cf93fd5c4c9d0897a713526c3e6e4c Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Mon, 28 May 2018 23:58:52 +0200 Subject: [PATCH 07/28] Add the possibility of accessing crawl candidate in callback parameters --- .../peterbencze/serritor/api/BaseCrawler.java | 12 ++-- .../{internal => api}/CrawlCandidate.java | 55 ++++++++++++------- .../serritor/api/CrawlRequest.java | 52 +++++++++--------- .../serritor/api/HtmlResponse.java | 19 +++---- .../serritor/api/NonHtmlResponse.java | 24 ++++---- .../serritor/api/UnsuccessfulRequest.java | 15 ++--- .../serritor/internal/CallbackParameter.java | 48 ++++------------ .../serritor/internal/CrawlFrontier.java | 3 +- .../serritor/internal/CrawlFrontierTest.java | 1 + 9 files changed, 103 insertions(+), 126 deletions(-) rename src/main/java/com/github/peterbencze/serritor/{internal => api}/CrawlCandidate.java (66%) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 2e12f06..bb9841b 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -17,7 +17,6 @@ import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; import com.github.peterbencze.serritor.internal.AdaptiveCrawlDelayMechanism; -import com.github.peterbencze.serritor.internal.CrawlCandidate; import com.github.peterbencze.serritor.internal.CrawlDelayMechanism; import com.github.peterbencze.serritor.internal.CrawlFrontier; import com.github.peterbencze.serritor.internal.FixedCrawlDelayMechanism; @@ -201,9 +200,6 @@ private void run() { while (!stopCrawling && crawlFrontier.hasNextCandidate()) { CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate(); URI candidateUrl = currentCandidate.getCandidateUrl(); - URI refererUrl = currentCandidate.getRefererUrl(); - int crawlDepth = currentCandidate.getCrawlDepth(); - CrawlRequest crawlRequest = currentCandidate.getCrawlRequest(); URI responseUrl = candidateUrl; HttpClientContext context = HttpClientContext.create(); HttpResponse httpHeadResponse = null; @@ -216,7 +212,7 @@ private void run() { // Send an HTTP HEAD request to the current URL to determine its availability and content type httpHeadResponse = getHttpHeadResponse(candidateUrl, context); } catch 
(IOException exception) { - onUnsuccessfulRequest(new UnsuccessfulRequest(refererUrl, crawlDepth, crawlRequest, exception)); + onUnsuccessfulRequest(new UnsuccessfulRequest(currentCandidate, exception)); isUnsuccessfulRequest = true; } @@ -232,10 +228,10 @@ private void run() { CrawlRequestBuilder builder = new CrawlRequestBuilder(responseUrl).setPriority(currentCandidate.getPriority()); currentCandidate.getMetadata().ifPresent(builder::setMetadata); - + crawlFrontier.feedRequest(builder.build(), false); } else if (isContentHtml(httpHeadResponse)) { - HtmlResponse response = new HtmlResponse(refererUrl, crawlDepth, crawlRequest, webDriver); + HtmlResponse response = new HtmlResponse(currentCandidate, webDriver); try { // Open the URL in the browser @@ -247,7 +243,7 @@ private void run() { onResponseComplete(response); } else { // URLs that point to non-HTML content should not be opened in the browser - onNonHtmlResponse(new NonHtmlResponse(refererUrl, crawlDepth, crawlRequest)); + onNonHtmlResponse(new NonHtmlResponse(currentCandidate)); } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java similarity index 66% rename from src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java rename to src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java index 1639570..43571c8 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java @@ -13,17 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.github.peterbencze.serritor.internal; +package com.github.peterbencze.serritor.api; -import com.github.peterbencze.serritor.api.CrawlRequest; import com.google.common.net.InternetDomainName; import java.io.Serializable; import java.net.URI; import java.util.Optional; /** - * Represents a candidate for crawling that will be surely processed by the - * crawler. + * Represents a candidate to be crawled by the crawler. * * @author Peter Bencze */ @@ -33,14 +31,14 @@ public final class CrawlCandidate implements Serializable { private final int crawlDepth; private final CrawlRequest crawlRequest; - public CrawlCandidate(final CrawlCandidateBuilder builder) { + private CrawlCandidate(final CrawlCandidateBuilder builder) { this.crawlRequest = builder.crawlRequest; this.refererUrl = builder.refererUrl; this.crawlDepth = builder.crawlDepth; } /** - * Returns the referer's URL. + * Returns the referer URL. * * @return The URL of the referer */ @@ -49,7 +47,7 @@ public URI getRefererUrl() { } /** - * Returns the candidate's URL. + * Returns the candidate URL. * * @return The URL of the candidate */ @@ -58,7 +56,7 @@ public URI getCandidateUrl() { } /** - * Returns the domain of the candidate's URL. + * Returns the domain of the candidate URL. * * @return The domain of the candidate URL */ @@ -69,7 +67,7 @@ public InternetDomainName getDomain() { /** * Returns the crawl depth of the candidate. * - * @return The crawl depth + * @return The crawl depth of the candidate */ public int getCrawlDepth() { return crawlDepth; @@ -78,30 +76,24 @@ public int getCrawlDepth() { /** * Returns the priority of the candidate. * - * @return The priority + * @return The priority of the candidate */ public int getPriority() { return crawlRequest.getPriority(); } - + /** - * Returns metadata associated with the request. 
+ * Returns the metadata associated with the candidate. * - * @return The request's metadata + * @return The metadata associated with the candidate */ public Optional getMetadata() { return crawlRequest.getMetadata(); } /** - * Returns the crawl request from which this candidate was constructed. - * - * @return The CrawlRequest instance + * Builds crawl candidates to be crawled by the crawler. */ - public CrawlRequest getCrawlRequest() { - return crawlRequest; - } - public static final class CrawlCandidateBuilder { private final CrawlRequest crawlRequest; @@ -109,20 +101,43 @@ public static final class CrawlCandidateBuilder { private URI refererUrl; private int crawlDepth; + /** + * Creates a {@link CrawlCandidateBuilder} instance. + * + * @param request The {@link CrawlRequest} instance from which this + * candidate is built + */ public CrawlCandidateBuilder(final CrawlRequest request) { crawlRequest = request; } + /** + * Sets the referer URL. + * + * @param refererUrl The referer URL + * @return The {@link CrawlCandidateBuilder} instance + */ public CrawlCandidateBuilder setRefererUrl(final URI refererUrl) { this.refererUrl = refererUrl; return this; } + /** + * Sets the crawl depth of the candidate. + * + * @param crawlDepth The crawl depth of the candidate + * @return The {@link CrawlCandidateBuilder} instance + */ public CrawlCandidateBuilder setCrawlDepth(final int crawlDepth) { this.crawlDepth = crawlDepth; return this; } + /** + * Builds the configured {@link CrawlCandidate} instance. + * + * @return The configured {@link CrawlCandidate} instance + */ public CrawlCandidate build() { return new CrawlCandidate(this); } diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java index 8036ac8..3f0f7a7 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java @@ -24,9 +24,8 @@ import org.apache.commons.lang3.Validate; /** - * Represents a crawl request that might be processed by the crawler in the - * future. The reason why it is not sure that it will be processed is because it - * might get filtered out by one of the enabled filters. + * Represents a crawl request that may be completed by the crawler. If request + * filtering is enabled, it could get filtered out. * * @author Peter Bencze */ @@ -35,7 +34,7 @@ public final class CrawlRequest implements Serializable { private final URI requestUrl; private final int priority; private final Serializable metadata; - + private transient InternetDomainName domain; private CrawlRequest(final CrawlRequestBuilder builder) { @@ -46,16 +45,16 @@ private CrawlRequest(final CrawlRequestBuilder builder) { } /** - * Returns the request's URL. + * Returns the request URL. * - * @return The URL of the request + * @return The request URL */ public URI getRequestUrl() { return requestUrl; } /** - * Returns the domain of the request's URL. + * Returns the domain of the request URL. * * @return The domain of the request URL */ @@ -64,7 +63,7 @@ public InternetDomainName getDomain() { } /** - * Returns the request's priority. + * Returns the priority of the request. * * @return The priority of the request */ @@ -73,30 +72,31 @@ public int getPriority() { } /** - * Returns metadata associated with the request. + * Returns the metadata associated with the request. 
* - * @return The request's metadata + * @return The metadata associated with the request */ public Optional getMetadata() { return Optional.ofNullable(metadata); } + /** + * Builds crawl requests which can be fed to the crawler. + */ public static final class CrawlRequestBuilder { private static final int DEFAULT_PRIORITY = 0; private final URI requestUrl; private final InternetDomainName domain; - + private int priority; private Serializable metadata; /** - * Constructs a CrawlRequestBuilder instance that can be - * used to create CrawRequest instances. + * Creates a {@link CrawlRequestBuilder} instance. * - * @param requestUrl The request's URL given as a URL - * instance + * @param requestUrl The request URL */ public CrawlRequestBuilder(final URI requestUrl) { this.requestUrl = requestUrl; @@ -109,22 +109,20 @@ public CrawlRequestBuilder(final URI requestUrl) { } /** - * Constructs a CrawlRequestBuilder instance that can be - * used to create CrawRequest instances. + * Creates a {@link CrawlRequestBuilder} instance. * - * @param requestUrl The request's URL given as a String - * instance + * @param requestUrl The request URL */ public CrawlRequestBuilder(final String requestUrl) { this(URI.create(requestUrl)); } /** - * Sets the request's priority. + * Sets the priority of the request. * * @param priority The priority of the request (higher number means * higher priority) - * @return The CrawlRequestBuilder instance + * @return The {@link CrawlRequestBuilder} instance */ public CrawlRequestBuilder setPriority(final int priority) { this.priority = priority; @@ -132,11 +130,11 @@ public CrawlRequestBuilder setPriority(final int priority) { } /** - * Sets additional metadata for the request which can be later accessed - * when the crawler processed the request. + * Sets the metadata of the request which can be later accessed when the + * crawler completed the request. * * @param metadata The metadata associated with the request - * @return The CrawlRequestBuilder instance + * @return The {@link CrawlRequestBuilder} instance */ public CrawlRequestBuilder setMetadata(final Serializable metadata) { this.metadata = Validate.notNull(metadata, "The metadata cannot be null."); @@ -144,9 +142,9 @@ public CrawlRequestBuilder setMetadata(final Serializable metadata) { } /** - * Builds the configured CrawlRequest instance. + * Builds the configured {@link CrawlRequest} instance. * - * @return The configured CrawlRequest instance + * @return The configured {@link CrawlRequest} instance */ public CrawlRequest build() { return new CrawlRequest(this); @@ -155,7 +153,7 @@ public CrawlRequest build() { private void readObject(final ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); - + domain = InternetDomainName.from(requestUrl.getHost()); } } diff --git a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java index 563fa83..20d4a6e 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java @@ -16,7 +16,6 @@ package com.github.peterbencze.serritor.api; import com.github.peterbencze.serritor.internal.CallbackParameter; -import java.net.URI; import org.openqa.selenium.WebDriver; /** @@ -29,23 +28,21 @@ public final class HtmlResponse extends CallbackParameter { private final WebDriver webDriver; /** - * Constructs a HtmlResponse instance. 
- * - * @param refererUrl The referer URL - * @param crawlDepth The current crawl depth - * @param crawlRequest The processed crawl request - * @param webDriver The WebDriver instance + * Creates an {@link HtmlResponse} instance. + * + * @param crawlCandidate The crawled {@link CrawlCandidate} instance + * @param webDriver The {@link WebDriver} instance */ - public HtmlResponse(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest, final WebDriver webDriver) { - super(refererUrl, crawlDepth, crawlRequest); + public HtmlResponse(final CrawlCandidate crawlCandidate, final WebDriver webDriver) { + super(crawlCandidate); this.webDriver = webDriver; } /** - * Returns the WebDriver instance for the browser. + * Returns the {@link WebDriver} instance of the browser. * - * @return The WebDriver instance + * @return The {@link WebDriver} instance of the browser */ public WebDriver getWebDriver() { return webDriver; diff --git a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java index 7de2862..e2cbedb 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java @@ -18,7 +18,6 @@ import com.github.peterbencze.serritor.internal.CallbackParameter; import java.io.File; import java.io.IOException; -import java.net.URI; import org.apache.commons.io.FileUtils; /** @@ -29,23 +28,22 @@ public final class NonHtmlResponse extends CallbackParameter { /** - * Constructs a NonHtmlResponse instance. - * - * @param refererUrl The referer URL - * @param crawlDepth The current crawl depth - * @param crawlRequest The processed crawl request + * Creates a {@link NonHtmlResponse} instance. + * + * @param crawlCandidate The crawled {@link CrawlCandidate} instance */ - public NonHtmlResponse(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { - super(refererUrl, crawlDepth, crawlRequest); + public NonHtmlResponse(final CrawlCandidate crawlCandidate) { + super(crawlCandidate); } - + /** * Downloads the file specified by the request URL. - * - * @param destination The destination File instance - * @throws IOException If the URL cannot be opened or I/O error occurs while downloading the file + * + * @param destination The destination {@link File} instance + * @throws IOException If the URL cannot be opened or I/O error occurs while + * downloading the file */ public void downloadFile(final File destination) throws IOException { - FileUtils.copyURLToFile(getCrawlRequest().getRequestUrl().toURL(), destination); + FileUtils.copyURLToFile(getCrawlCandidate().getCandidateUrl().toURL(), destination); } } diff --git a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java index 97ca2cb..a55c970 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java @@ -17,7 +17,6 @@ import com.github.peterbencze.serritor.internal.CallbackParameter; import java.io.IOException; -import java.net.URI; /** * Represents an unsuccessful request. @@ -29,25 +28,23 @@ public final class UnsuccessfulRequest extends CallbackParameter { private final IOException exception; /** - * Constructs a UnsuccessfulRequest instance. + * Creates an {@link UnsuccessfulRequest} instance. 
* - * @param refererUrl The referer URL - * @param crawlDepth The current crawl depth - * @param crawlRequest The processed crawl request + * @param crawlCandidate The crawled {@link CrawlCandidate} instance * @param exception The exception that was thrown while trying to fulfill * the request */ - public UnsuccessfulRequest(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest, final IOException exception) { - super(refererUrl, crawlDepth, crawlRequest); + public UnsuccessfulRequest(final CrawlCandidate crawlCandidate, final IOException exception) { + super(crawlCandidate); this.exception = exception; } /** - * Returns the exception that was thrown while trying to fulfill the + * Returns the exception which was thrown while trying to fulfill the * request. * - * @return The IOException instance + * @return The thrown {@link IOException} instance */ public IOException getException() { return exception; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java index 61f47df..881e3d6 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java @@ -15,58 +15,32 @@ */ package com.github.peterbencze.serritor.internal; -import com.github.peterbencze.serritor.api.CrawlRequest; -import java.net.URI; -import java.util.Optional; +import com.github.peterbencze.serritor.api.CrawlCandidate; /** - * The base class from which all callback parameters inherit from. + * Base class from which all callback parameters inherit from. * * @author Peter Bencze */ public abstract class CallbackParameter { - private final int crawlDepth; - private final URI refererUrl; - private final CrawlRequest crawlRequest; + private final CrawlCandidate crawlCandidate; /** - * Base constructor for the callback parameters. - * - * @param refererUrl The referer URL - * @param crawlDepth The current crawl depth - * @param crawlRequest The processed crawl request - */ - protected CallbackParameter(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) { - this.refererUrl = refererUrl; - this.crawlDepth = crawlDepth; - this.crawlRequest = crawlRequest; - } - - /** - * Returns the referer URL. - * - * @return The referer URL - */ - public final Optional getRefererUrl() { - return Optional.ofNullable(refererUrl); - } - - /** - * Returns the current crawl depth. + * Base constructor of callback parameters. * - * @return The current crawl depth + * @param crawlCandidate The crawled {@link CrawlCandidate} instance */ - public final int getCrawlDepth() { - return crawlDepth; + protected CallbackParameter(final CrawlCandidate crawlCandidate) { + this.crawlCandidate = crawlCandidate; } /** - * Returns the crawl request that was processed by the crawler. + * Returns the crawl candidate which was crawled by the crawler. 
* - * @return The processed CrawlRequest instance + * @return The crawled {@link CrawlCandidate} instance */ - public final CrawlRequest getCrawlRequest() { - return crawlRequest; + public final CrawlCandidate getCrawlCandidate() { + return crawlCandidate; } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index bdcf569..c38cd3e 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -15,9 +15,10 @@ */ package com.github.peterbencze.serritor.internal; +import com.github.peterbencze.serritor.api.CrawlCandidate; import com.github.peterbencze.serritor.api.CrawlerConfiguration; import com.github.peterbencze.serritor.api.CrawlRequest; -import com.github.peterbencze.serritor.internal.CrawlCandidate.CrawlCandidateBuilder; +import com.github.peterbencze.serritor.api.CrawlCandidate.CrawlCandidateBuilder; import java.io.Serializable; import java.net.URI; import java.util.Arrays; diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 79c5131..6e38a26 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -15,6 +15,7 @@ */ package com.github.peterbencze.serritor.internal; +import com.github.peterbencze.serritor.api.CrawlCandidate; import com.github.peterbencze.serritor.api.CrawlerConfiguration; import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; From b88b15870d0dc9d54a3977c6f8a587ab486d41fe Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 3 Jun 2018 18:31:55 +0200 Subject: [PATCH 08/28] Modify event handling --- .../peterbencze/serritor/api/BaseCrawler.java | 76 ++++++++++++------- .../NonHtmlContentEvent.java} | 21 ++--- .../PageLoadEvent.java} | 21 ++--- .../api/event/PageLoadTimeoutEvent.java | 52 +++++++++++++ .../RequestErrorEvent.java} | 23 +++--- .../api/event/RequestRedirectEvent.java | 51 +++++++++++++ .../serritor/api/helper/UrlFinder.java | 12 +-- ...allbackParameter.java => EventObject.java} | 14 ++-- .../serritor/api/helper/UrlFinderTest.java | 12 +-- 9 files changed, 202 insertions(+), 80 deletions(-) rename src/main/java/com/github/peterbencze/serritor/api/{NonHtmlResponse.java => event/NonHtmlContentEvent.java} (59%) rename src/main/java/com/github/peterbencze/serritor/api/{HtmlResponse.java => event/PageLoadEvent.java} (55%) create mode 100644 src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java rename src/main/java/com/github/peterbencze/serritor/api/{UnsuccessfulRequest.java => event/RequestErrorEvent.java} (54%) create mode 100644 src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java rename src/main/java/com/github/peterbencze/serritor/internal/{CallbackParameter.java => EventObject.java} (67%) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index bb9841b..09c88ce 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -16,6 +16,11 @@ package 
com.github.peterbencze.serritor.api; import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; +import com.github.peterbencze.serritor.api.event.NonHtmlContentEvent; +import com.github.peterbencze.serritor.api.event.PageLoadEvent; +import com.github.peterbencze.serritor.api.event.PageLoadTimeoutEvent; +import com.github.peterbencze.serritor.api.event.RequestErrorEvent; +import com.github.peterbencze.serritor.api.event.RequestRedirectEvent; import com.github.peterbencze.serritor.internal.AdaptiveCrawlDelayMechanism; import com.github.peterbencze.serritor.internal.CrawlDelayMechanism; import com.github.peterbencze.serritor.internal.CrawlFrontier; @@ -195,7 +200,7 @@ protected final void crawl(final List requests) { * Defines the workflow of the crawler. */ private void run() { - onBegin(); + onStart(); while (!stopCrawling && crawlFrontier.hasNextCandidate()) { CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate(); @@ -212,7 +217,7 @@ private void run() { // Send an HTTP HEAD request to the current URL to determine its availability and content type httpHeadResponse = getHttpHeadResponse(candidateUrl, context); } catch (IOException exception) { - onUnsuccessfulRequest(new UnsuccessfulRequest(currentCandidate, exception)); + onRequestError(new RequestErrorEvent(currentCandidate, exception)); isUnsuccessfulRequest = true; } @@ -225,32 +230,39 @@ private void run() { if (!responseUrl.equals(candidateUrl)) { // If the request has been redirected, a new crawl request should be created for the redirected URL - CrawlRequestBuilder builder = new CrawlRequestBuilder(responseUrl).setPriority(currentCandidate.getPriority()); currentCandidate.getMetadata().ifPresent(builder::setMetadata); + CrawlRequest redirectedRequest = builder.build(); - crawlFrontier.feedRequest(builder.build(), false); + crawlFrontier.feedRequest(redirectedRequest, false); + onRequestRedirect(new RequestRedirectEvent(currentCandidate, redirectedRequest)); } else if (isContentHtml(httpHeadResponse)) { - HtmlResponse response = new HtmlResponse(currentCandidate, webDriver); + boolean isTimedOut = false; + TimeoutException exception = null; try { // Open the URL in the browser webDriver.get(candidateUrl.toString()); - } catch (TimeoutException exception) { - onResponseTimeout(response); + } catch (TimeoutException exc) { + isTimedOut = true; + exception = exc; } - onResponseComplete(response); + if (!isTimedOut) { + onPageLoad(new PageLoadEvent(currentCandidate, webDriver)); + } else { + onPageLoadTimeout(new PageLoadTimeoutEvent(currentCandidate, exception)); + } } else { // URLs that point to non-HTML content should not be opened in the browser - onNonHtmlResponse(new NonHtmlResponse(currentCandidate)); + onNonHtmlContent(new NonHtmlContentEvent(currentCandidate)); } } performDelay(); } - onFinish(); + onStop(); } /** @@ -344,49 +356,55 @@ private static BasicClientCookie convertBrowserCookie(final Cookie browserCookie } /** - * Called when the crawler is about to begin its operation. + * Callback which gets called when the crawler is started. + */ + protected void onStart() { + } + + /** + * Callback which gets called when the browser loads the page. + * + * @param event The {@link PageLoadEvent} instance */ - protected void onBegin() { + protected void onPageLoad(final PageLoadEvent event) { } /** - * Called after the browser loads the given URL. + * Callback which gets called when the content type is not HTML. 
* - * @param response The HTML response + * @param event The {@link NonHtmlContentEvent} instance */ - protected void onResponseComplete(final HtmlResponse response) { + protected void onNonHtmlContent(final NonHtmlContentEvent event) { } /** - * Called when the loading of the given URL times out in the browser. Use - * this callback with caution: the page might be half-loaded or not loaded - * at all. + * Callback which gets called when a request error occurs. * - * @param response The HTML response + * @param event The {@link RequestErrorEvent} instance */ - protected void onResponseTimeout(final HtmlResponse response) { + protected void onRequestError(final RequestErrorEvent event) { } /** - * Called when getting a non-HTML response. + * Callback which gets called when a request is redirected. * - * @param response The non-HTML response + * @param event The {@link RequestRedirectEvent} instance */ - protected void onNonHtmlResponse(final NonHtmlResponse response) { + protected void onRequestRedirect(final RequestRedirectEvent event) { } /** - * Called when an exception occurs while sending an initial HEAD request to - * the given URL. + * Callback which gets called when the page does not load in the browser + * within the timeout period. * - * @param request The unsuccessful request + * @param event The {@link PageLoadTimeoutEvent} instance */ - protected void onUnsuccessfulRequest(final UnsuccessfulRequest request) { + protected void onPageLoadTimeout(final PageLoadTimeoutEvent event) { } /** - * Called when the crawler successfully finishes its operation. + * Callback which gets called when the crawler is stopped. */ - protected void onFinish() { + protected void onStop() { } } diff --git a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java similarity index 59% rename from src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java rename to src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java index e2cbedb..7fc4670 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java @@ -13,34 +13,35 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.github.peterbencze.serritor.api; +package com.github.peterbencze.serritor.api.event; -import com.github.peterbencze.serritor.internal.CallbackParameter; +import com.github.peterbencze.serritor.api.CrawlCandidate; +import com.github.peterbencze.serritor.internal.EventObject; import java.io.File; import java.io.IOException; import org.apache.commons.io.FileUtils; /** - * Represents a non-HTML response. + * Event which gets delivered when the content type is not HTML. * * @author Peter Bencze */ -public final class NonHtmlResponse extends CallbackParameter { +public final class NonHtmlContentEvent extends EventObject { /** - * Creates a {@link NonHtmlResponse} instance. + * Creates a {@link NonHtmlContentEvent} instance. * - * @param crawlCandidate The crawled {@link CrawlCandidate} instance + * @param crawlCandidate the current crawl candidate */ - public NonHtmlResponse(final CrawlCandidate crawlCandidate) { + public NonHtmlContentEvent(final CrawlCandidate crawlCandidate) { super(crawlCandidate); } /** - * Downloads the file specified by the request URL. + * Downloads the file specified by the URL. 
* - * @param destination The destination {@link File} instance - * @throws IOException If the URL cannot be opened or I/O error occurs while + * @param destination the destination file + * @throws IOException if the URL cannot be opened or I/O error occurs while * downloading the file */ public void downloadFile(final File destination) throws IOException { diff --git a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java similarity index 55% rename from src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java rename to src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java index 20d4a6e..cd3726b 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java @@ -13,36 +13,37 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.github.peterbencze.serritor.api; +package com.github.peterbencze.serritor.api.event; -import com.github.peterbencze.serritor.internal.CallbackParameter; +import com.github.peterbencze.serritor.api.CrawlCandidate; +import com.github.peterbencze.serritor.internal.EventObject; import org.openqa.selenium.WebDriver; /** - * Represents an HTML response. + * Event which gets delivered when the browser loads the page. * * @author Peter Bencze */ -public final class HtmlResponse extends CallbackParameter { +public final class PageLoadEvent extends EventObject { private final WebDriver webDriver; /** - * Creates an {@link HtmlResponse} instance. + * Creates a {@link PageLoadEvent} instance. * - * @param crawlCandidate The crawled {@link CrawlCandidate} instance - * @param webDriver The {@link WebDriver} instance + * @param crawlCandidate the current crawl candidate + * @param webDriver the WebDriver to control the browser */ - public HtmlResponse(final CrawlCandidate crawlCandidate, final WebDriver webDriver) { + public PageLoadEvent(final CrawlCandidate crawlCandidate, final WebDriver webDriver) { super(crawlCandidate); this.webDriver = webDriver; } /** - * Returns the {@link WebDriver} instance of the browser. + * Returns the WebDriver to control the browser. * - * @return The {@link WebDriver} instance of the browser + * @return the WebDriver to control the browser */ public WebDriver getWebDriver() { return webDriver; diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java new file mode 100644 index 0000000..a2b88b8 --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java @@ -0,0 +1,52 @@ +/* + * Copyright 2018 Peter Bencze. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.github.peterbencze.serritor.api.event; + +import com.github.peterbencze.serritor.api.CrawlCandidate; +import com.github.peterbencze.serritor.internal.EventObject; +import org.openqa.selenium.TimeoutException; + +/** + * Event which gets delivered when a page does not load in the browser within + * the timeout period. + * + * @author Peter Bencze + */ +public final class PageLoadTimeoutEvent extends EventObject { + + private final TimeoutException exception; + + /** + * Creates a {@link PageLoadTimeoutEvent} instance. + * + * @param crawlCandidate the current crawl candidate + * @param exception the thrown exception + */ + public PageLoadTimeoutEvent(final CrawlCandidate crawlCandidate, final TimeoutException exception) { + super(crawlCandidate); + + this.exception = exception; + } + + /** + * Returns the thrown exception. + * + * @return the thrown exception + */ + public TimeoutException getException() { + return exception; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java similarity index 54% rename from src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java rename to src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java index a55c970..305840b 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java @@ -13,38 +13,37 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.github.peterbencze.serritor.api; +package com.github.peterbencze.serritor.api.event; -import com.github.peterbencze.serritor.internal.CallbackParameter; +import com.github.peterbencze.serritor.api.CrawlCandidate; +import com.github.peterbencze.serritor.internal.EventObject; import java.io.IOException; /** - * Represents an unsuccessful request. + * Event which gets delivered when a request error occurs. * * @author Peter Bencze */ -public final class UnsuccessfulRequest extends CallbackParameter { +public final class RequestErrorEvent extends EventObject { private final IOException exception; /** - * Creates an {@link UnsuccessfulRequest} instance. + * Creates a {@link RequestErrorEvent} instance. * - * @param crawlCandidate The crawled {@link CrawlCandidate} instance - * @param exception The exception that was thrown while trying to fulfill - * the request + * @param crawlCandidate the current crawl candidate + * @param exception the thrown exception */ - public UnsuccessfulRequest(final CrawlCandidate crawlCandidate, final IOException exception) { + public RequestErrorEvent(final CrawlCandidate crawlCandidate, final IOException exception) { super(crawlCandidate); this.exception = exception; } /** - * Returns the exception which was thrown while trying to fulfill the - * request. + * Returns the thrown exception. * - * @return The thrown {@link IOException} instance + * @return the thrown exception */ public IOException getException() { return exception; diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java new file mode 100644 index 0000000..d142d0a --- /dev/null +++ b/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java @@ -0,0 +1,51 @@ +/* + * Copyright 2018 Peter Bencze. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.peterbencze.serritor.api.event; + +import com.github.peterbencze.serritor.api.CrawlCandidate; +import com.github.peterbencze.serritor.api.CrawlRequest; +import com.github.peterbencze.serritor.internal.EventObject; + +/** + * Event which gets delivered when a request is redirected. + * + * @author Peter Bencze + */ +public final class RequestRedirectEvent extends EventObject { + + private final CrawlRequest redirectedCrawlRequest; + + /** + * Creates a {@link RequestRedirectEvent} instance. + * + * @param crawlCandidate the current crawl candidate + * @param redirectedCrawlRequest the crawl request for the redirected URL + */ + public RequestRedirectEvent(final CrawlCandidate crawlCandidate, final CrawlRequest redirectedCrawlRequest) { + super(crawlCandidate); + + this.redirectedCrawlRequest = redirectedCrawlRequest; + } + + /** + * Returns the crawl request for the redirected URL. + * + * @return the crawl request for the redirected URL + */ + public CrawlRequest getRedirectedCrawlRequest() { + return redirectedCrawlRequest; + } +} diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java index 4015764..8347654 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -15,7 +15,7 @@ */ package com.github.peterbencze.serritor.api.helper; -import com.github.peterbencze.serritor.api.HtmlResponse; +import com.github.peterbencze.serritor.api.event.PageLoadEvent; import com.google.common.collect.Sets; import com.google.common.net.InternetDomainName; import java.net.URI; @@ -54,17 +54,17 @@ private UrlFinder(final UrlFinderBuilder builder) { } /** - * Returns a list of validated URLs found in the response's HTML source. + * Returns a list of validated URLs found in the page's HTML source. 
* - * @param response The HtmlResponse instance - * @return The list of found URLs + * @param event the {@link PageLoadEvent} instance + * @return the list of found URLs in the page's HTML source */ - public List findUrlsInResponse(final HtmlResponse response) { + public List findUrlsInPage(final PageLoadEvent event) { Set foundUrls = new HashSet<>(); // Find elements using the specified locating mechanisms Set extractedElements = locatingMechanisms.stream() - .map(response.getWebDriver()::findElements) + .map(event.getWebDriver()::findElements) .flatMap(List::stream) .collect(Collectors.toSet()); diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java b/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java similarity index 67% rename from src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java rename to src/main/java/com/github/peterbencze/serritor/internal/EventObject.java index 881e3d6..89d6c33 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java @@ -18,27 +18,27 @@ import com.github.peterbencze.serritor.api.CrawlCandidate; /** - * Base class from which all callback parameters inherit from. + * Base class from which all event objects shall be derived. * * @author Peter Bencze */ -public abstract class CallbackParameter { +public abstract class EventObject { private final CrawlCandidate crawlCandidate; /** - * Base constructor of callback parameters. + * Base constructor of all event objects. * - * @param crawlCandidate The crawled {@link CrawlCandidate} instance + * @param crawlCandidate the current crawl candidate */ - protected CallbackParameter(final CrawlCandidate crawlCandidate) { + protected EventObject(final CrawlCandidate crawlCandidate) { this.crawlCandidate = crawlCandidate; } /** - * Returns the crawl candidate which was crawled by the crawler. + * Returns the current crawl candidate. 
* - * @return The crawled {@link CrawlCandidate} instance + * @return the current crawl candidate */ public final CrawlCandidate getCrawlCandidate() { return crawlCandidate; diff --git a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java index 86e5fa6..9412b5b 100644 --- a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java +++ b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java @@ -15,7 +15,7 @@ */ package com.github.peterbencze.serritor.api.helper; -import com.github.peterbencze.serritor.api.HtmlResponse; +import com.github.peterbencze.serritor.api.event.PageLoadEvent; import com.github.peterbencze.serritor.api.helper.UrlFinder.UrlFinderBuilder; import java.util.Arrays; import java.util.List; @@ -43,7 +43,7 @@ public final class UrlFinderTest { private static final String URL_WITH_INVALID_DOMAIN = "http://invalid.domain"; private WebDriver mockedDriver; - private HtmlResponse mockedResponse; + private PageLoadEvent mockedEvent; private WebElement mockedElementWithValidUrl; private WebElement mockedElementWithInvalidUrlFormat; private WebElement mockedElementWithInvalidDomain; @@ -51,10 +51,10 @@ public final class UrlFinderTest { @Before public void initialize() { - mockedResponse = Mockito.mock(HtmlResponse.class); + mockedEvent = Mockito.mock(PageLoadEvent.class); mockedDriver = Mockito.mock(WebDriver.class); - Mockito.when(mockedResponse.getWebDriver()) + Mockito.when(mockedEvent.getWebDriver()) .thenReturn(mockedDriver); mockedElementWithValidUrl = Mockito.mock(WebElement.class); @@ -77,7 +77,7 @@ public void initialize() { } @Test - public void testFindUrlsInResponse() { - Assert.assertEquals(Arrays.asList(VALID_URL), urlFinder.findUrlsInResponse(mockedResponse)); + public void testFindUrlsInPage() { + Assert.assertEquals(Arrays.asList(VALID_URL), urlFinder.findUrlsInPage(mockedEvent)); } } From ed60d9f50f462580268d2ba55bb65ff430aec977 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 3 Jun 2018 23:40:44 +0200 Subject: [PATCH 09/28] Rewrite javadocs --- .../peterbencze/serritor/api/BaseCrawler.java | 97 ++++++++++--------- .../serritor/api/CrawlCandidate.java | 42 ++++---- .../serritor/api/CrawlDelayStrategy.java | 3 +- .../serritor/api/CrawlRequest.java | 29 +++--- .../serritor/api/CrawlStrategy.java | 3 +- .../serritor/api/CrawlerConfiguration.java | 88 +++++++++-------- .../serritor/api/helper/UrlFinder.java | 57 ++++++----- .../internal/AdaptiveCrawlDelayMechanism.java | 12 +-- .../internal/CrawlDelayMechanism.java | 12 +-- .../serritor/internal/CrawlDomain.java | 12 +-- .../serritor/internal/CrawlFrontier.java | 46 ++++----- .../internal/FixedCrawlDelayMechanism.java | 8 +- .../internal/RandomCrawlDelayMechanism.java | 8 +- .../serritor/api/helper/UrlFinderTest.java | 26 ++--- .../AdaptiveCrawlDelayMechanismTest.java | 33 +++---- .../serritor/internal/CrawlDomainTest.java | 24 ++--- .../serritor/internal/CrawlFrontierTest.java | 4 +- .../FixedCrawlDelayMechanismTest.java | 10 +- 18 files changed, 262 insertions(+), 252 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 09c88ce..83b0498 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -67,6 +67,11 @@ public abstract class BaseCrawler { private 
CrawlFrontier crawlFrontier; private CrawlDelayMechanism crawlDelayMechanism; + /** + * Base constructor of all crawlers. + * + * @param config the configuration of the crawler + */ protected BaseCrawler(final CrawlerConfiguration config) { this.config = config; @@ -82,23 +87,23 @@ public final void start() { } /** - * Starts the crawler using the browser specified by the + * Starts the crawler using the browser specified by the given * WebDriver instance. * - * @param driver The WebDriver instance that will be used by - * the crawler + * @param webDriver the WebDriver instance to control the + * browser */ - public final void start(final WebDriver driver) { - start(driver, new CrawlFrontier(config)); + public final void start(final WebDriver webDriver) { + start(webDriver, new CrawlFrontier(config)); } /** - * Constructs all the necessary objects and runs the crawler. + * Initializes and runs the crawler. * - * @param frontierToUse The CrawlFrontier instance to be used - * by the crawler. + * @param crawlFrontier the CrawlFrontier instance to be used + * by the crawler to manage crawl requests */ - private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { + private void start(final WebDriver webDriver, final CrawlFrontier crawlFrontier) { try { Validate.validState(isStopped, "The crawler is already started."); @@ -107,8 +112,8 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { httpClient = HttpClientBuilder.create() .setDefaultCookieStore(cookieStore) .build(); - webDriver = Validate.notNull(driver, "The webdriver cannot be null."); - crawlFrontier = frontierToUse; + this.webDriver = Validate.notNull(webDriver, "The webdriver cannot be null."); + this.crawlFrontier = crawlFrontier; crawlDelayMechanism = createCrawlDelayMechanism(); run(); @@ -122,9 +127,9 @@ private void start(final WebDriver driver, final CrawlFrontier frontierToUse) { } /** - * Saves the current state of the crawler to the specified output stream. + * Saves the current state of the crawler to the given output stream. * - * @param out The OutputStream instance to use + * @param out the output stream */ public final void saveState(final OutputStream out) { // Check if the crawler has been started at least once, otherwise we have nothing to save @@ -137,25 +142,25 @@ public final void saveState(final OutputStream out) { /** * Resumes a previously saved state using HtmlUnit headless browser. * - * @param in The InputStream instance to use + * @param in the input stream from which the state should be loaded */ public final void resumeState(final InputStream in) { resumeState(new HtmlUnitDriver(true), in); } /** - * Resumes a previously saved state using the browser specified by the - * WebDriver instance. + * Resumes a previously saved state using the browser specified by the given + * WebDriver instance. 
* - * @param driver The WebDriver instance to be used by the - * crawler - * @param in The InputStream instance to use + * @param webDriver the WebDriver instance to control the + * browser + * @param in the input stream from which the state should be loaded */ - public final void resumeState(final WebDriver driver, final InputStream in) { + public final void resumeState(final WebDriver webDriver, final InputStream in) { // Re-create crawl frontier from the saved state - CrawlFrontier frontierToUse = SerializationUtils.deserialize(in); + CrawlFrontier deserializedCrawlFrontier = SerializationUtils.deserialize(in); - start(driver, frontierToUse); + start(webDriver, deserializedCrawlFrontier); } /** @@ -170,12 +175,10 @@ public final void stop() { } /** - * Passes a crawl request to the crawl frontier. The crawler must be - * running, otherwise use - * {@link CrawlerConfiguration.CrawlerConfigurationBuilder#addCrawlSeed(com.github.peterbencze.serritor.api.CrawlRequest)} - * for adding crawl seeds. + * Feeds a crawl request to the crawler. The crawler should be running, + * otherwise the request has to be added as a crawl seed instead. * - * @param request The CrawlRequest instance + * @param request the crawl request */ protected final void crawl(final CrawlRequest request) { Validate.notNull(request, "The request cannot be null."); @@ -185,12 +188,10 @@ protected final void crawl(final CrawlRequest request) { } /** - * Passes multiple crawl requests to the crawl frontier. The crawler must be - * running, otherwise use - * {@link CrawlerConfiguration.CrawlerConfigurationBuilder#addCrawlSeeds(java.util.List)} - * for adding crawl seeds. + * Feeds multiple crawl requests to the crawler. The crawler should be + * running, otherwise the requests have to be added as crawl seeds instead. * - * @param requests The list of CrawlRequest instances + * @param requests the list of crawl requests */ protected final void crawl(final List requests) { requests.forEach(this::crawl); @@ -266,10 +267,12 @@ private void run() { } /** - * Returns a HTTP HEAD response for the given URL. + * Sends an HTTP HEAD request to the given URL and returns the response. * - * @param destinationUrl The URL to crawl - * @return The HTTP HEAD response + * @param destinationUrl the destination URL + * @throws IOException if an error occurs while trying to fulfill the + * request + * @return the HTTP HEAD response */ private HttpResponse getHttpHeadResponse(final URI destinationUrl, final HttpClientContext context) throws IOException { HttpHead headRequest = new HttpHead(destinationUrl.toString()); @@ -277,10 +280,10 @@ private HttpResponse getHttpHeadResponse(final URI destinationUrl, final HttpCli } /** - * Indicates if the content of the response is HTML or not. + * Indicates if the response's content type is HTML. * - * @param httpHeadResponse The HTTP HEAD response - * @return true if the content is HTML, false + * @param httpHeadResponse the HTTP HEAD response + * @return true if the content type is HTML, false * otherwise */ private static boolean isContentHtml(final HttpResponse httpHeadResponse) { @@ -289,9 +292,9 @@ private static boolean isContentHtml(final HttpResponse httpHeadResponse) { } /** - * Constructs the crawl delay mechanism specified in the configuration. + * Creates the crawl delay mechanism according to the configuration. 
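
The lifecycle methods documented above (start, saveState, resumeState) can be combined roughly as in the following sketch. The file name, the seed URL, and the minimal nested subclass are example values, not part of the library; the sketch assumes the crawl is allowed to finish before the state is saved.

    import com.github.peterbencze.serritor.api.BaseCrawler;
    import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
    import com.github.peterbencze.serritor.api.CrawlerConfiguration;
    import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.OutputStream;

    public final class CrawlerLifecycleExample {

        // Minimal BaseCrawler subclass used only for demonstration purposes
        private static final class MyCrawler extends BaseCrawler {

            private MyCrawler(final CrawlerConfiguration config) {
                super(config);
            }
        }

        public static void main(final String[] args) throws IOException {
            CrawlerConfiguration config = new CrawlerConfigurationBuilder()
                    .addAllowedCrawlDomain("example.com")
                    .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build())
                    .build();

            MyCrawler crawler = new MyCrawler(config);

            // Runs the crawl using the default HtmlUnit headless browser
            crawler.start();

            // The crawl frontier can be persisted to an output stream...
            try (OutputStream out = new FileOutputStream("crawler-state.bin")) {
                crawler.saveState(out);
            }

            // ...and restored later (possibly in a separate run) to resume crawling
            try (InputStream in = new FileInputStream("crawler-state.bin")) {
                crawler.resumeState(in);
            }
        }
    }
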
* - * @return The crawl delay mechanism + * @return the created crawl delay mechanism */ private CrawlDelayMechanism createCrawlDelayMechanism() { switch (config.getCrawlDelayStrategy()) { @@ -338,8 +341,8 @@ private void updateClientCookieStore() { /** * Converts a browser cookie to a HTTP client one. * - * @param browserCookie The browser cookie to be converted - * @return The converted HTTP client cookie + * @param browserCookie the browser cookie to be converted + * @return the converted HTTP client cookie */ private static BasicClientCookie convertBrowserCookie(final Cookie browserCookie) { BasicClientCookie clientCookie = new BasicClientCookie(browserCookie.getName(), browserCookie.getValue()); @@ -364,7 +367,7 @@ protected void onStart() { /** * Callback which gets called when the browser loads the page. * - * @param event The {@link PageLoadEvent} instance + * @param event the PageLoadEvent instance */ protected void onPageLoad(final PageLoadEvent event) { } @@ -372,7 +375,7 @@ protected void onPageLoad(final PageLoadEvent event) { /** * Callback which gets called when the content type is not HTML. * - * @param event The {@link NonHtmlContentEvent} instance + * @param event the NonHtmlContentEvent instance */ protected void onNonHtmlContent(final NonHtmlContentEvent event) { } @@ -380,7 +383,7 @@ protected void onNonHtmlContent(final NonHtmlContentEvent event) { /** * Callback which gets called when a request error occurs. * - * @param event The {@link RequestErrorEvent} instance + * @param event the RequestErrorEvent instance */ protected void onRequestError(final RequestErrorEvent event) { } @@ -388,7 +391,7 @@ protected void onRequestError(final RequestErrorEvent event) { /** * Callback which gets called when a request is redirected. * - * @param event The {@link RequestRedirectEvent} instance + * @param event the RequestRedirectEvent instance */ protected void onRequestRedirect(final RequestRedirectEvent event) { } @@ -397,7 +400,7 @@ protected void onRequestRedirect(final RequestRedirectEvent event) { * Callback which gets called when the page does not load in the browser * within the timeout period. * - * @param event The {@link PageLoadTimeoutEvent} instance + * @param event the PageLoadTimeoutEvent instance */ protected void onPageLoadTimeout(final PageLoadTimeoutEvent event) { } diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java index 43571c8..c219c0e 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java @@ -21,7 +21,7 @@ import java.util.Optional; /** - * Represents a candidate to be crawled by the crawler. + * Represents a candidate for crawling. * * @author Peter Bencze */ @@ -40,59 +40,59 @@ private CrawlCandidate(final CrawlCandidateBuilder builder) { /** * Returns the referer URL. * - * @return The URL of the referer + * @return the URL of the referer */ public URI getRefererUrl() { return refererUrl; } /** - * Returns the candidate URL. + * Returns the request URL. * - * @return The URL of the candidate + * @return the URL of the request */ public URI getCandidateUrl() { return crawlRequest.getRequestUrl(); } /** - * Returns the domain of the candidate URL. + * Returns the domain of the request URL. 
* - * @return The domain of the candidate URL + * @return the domain of the request URL */ public InternetDomainName getDomain() { return crawlRequest.getDomain(); } /** - * Returns the crawl depth of the candidate. + * Returns the crawl depth of the request. * - * @return The crawl depth of the candidate + * @return the crawl depth of the request */ public int getCrawlDepth() { return crawlDepth; } /** - * Returns the priority of the candidate. + * Returns the priority of the request. * - * @return The priority of the candidate + * @return the priority of the request */ public int getPriority() { return crawlRequest.getPriority(); } /** - * Returns the metadata associated with the candidate. + * Returns the metadata associated with the request. * - * @return The metadata associated with the candidate + * @return the metadata associated with the request */ public Optional getMetadata() { return crawlRequest.getMetadata(); } /** - * Builds crawl candidates to be crawled by the crawler. + * Builds {@link CrawlCandidate} instances. */ public static final class CrawlCandidateBuilder { @@ -104,7 +104,7 @@ public static final class CrawlCandidateBuilder { /** * Creates a {@link CrawlCandidateBuilder} instance. * - * @param request The {@link CrawlRequest} instance from which this + * @param request the CrawlRequest instance from which this * candidate is built */ public CrawlCandidateBuilder(final CrawlRequest request) { @@ -114,8 +114,8 @@ public CrawlCandidateBuilder(final CrawlRequest request) { /** * Sets the referer URL. * - * @param refererUrl The referer URL - * @return The {@link CrawlCandidateBuilder} instance + * @param refererUrl the referer URL + * @return the CrawlCandidateBuilder instance */ public CrawlCandidateBuilder setRefererUrl(final URI refererUrl) { this.refererUrl = refererUrl; @@ -123,10 +123,10 @@ public CrawlCandidateBuilder setRefererUrl(final URI refererUrl) { } /** - * Sets the crawl depth of the candidate. + * Sets the crawl depth of the request. * - * @param crawlDepth The crawl depth of the candidate - * @return The {@link CrawlCandidateBuilder} instance + * @param crawlDepth the crawl depth of the request + * @return the CrawlCandidateBuilder instance */ public CrawlCandidateBuilder setCrawlDepth(final int crawlDepth) { this.crawlDepth = crawlDepth; @@ -134,9 +134,9 @@ public CrawlCandidateBuilder setCrawlDepth(final int crawlDepth) { } /** - * Builds the configured {@link CrawlCandidate} instance. + * Builds the configured CrawlCandidate instance. * - * @return The configured {@link CrawlCandidate} instance + * @return the configured CrawlCandidate instance */ public CrawlCandidate build() { return new CrawlCandidate(this); diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java index 0c10e7b..a7bc47b 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java @@ -16,7 +16,8 @@ package com.github.peterbencze.serritor.api; /** - * Available crawl delay strategies that can be used by the crawler. + * Available crawl delay strategies which define how the delay between each + * request is determined. 
* * @author Peter Bencze */ diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java index 3f0f7a7..08038f2 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java @@ -47,7 +47,7 @@ private CrawlRequest(final CrawlRequestBuilder builder) { /** * Returns the request URL. * - * @return The request URL + * @return the request URL */ public URI getRequestUrl() { return requestUrl; @@ -56,7 +56,7 @@ public URI getRequestUrl() { /** * Returns the domain of the request URL. * - * @return The domain of the request URL + * @return the domain of the request URL */ public InternetDomainName getDomain() { return domain; @@ -65,7 +65,7 @@ public InternetDomainName getDomain() { /** * Returns the priority of the request. * - * @return The priority of the request + * @return the priority of the request */ public int getPriority() { return priority; @@ -74,14 +74,14 @@ public int getPriority() { /** * Returns the metadata associated with the request. * - * @return The metadata associated with the request + * @return the metadata associated with the request */ public Optional getMetadata() { return Optional.ofNullable(metadata); } /** - * Builds crawl requests which can be fed to the crawler. + * Builds {@link CrawlRequest} instances. */ public static final class CrawlRequestBuilder { @@ -96,7 +96,7 @@ public static final class CrawlRequestBuilder { /** * Creates a {@link CrawlRequestBuilder} instance. * - * @param requestUrl The request URL + * @param requestUrl the request URL */ public CrawlRequestBuilder(final URI requestUrl) { this.requestUrl = requestUrl; @@ -111,7 +111,7 @@ public CrawlRequestBuilder(final URI requestUrl) { /** * Creates a {@link CrawlRequestBuilder} instance. * - * @param requestUrl The request URL + * @param requestUrl the request URL */ public CrawlRequestBuilder(final String requestUrl) { this(URI.create(requestUrl)); @@ -120,9 +120,9 @@ public CrawlRequestBuilder(final String requestUrl) { /** * Sets the priority of the request. * - * @param priority The priority of the request (higher number means + * @param priority the priority of the request (higher number means * higher priority) - * @return The {@link CrawlRequestBuilder} instance + * @return the CrawlRequestBuilder instance */ public CrawlRequestBuilder setPriority(final int priority) { this.priority = priority; @@ -130,11 +130,10 @@ public CrawlRequestBuilder setPriority(final int priority) { } /** - * Sets the metadata of the request which can be later accessed when the - * crawler completed the request. + * Sets the metadata associated with the request. * - * @param metadata The metadata associated with the request - * @return The {@link CrawlRequestBuilder} instance + * @param metadata the metadata associated with the request + * @return the CrawlRequestBuilder instance */ public CrawlRequestBuilder setMetadata(final Serializable metadata) { this.metadata = Validate.notNull(metadata, "The metadata cannot be null."); @@ -142,9 +141,9 @@ public CrawlRequestBuilder setMetadata(final Serializable metadata) { } /** - * Builds the configured {@link CrawlRequest} instance. + * Builds the configured CrawlRequest instance. 
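
A hypothetical request built with a priority and attached metadata, as described by the builder above; the URL and the metadata value are made up for the example.

    import com.github.peterbencze.serritor.api.CrawlRequest;
    import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;

    final class CrawlRequestExample {

        static CrawlRequest createProductPageRequest() {
            return new CrawlRequestBuilder("http://example.com/products")
                    // Higher number means higher priority
                    .setPriority(10)
                    // Any Serializable value can be attached and read back via getMetadata()
                    .setMetadata("product-listing")
                    .build();
        }
    }
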
* - * @return The configured {@link CrawlRequest} instance + * @return the configured CrawlRequest instance */ public CrawlRequest build() { return new CrawlRequest(this); diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java index c88435b..c505932 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java @@ -16,7 +16,8 @@ package com.github.peterbencze.serritor.api; /** - * Available strategies that can be used while crawling. + * Available crawl strategies that define the order in which crawl requests are + * processed. * * @author Peter Bencze */ diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java index 8cdaa71..bcc2d25 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java @@ -25,7 +25,7 @@ import org.apache.commons.lang3.Validate; /** - * This class contains the settings of the crawler. + * Contains the settings of the crawler. * * @author Peter Bencze */ @@ -58,7 +58,7 @@ private CrawlerConfiguration(final CrawlerConfigurationBuilder builder) { /** * Returns the set of allowed crawl domains. * - * @return The set of allowed crawl domains + * @return the set of allowed crawl domains */ public Set getAllowedCrawlDomains() { return allowedCrawlDomains; @@ -67,7 +67,7 @@ public Set getAllowedCrawlDomains() { /** * Returns the set of crawl seeds. * - * @return The set of crawl seeds + * @return the set of crawl seeds */ public Set getCrawlSeeds() { return crawlSeeds; @@ -76,14 +76,14 @@ public Set getCrawlSeeds() { /** * Returns the crawl strategy of the crawler. * - * @return The crawl strategy + * @return the crawl strategy of the crawler */ public CrawlStrategy getCrawlStrategy() { return crawlStrategy; } /** - * Indicates if duplicate request filtering is enabled or not. + * Indicates if duplicate request filtering is enabled. * * @return true if enabled, false otherwise */ @@ -92,7 +92,7 @@ public boolean isDuplicateRequestFilteringEnabled() { } /** - * Indicates if offsite request filtering is enabled or not. + * Indicates if offsite request filtering is enabled. * * @return true if enabled, false otherwise */ @@ -101,18 +101,18 @@ public boolean isOffsiteRequestFilteringEnabled() { } /** - * Returns the maximum possible crawl depth. + * Returns the maximum crawl depth. * - * @return The maximum crawl depth + * @return the maximum crawl depth */ public int getMaximumCrawlDepth() { return maxCrawlDepth; } /** - * Returns the crawl delay strategy used by the crawler. + * Returns the crawl delay strategy of the crawler. * - * @return The crawl delay strategy + * @return the crawl delay strategy of the crawler */ public CrawlDelayStrategy getCrawlDelayStrategy() { return crawlDelayStrategy; @@ -121,7 +121,7 @@ public CrawlDelayStrategy getCrawlDelayStrategy() { /** * Returns the exact duration of delay between each request. * - * @return The duration of delay in milliseconds + * @return the duration of delay in milliseconds */ public long getFixedCrawlDelayDurationInMillis() { return fixedCrawlDelayDurationInMillis; @@ -130,7 +130,7 @@ public long getFixedCrawlDelayDurationInMillis() { /** * Returns the minimum duration of delay between each request. 
* - * @return The minimum duration of delay in milliseconds + * @return the minimum duration of delay in milliseconds */ public long getMinimumCrawlDelayDurationInMillis() { return minCrawlDelayDurationInMillis; @@ -139,12 +139,15 @@ public long getMinimumCrawlDelayDurationInMillis() { /** * Returns the maximum duration of delay between each request. * - * @return The maximum duration of delay in milliseconds + * @return the maximum duration of delay in milliseconds */ public long getMaximumCrawlDelayDurationInMillis() { return maxCrawlDelayDurationInMillis; } + /** + * Builds {@link CrawlerConfiguration} instances. + */ public static final class CrawlerConfigurationBuilder { private static final CrawlStrategy DEFAULT_CRAWL_STRATEGY = CrawlStrategy.BREADTH_FIRST; @@ -168,6 +171,9 @@ public static final class CrawlerConfigurationBuilder { private long minCrawlDelayDurationInMillis; private long maxCrawlDelayDurationInMillis; + /** + * Creates a {@link CrawlerConfigurationBuilder} instance. + */ public CrawlerConfigurationBuilder() { // Initialize with default values allowedCrawlDomains = new HashSet<>(); @@ -185,8 +191,8 @@ public CrawlerConfigurationBuilder() { /** * Appends an internet domain to the list of allowed crawl domains. * - * @param allowedCrawlDomain A well-formed internet domain name - * @return The CrawlerConfigurationBuilder instance + * @param allowedCrawlDomain a well-formed internet domain name + * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder addAllowedCrawlDomain(final String allowedCrawlDomain) { InternetDomainName domain = InternetDomainName.from(allowedCrawlDomain); @@ -201,9 +207,9 @@ public CrawlerConfigurationBuilder addAllowedCrawlDomain(final String allowedCra * Appends a list of internet domains to the list of allowed crawl * domains. * - * @param allowedCrawlDomains A list of well-formed internet domain + * @param allowedCrawlDomains a list of well-formed internet domain * names - * @return The CrawlerConfigurationBuilder instance + * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder addAllowedCrawlDomains(final List allowedCrawlDomains) { allowedCrawlDomains.forEach(this::addAllowedCrawlDomain); @@ -213,9 +219,8 @@ public CrawlerConfigurationBuilder addAllowedCrawlDomains(final List all /** * Appends a crawl request to the set of crawl seeds. * - * @param request The CrawlRequest instance which - * represents the crawl seed - * @return The CrawlerConfigurationBuilder instance + * @param request the crawl request which represents a crawl seed + * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder addCrawlSeed(final CrawlRequest request) { Validate.notNull(request, "The request cannot be null."); @@ -227,9 +232,9 @@ public CrawlerConfigurationBuilder addCrawlSeed(final CrawlRequest request) { /** * Appends a list of crawl requests to the set of crawl seeds. * - * @param requests The list of CrawlRequest instances which - * represent the crawl seeds - * @return The CrawlerConfigurationBuilder instance + * @param requests the list of crawl requests which represent crawl + * seeds + * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder addCrawlSeeds(final List requests) { requests.forEach(this::addCrawlSeed); @@ -241,8 +246,8 @@ public CrawlerConfigurationBuilder addCrawlSeeds(final List reques * strategy orders crawl requests by the lowest crawl depth, whereas * depth-first orders them by the highest crawl depth. 
* - * @param strategy The crawl strategy - * @return The CrawlerConfigurationBuilder instance + * @param strategy the crawl strategy + * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder setCrawlStrategy(final CrawlStrategy strategy) { Validate.notNull(strategy, "The strategy cannot be null."); @@ -256,7 +261,7 @@ public CrawlerConfigurationBuilder setCrawlStrategy(final CrawlStrategy strategy * * @param filterDuplicateRequests true means enabled, * false means disabled - * @return The CrawlerConfigurationBuilder instance + * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder setDuplicateRequestFiltering(final boolean filterDuplicateRequests) { this.filterDuplicateRequests = filterDuplicateRequests; @@ -268,7 +273,7 @@ public CrawlerConfigurationBuilder setDuplicateRequestFiltering(final boolean fi * * @param filterOffsiteRequests true means enabled, * false means disabled - * @return The CrawlerConfigurationBuilder instance + * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder setOffsiteRequestFiltering(final boolean filterOffsiteRequests) { this.filterOffsiteRequests = filterOffsiteRequests; @@ -276,11 +281,11 @@ public CrawlerConfigurationBuilder setOffsiteRequestFiltering(final boolean filt } /** - * Sets the maximum possible crawl depth. It should be a non-negative - * number where 0 means there is no limit. + * Sets the maximum crawl depth. It should be a non-negative + * number (0 means no limit). * - * @param maxCrawlDepth The maximum crawl depth - * @return The CrawlerConfigurationBuilder instance + * @param maxCrawlDepth the maximum crawl depth + * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder setMaximumCrawlDepth(final int maxCrawlDepth) { Validate.isTrue(maxCrawlDepth >= 0, "The maximum crawl depth cannot be negative."); @@ -290,10 +295,11 @@ public CrawlerConfigurationBuilder setMaximumCrawlDepth(final int maxCrawlDepth) } /** - * Sets the crawl delay strategy to be used by the crawler. + * Sets the crawl delay strategy to be used by the crawler. This + * strategy defines how the delay between each request is determined. * - * @param strategy The crawl delay strategy - * @return The CrawlerConfigurationBuilder instance + * @param strategy the crawl delay strategy + * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder setCrawlDelayStrategy(final CrawlDelayStrategy strategy) { Validate.notNull(strategy, "The strategy cannot be null."); @@ -305,8 +311,8 @@ public CrawlerConfigurationBuilder setCrawlDelayStrategy(final CrawlDelayStrateg /** * Sets the exact duration of delay between each request. * - * @param fixedCrawlDelayDuration The duration of delay - * @return The CrawlerConfigurationBuilder instance + * @param fixedCrawlDelayDuration the duration of delay + * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { Validate.notNull(fixedCrawlDelayDuration, "The duration cannot be null."); @@ -318,8 +324,8 @@ public CrawlerConfigurationBuilder setFixedCrawlDelayDuration(final Duration fix /** * Sets the minimum duration of delay between each request. 
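
Putting the configuration options documented above together, a configuration might be built as in this sketch. The domain, seed URL, depth limit, and delay duration are example values only; the delay duration applies when the fixed crawl delay strategy is in effect.

    import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
    import com.github.peterbencze.serritor.api.CrawlStrategy;
    import com.github.peterbencze.serritor.api.CrawlerConfiguration;
    import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder;
    import java.time.Duration;

    final class ConfigurationExample {

        static CrawlerConfiguration createConfiguration() {
            return new CrawlerConfigurationBuilder()
                    .addAllowedCrawlDomain("example.com")
                    .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build())
                    // Process crawl requests in breadth-first order (the default)
                    .setCrawlStrategy(CrawlStrategy.BREADTH_FIRST)
                    // Skip already crawled URLs and URLs outside the allowed domains
                    .setDuplicateRequestFiltering(true)
                    .setOffsiteRequestFiltering(true)
                    // 0 would mean no depth limit
                    .setMaximumCrawlDepth(3)
                    // One second delay between requests
                    .setFixedCrawlDelayDuration(Duration.ofSeconds(1))
                    .build();
        }
    }
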
* - * @param minCrawlDelayDuration The minimum duration of delay - * @return The CrawlerConfigurationBuilder instance + * @param minCrawlDelayDuration the minimum duration of delay + * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { Validate.notNull(minCrawlDelayDuration, "The duration cannot be null."); @@ -336,8 +342,8 @@ public CrawlerConfigurationBuilder setMinimumCrawlDelayDuration(final Duration m /** * Sets the maximum duration of delay between each request. * - * @param maxCrawlDelayDuration The maximum duration of delay - * @return The CrawlerConfigurationBuilder instance + * @param maxCrawlDelayDuration the maximum duration of delay + * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) { Validate.notNull(maxCrawlDelayDuration, "The duration cannot be null."); @@ -353,7 +359,7 @@ public CrawlerConfigurationBuilder setMaximumCrawlDelayDuration(final Duration m /** * Builds the configured CrawlerConfiguration instance. * - * @return The configured CrawlerConfiguration instance + * @return the configured CrawlerConfiguration instance */ public CrawlerConfiguration build() { return new CrawlerConfiguration(this); diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java index 8347654..33946e2 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -34,8 +34,7 @@ import org.openqa.selenium.WebElement; /** - * A helper class which can be used to find URLs in HTML sources using regular - * expressions. + * Finds URLs in HTML page sources using regular expressions. * * @author Peter Bencze */ @@ -56,8 +55,8 @@ private UrlFinder(final UrlFinderBuilder builder) { /** * Returns a list of validated URLs found in the page's HTML source. * - * @param event the {@link PageLoadEvent} instance - * @return the list of found URLs in the page's HTML source + * @param event the PageLoadEvent instance + * @return the list of found URLs */ public List findUrlsInPage(final PageLoadEvent event) { Set foundUrls = new HashSet<>(); @@ -85,8 +84,8 @@ public List findUrlsInPage(final PageLoadEvent event) { /** * Returns a list of validated URLs found in the attribute's value. * - * @param attributeValue The value of the attribute - * @return The list of found URLs + * @param attributeValue the value of the attribute + * @return the list of found URLs */ private List findUrlsInAttributeValue(final String attributeValue) { List foundUrls = new ArrayList<>(); @@ -106,8 +105,11 @@ private List findUrlsInAttributeValue(final String attributeValue) { return foundUrls; } + /** + * Builds {@link UrlFinder} instances. + */ public static final class UrlFinderBuilder { - + private static final Set DEFAULT_LOCATING_MECHANISMS = Sets.newHashSet(By.tagName("a")); private static final Set DEFAULT_ATTRIBUTES = Sets.newHashSet("href"); private static final Predicate DEFAULT_VALIDATOR = UrlFinderBuilder::isValidUrl; @@ -119,21 +121,18 @@ public static final class UrlFinderBuilder { private Predicate validator; /** - * Constructs a UrlFinderBuilder instance that can be used - * to create UrlFinder instances. + * Creates a {@link UrlFinderBuilder} instance. 
* - * @param urlPattern The pattern which will be used to find URLs + * @param urlPattern the pattern to use to find URLs */ public UrlFinderBuilder(final Pattern urlPattern) { this(Arrays.asList(urlPattern)); } /** - * Constructs a UrlFinderBuilder instance that can be used - * to create UrlFinder instances. It + * Creates a {@link UrlFinderBuilder} instance. * - * @param urlPatterns The list of patterns which will be used to find - * URLs + * @param urlPatterns the list of patterns to use to find URLs */ public UrlFinderBuilder(final List urlPatterns) { Validate.noNullElements(urlPatterns, "URL patterns cannot be null."); @@ -148,9 +147,9 @@ public UrlFinderBuilder(final List urlPatterns) { * Sets the locating mechanism used by the finder. Only elements matched * by the locator will be considered when searching for URLs. * - * @param locatingMechanism The By locating mechanism + * @param locatingMechanism the By locating mechanism * instance - * @return The UrlFinderBuilder instance + * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setLocatingMechanism(final By locatingMechanism) { return setLocatingMechanisms(Arrays.asList(locatingMechanism)); @@ -160,9 +159,9 @@ public UrlFinderBuilder setLocatingMechanism(final By locatingMechanism) { * Sets the locating mechanisms used by the finder. Only elements * matched by the locators will be considered when searching for URLs. * - * @param locatingMechanisms The list of By locating + * @param locatingMechanisms the list of By locating * mechanism instances - * @return The UrlFinderBuilder instance + * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setLocatingMechanisms(final List locatingMechanisms) { Validate.noNullElements(locatingMechanisms, "Locating mechanisms cannot be null."); @@ -172,10 +171,10 @@ public UrlFinderBuilder setLocatingMechanisms(final List locatingMechanisms) } /** - * Sets which attributes to search for URLs. + * Sets the list of attribute names to search for URLs. * - * @param attributes The list of attribute names - * @return The UrlFinderBuilder instance + * @param attributes the list of attribute names + * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setAttributes(final List attributes) { Validate.noNullElements(attributes, "Attributes cannot be null."); @@ -185,10 +184,10 @@ public UrlFinderBuilder setAttributes(final List attributes) { } /** - * Sets which attribute to search for URLs. + * Sets the attribute name to search for URLs. * - * @param attribute The name of the attribute - * @return The UrlFinderBuilder instance + * @param attribute the attribute name + * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setAttribute(final String attribute) { return setAttributes(Arrays.asList(attribute)); @@ -197,8 +196,8 @@ public UrlFinderBuilder setAttribute(final String attribute) { /** * Sets a predicate to be used for validating found URLs. * - * @param validator The validator predicate - * @return The UrlFinderBuilder instance + * @param validator the validator predicate + * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setValidator(final Predicate validator) { Validate.notNull(validator, "The validator function cannot be null."); @@ -208,9 +207,9 @@ public UrlFinderBuilder setValidator(final Predicate validator) { } /** - * Builds the configured URL finder. + * Builds the configured UrlFinder instance. 
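
A typical use of UrlFinder is to feed the URLs found on a loaded page back to the crawler from the onPageLoad callback, roughly as sketched below. The class name and the URL pattern are illustrative; the found URLs are returned as strings and can be turned into new crawl requests.

    import com.github.peterbencze.serritor.api.BaseCrawler;
    import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
    import com.github.peterbencze.serritor.api.CrawlerConfiguration;
    import com.github.peterbencze.serritor.api.event.PageLoadEvent;
    import com.github.peterbencze.serritor.api.helper.UrlFinder;
    import com.github.peterbencze.serritor.api.helper.UrlFinder.UrlFinderBuilder;
    import java.util.regex.Pattern;

    public final class LinkFollowingCrawler extends BaseCrawler {

        // Only URLs matching this pattern are extracted from loaded pages
        private final UrlFinder urlFinder =
                new UrlFinderBuilder(Pattern.compile("https?://example\\.com/.*")).build();

        public LinkFollowingCrawler(final CrawlerConfiguration config) {
            super(config);
        }

        @Override
        protected void onPageLoad(final PageLoadEvent event) {
            // Extract matching URLs from the page and feed them back to the crawler
            urlFinder.findUrlsInPage(event).stream()
                    .map(url -> new CrawlRequestBuilder(url).build())
                    .forEach(this::crawl);
        }
    }
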
* - * @return The configured UrlFinder instance + * @return the configured UrlFinder instance */ public UrlFinder build() { return new UrlFinder(this); @@ -219,7 +218,7 @@ public UrlFinder build() { /** * The default URL validator function. * - * @param url The URL to be validated + * @param url the URL to validate * @return true if the URL is valid, false * otherwise */ diff --git a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java index dfedfdb..06b0df1 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java @@ -32,12 +32,12 @@ public final class AdaptiveCrawlDelayMechanism implements CrawlDelayMechanism { private final JavascriptExecutor jsExecutor; /** - * Constructs a new AdaptiveCrawlDelayMechanism instance. + * Creates an {@link AdaptiveCrawlDelayMechanism} instance. * - * @param config The CrawlerConfiguration instance which - * specifies the minimum and maximum delay. - * @param jsExecutor The WebDriver instance which is capable of - * executing JavaScript. + * @param config the crawler configuration which specifies the minimum and + * maximum delay + * @param jsExecutor the {@link WebDriver} instance which is capable of + * executing JavaScript */ public AdaptiveCrawlDelayMechanism(final CrawlerConfiguration config, final JavascriptExecutor jsExecutor) { minDelayInMillis = config.getMinimumCrawlDelayDurationInMillis(); @@ -61,7 +61,7 @@ public boolean isBrowserCompatible() { * than the minimum, it returns the minimum delay. If the calculated delay * is higher than the maximum, it returns the maximum delay. * - * @return The delay in milliseconds + * @return the delay in milliseconds */ @Override public long getDelay() { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java index 34317b1..cbb4634 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java @@ -16,16 +16,16 @@ package com.github.peterbencze.serritor.internal; /** - * An interface that every crawl delay mechanism should implement. - * + * An interface which should be implemented by every crawl delay mechanism. + * * @author Peter Bencze */ public interface CrawlDelayMechanism { - + /** - * Returns the delay that should pass between each request. - * - * @return The duration of delay in milliseconds + * Returns the delay which should pass between each request. + * + * @return the duration of delay in milliseconds */ long getDelay(); } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java index 89bba42..7fb007e 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java @@ -29,19 +29,19 @@ public final class CrawlDomain implements Serializable { private final ImmutableList parts; /** - * Constructs a new CrawlDomain instance. + * Creates a CrawlDomain instance. 
* - * @param domain An immutable well-formed internet domain name + * @param domain an immutable well-formed internet domain name */ public CrawlDomain(final InternetDomainName domain) { parts = domain.parts(); } /** - * Indicates if two CrawlDomain instances are equal or not. + * Indicates if two CrawlDomain instances are equal. * Crawl domains with the same domain name are considered equal. * - * @param obj A CrawlDomain instance + * @param obj a CrawlDomain instance * @return true if equal, false otherwise */ @Override @@ -62,7 +62,7 @@ public boolean equals(final Object obj) { * Calculates the hash code from the individual components of the domain * name. * - * @return The hash code for the crawl domain + * @return the hash code for the crawl domain */ @Override public int hashCode() { @@ -72,7 +72,7 @@ public int hashCode() { /** * Indicates if this crawl domain contains the specific internet domain. * - * @param domain An immutable well-formed internet domain name + * @param domain an immutable well-formed internet domain name * @return true if belongs, false otherwise */ public boolean contains(final InternetDomainName domain) { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index c38cd3e..af7c839 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -16,9 +16,9 @@ package com.github.peterbencze.serritor.internal; import com.github.peterbencze.serritor.api.CrawlCandidate; -import com.github.peterbencze.serritor.api.CrawlerConfiguration; -import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlCandidate.CrawlCandidateBuilder; +import com.github.peterbencze.serritor.api.CrawlRequest; +import com.github.peterbencze.serritor.api.CrawlerConfiguration; import java.io.Serializable; import java.net.URI; import java.util.Arrays; @@ -32,8 +32,7 @@ import org.apache.commons.codec.digest.DigestUtils; /** - * Provides an interface for the crawler to manage crawl requests while - * crawling. + * Manages crawl requests and provides crawl candidates to the crawler. * * @author Peter Bencze */ @@ -48,11 +47,16 @@ public final class CrawlFrontier implements Serializable { private CrawlCandidate currentCandidate; + /** + * Creates a {@link CrawlFrontier} instance. + * + * @param config the crawler configuration + */ public CrawlFrontier(final CrawlerConfiguration config) { this.config = config; allowedCrawlDomains = config.getAllowedCrawlDomains(); - + urlFingerprints = new HashSet<>(); // Construct a priority queue according to the crawl strategy specified in the configuration @@ -68,23 +72,22 @@ public CrawlFrontier(final CrawlerConfiguration config) { /** * Feeds a crawl request to the frontier. 
* - * @param request The CrawlRequest instance to be fed - * @param isCrawlSeed true if the request is a crawl seed, - * false otherwise + * @param request the crawl request + * @param isCrawlSeed indicates if the request is a crawl seed */ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { if (config.isOffsiteRequestFilteringEnabled()) { // Check if the request's domain is in the allowed crawl domains - + boolean inCrawlDomain = false; - + for (CrawlDomain allowedCrawlDomain : allowedCrawlDomains) { if (allowedCrawlDomain.contains(request.getDomain())) { inCrawlDomain = true; break; } } - + if (!inCrawlDomain) { return; } @@ -92,10 +95,9 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { if (config.isDuplicateRequestFilteringEnabled()) { // Check if the URL has already been crawled - + String urlFingerprint = createFingerprintForUrl(request.getRequestUrl()); - if (urlFingerprints.contains(urlFingerprint)) { return; } @@ -135,9 +137,9 @@ public boolean hasNextCandidate() { } /** - * Gets the next candidate from the queue. + * Returns the next crawl candidate from the queue. * - * @return The next CrawlCandidate instance + * @return the next crawl candidate from the queue */ public CrawlCandidate getNextCandidate() { currentCandidate = candidates.poll(); @@ -147,11 +149,11 @@ public CrawlCandidate getNextCandidate() { /** * Creates the fingerprint of the given URL. * - * @param url The URL that the fingerprint will be created for - * @return The fingerprint of the URL + * @param url the URL for which the fingerprint is created + * @return the fingerprint of the URL */ private static String createFingerprintForUrl(final URI url) { - // First, we start off with the host only + // We start off with the host only StringBuilder truncatedUrl = new StringBuilder(url.getHost()); // If there is a path in the URL, we append it after the host @@ -174,15 +176,15 @@ private static String createFingerprintForUrl(final URI url) { .forEachOrdered(truncatedUrl::append); } - // Finally, create the SHA-256 hash return DigestUtils.sha256Hex(truncatedUrl.toString()); } /** - * Creates a new priority queue using the specified strategy. + * Creates a priority queue using the strategy specified in the + * configuration. * - * @return The PriorityQueue instance for crawl requests using - * the given comparator + * @return the priority queue using the strategy specified in the + * configuration */ private PriorityQueue createPriorityQueue() { switch (config.getCrawlStrategy()) { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java index a3f84c8..2dcfa95 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java @@ -28,10 +28,10 @@ public final class FixedCrawlDelayMechanism implements CrawlDelayMechanism { private final long delayInMillis; /** - * Constructs a new FixedCrawlDelayMechanism instance. + * Creates a {@link FixedCrawlDelayMechanism} instance. * - * @param config The CrawlerConfiguration instance which - * specifies the fixed delay duration. 
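The fingerprinting walked through above normalises a URL to "host + path + alphabetically sorted query parameters" before hashing, so that two URLs differing only in parameter order collapse to the same fingerprint (for example http://example.com/search?b=2&a=1 and http://example.com/search?a=1&b=2). A condensed restatement of that idea follows; it is illustrative and may differ in detail from the private createFingerprintForUrl method.

    import java.net.URI;
    import java.util.Arrays;
    import org.apache.commons.codec.digest.DigestUtils;

    final class FingerprintSketch {

        // Builds the canonical form of the URL and hashes it with SHA-256.
        static String fingerprint(final URI url) {
            StringBuilder canonical = new StringBuilder(url.getHost());
            if (url.getPath() != null) {
                canonical.append(url.getPath());
            }
            if (url.getQuery() != null) {
                Arrays.stream(url.getQuery().split("&"))
                        .sorted()
                        .forEachOrdered(canonical::append);
            }
            return DigestUtils.sha256Hex(canonical.toString());
        }
    }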
+ * @param config the crawler configuration which specifies the fixed delay + * duration */ public FixedCrawlDelayMechanism(final CrawlerConfiguration config) { this.delayInMillis = config.getFixedCrawlDelayDurationInMillis(); @@ -40,7 +40,7 @@ public FixedCrawlDelayMechanism(final CrawlerConfiguration config) { /** * Returns the fixed delay specified in the configuration. * - * @return The delay in milliseconds + * @return the delay in milliseconds */ @Override public long getDelay() { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java index f8a7446..6353884 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java @@ -30,10 +30,10 @@ public final class RandomCrawlDelayMechanism implements CrawlDelayMechanism { private final long upperLimit; /** - * Constructs a new RandomCrawlDelayMechanism instance. + * Creates a {@link RandomCrawlDelayMechanism} instance. * - * @param config The CrawlerConfiguration instance which - * specifies the minimum and maximum delay. + * @param config the crawler configuration which specifies the minimum and + * maximum delay. */ public RandomCrawlDelayMechanism(final CrawlerConfiguration config) { lowerLimit = config.getMinimumCrawlDelayDurationInMillis(); @@ -44,7 +44,7 @@ public RandomCrawlDelayMechanism(final CrawlerConfiguration config) { * Returns a random delay between the minimum and maximum range specified in * the configuration. * - * @return The delay in milliseconds + * @return the delay in milliseconds */ @Override public long getDelay() { diff --git a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java index 9412b5b..1758b17 100644 --- a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java +++ b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java @@ -29,50 +29,50 @@ import org.openqa.selenium.WebElement; /** - * Test cases for UrlFinder. + * Test cases for {@link UrlFinder}. 
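The random strategy above simply draws a uniform value between the configured minimum and maximum. A one-method sketch; whether the class treats the maximum as inclusive is not visible in this hunk, so the sketch assumes it is.

    import java.util.concurrent.ThreadLocalRandom;

    final class RandomDelaySketch {

        // Returns a uniformly random delay in [minDelayInMillis, maxDelayInMillis].
        static long randomDelay(final long minDelayInMillis, final long maxDelayInMillis) {
            return ThreadLocalRandom.current().nextLong(minDelayInMillis, maxDelayInMillis + 1);
        }
    }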
* * @author Peter Bencze */ public final class UrlFinderTest { - + private static final Pattern URL_PATTERN = Pattern.compile(".+valid-url.+"); private static final String ATTRIBUTE = "href"; private static final String TAG_NAME = "a"; private static final String VALID_URL = "http://valid-url.com"; private static final String INVALID_URL = "invalid-url"; private static final String URL_WITH_INVALID_DOMAIN = "http://invalid.domain"; - + private WebDriver mockedDriver; private PageLoadEvent mockedEvent; private WebElement mockedElementWithValidUrl; private WebElement mockedElementWithInvalidUrlFormat; - private WebElement mockedElementWithInvalidDomain; + private WebElement mockedElementWithInvalidDomain; private UrlFinder urlFinder; @Before public void initialize() { mockedEvent = Mockito.mock(PageLoadEvent.class); - + mockedDriver = Mockito.mock(WebDriver.class); Mockito.when(mockedEvent.getWebDriver()) - .thenReturn(mockedDriver); - + .thenReturn(mockedDriver); + mockedElementWithValidUrl = Mockito.mock(WebElement.class); Mockito.when(mockedElementWithValidUrl.getAttribute(Mockito.eq(ATTRIBUTE))) - .thenReturn(VALID_URL); - + .thenReturn(VALID_URL); + mockedElementWithInvalidUrlFormat = Mockito.mock(WebElement.class); Mockito.when(mockedElementWithInvalidUrlFormat.getAttribute(Mockito.eq(ATTRIBUTE))) - .thenReturn(INVALID_URL); - + .thenReturn(INVALID_URL); + mockedElementWithInvalidDomain = Mockito.mock(WebElement.class); Mockito.when(mockedElementWithInvalidDomain.getAttribute(Mockito.eq(ATTRIBUTE))) .thenReturn(URL_WITH_INVALID_DOMAIN); - List elementList = Arrays.asList(mockedElementWithValidUrl, mockedElementWithInvalidUrlFormat, mockedElementWithInvalidDomain); + List elementList = Arrays.asList(mockedElementWithValidUrl, mockedElementWithInvalidUrlFormat, mockedElementWithInvalidDomain); Mockito.when(mockedDriver.findElements(By.tagName(TAG_NAME))) .thenReturn(elementList); - + urlFinder = new UrlFinderBuilder(URL_PATTERN).build(); } diff --git a/src/test/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanismTest.java b/src/test/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanismTest.java index 60d5b3e..aef99c0 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanismTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanismTest.java @@ -15,7 +15,6 @@ */ package com.github.peterbencze.serritor.internal; - import com.github.peterbencze.serritor.api.CrawlerConfiguration; import java.time.Duration; import org.junit.Assert; @@ -25,61 +24,61 @@ import org.openqa.selenium.JavascriptExecutor; /** - * Test cases for AdaptiveCrawlDelayMechanism. - * + * Test cases for {@link AdaptiveCrawlDelayMechanism}. 
+ * * @author Peter Bencze */ public final class AdaptiveCrawlDelayMechanismTest { - + private static final long LOWER_DELAY_DURATION_IN_MILLIS = Duration.ZERO.toMillis(); - private static final long MINIMUM_DELAY_DURATION_IN_MILLIS = Duration.ofSeconds(1).toMillis(); + private static final long MINIMUM_DELAY_DURATION_IN_MILLIS = Duration.ofSeconds(1).toMillis(); private static final long IN_RANGE_DELAY_DURATION_IN_MILLIS = Duration.ofSeconds(2).toMillis(); private static final long MAXIMUM_DELAY_DURATION_IN_MILLIS = Duration.ofSeconds(3).toMillis(); private static final long HIGHER_DELAY_DURATION_IN_MILLIS = Duration.ofSeconds(4).toMillis(); - + private CrawlerConfiguration mockedConfig; - private JavascriptExecutor mockedJsExecutor; + private JavascriptExecutor mockedJsExecutor; private AdaptiveCrawlDelayMechanism crawlDelayMechanism; - + @Before public void initialize() { mockedConfig = Mockito.mock(CrawlerConfiguration.class); Mockito.when(mockedConfig.getMinimumCrawlDelayDurationInMillis()) - .thenReturn(MINIMUM_DELAY_DURATION_IN_MILLIS); + .thenReturn(MINIMUM_DELAY_DURATION_IN_MILLIS); Mockito.when(mockedConfig.getMaximumCrawlDelayDurationInMillis()) .thenReturn(MAXIMUM_DELAY_DURATION_IN_MILLIS); - + mockedJsExecutor = Mockito.mock(JavascriptExecutor.class); - + crawlDelayMechanism = new AdaptiveCrawlDelayMechanism(mockedConfig, mockedJsExecutor); } - + @Test public void testDelayLowerThanMinimum() { // Return a delay which is lower than the predefined minimum Mockito.when(mockedJsExecutor.executeScript(Mockito.anyString())) .thenReturn(LOWER_DELAY_DURATION_IN_MILLIS); - + // The minimum delay should be returned Assert.assertEquals(mockedConfig.getMinimumCrawlDelayDurationInMillis(), crawlDelayMechanism.getDelay()); } - + @Test public void testDelayHigherThanMaximum() { // Return a delay which is higher than the predefined maximum Mockito.when(mockedJsExecutor.executeScript(Mockito.anyString())) .thenReturn(HIGHER_DELAY_DURATION_IN_MILLIS); - + // The maximum delay should be returned Assert.assertEquals(mockedConfig.getMaximumCrawlDelayDurationInMillis(), crawlDelayMechanism.getDelay()); } - + @Test public void testDelayBetweenRange() { // Return an in range delay Mockito.when(mockedJsExecutor.executeScript(Mockito.anyString())) .thenReturn(IN_RANGE_DELAY_DURATION_IN_MILLIS); - + // The in range delay should be returned Assert.assertEquals(IN_RANGE_DELAY_DURATION_IN_MILLIS, crawlDelayMechanism.getDelay()); } diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java index 8226d10..81d390d 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java @@ -20,46 +20,46 @@ import org.junit.Test; /** - * Test cases for CrawlDomain. - * + * Test cases for {@link CrawlDomain}. 
+ * * @author Peter Bencze */ public final class CrawlDomainTest { - + private static final InternetDomainName DOMAIN = InternetDomainName.from("test.com"); private static final InternetDomainName SUBDOMAIN = InternetDomainName.from("sub.test.com"); - + private static final int DOMAIN_PARTS_HASHCODE = DOMAIN.parts().hashCode(); - + private static final CrawlDomain CRAWL_DOMAIN_0 = new CrawlDomain(DOMAIN); private static final CrawlDomain CRAWL_DOMAIN_1 = new CrawlDomain(DOMAIN); private static final CrawlDomain CRAWL_DOMAIN_2 = new CrawlDomain(SUBDOMAIN); - + @Test public void testEquals() { // A crawl domain should be equal with itself Assert.assertEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_0); - + // Crawl domains with the same domain should be equal Assert.assertEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_1); - + // Crawl domains with different domains should not be equal Assert.assertNotEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_2); } - + @Test public void testHashCode() { Assert.assertEquals(DOMAIN_PARTS_HASHCODE, CRAWL_DOMAIN_0.hashCode()); } - + @Test public void testContains() { // A crawl domain should contain its own domain Assert.assertTrue(CRAWL_DOMAIN_0.contains(DOMAIN)); - + // A crawl domain should contain its own domain's subdomain Assert.assertTrue(CRAWL_DOMAIN_0.contains(SUBDOMAIN)); - + // A crawl domain should not contain a domain different from its own domain Assert.assertFalse(CRAWL_DOMAIN_2.contains(DOMAIN)); } diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 6e38a26..0413ffa 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -16,10 +16,10 @@ package com.github.peterbencze.serritor.internal; import com.github.peterbencze.serritor.api.CrawlCandidate; -import com.github.peterbencze.serritor.api.CrawlerConfiguration; import com.github.peterbencze.serritor.api.CrawlRequest; import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; import com.github.peterbencze.serritor.api.CrawlStrategy; +import com.github.peterbencze.serritor.api.CrawlerConfiguration; import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder; import java.net.URI; import java.util.Arrays; @@ -31,7 +31,7 @@ import org.mockito.Mockito; /** - * Test cases for CrawlFrontier. + * Test cases for {@link CrawlFrontier}. * * @author Peter Bencze */ diff --git a/src/test/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanismTest.java b/src/test/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanismTest.java index b2955bc..6ac7ed4 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanismTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanismTest.java @@ -23,21 +23,21 @@ import org.mockito.Mockito; /** - * Test cases for FixedCrawlDelayMechanism. - * + * Test cases for {@link FixedCrawlDelayMechanism}. 
+ * * @author Peter Bencze */ public class FixedCrawlDelayMechanismTest { - + private CrawlerConfiguration config; private FixedCrawlDelayMechanism crawlDelayMechanism; - + @Before public void initialize() { config = Mockito.spy(new CrawlerConfigurationBuilder().build()); crawlDelayMechanism = new FixedCrawlDelayMechanism(config); } - + @Test public void testGetDelay() { // The delay should be the same as in the configuration From 024d6c0570d158dbceebf8c3086f0b682da8966e Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 3 Jun 2018 23:43:46 +0200 Subject: [PATCH 10/28] Fix javadoc link --- .../serritor/internal/AdaptiveCrawlDelayMechanism.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java index 06b0df1..1c95712 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java @@ -36,8 +36,8 @@ public final class AdaptiveCrawlDelayMechanism implements CrawlDelayMechanism { * * @param config the crawler configuration which specifies the minimum and * maximum delay - * @param jsExecutor the {@link WebDriver} instance which is capable of - * executing JavaScript + * @param jsExecutor the {@link org.openqa.selenium.WebDriver} instance + * which is capable of executing JavaScript */ public AdaptiveCrawlDelayMechanism(final CrawlerConfiguration config, final JavascriptExecutor jsExecutor) { minDelayInMillis = config.getMinimumCrawlDelayDurationInMillis(); From d658886dc114bf618d402750431cac554ef26263 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sun, 3 Jun 2018 23:56:56 +0200 Subject: [PATCH 11/28] Refactor crawl delay mechanisms to a separate package --- .../com/github/peterbencze/serritor/api/BaseCrawler.java | 8 ++++---- .../AdaptiveCrawlDelayMechanism.java | 2 +- .../{ => crawldelaymechanism}/CrawlDelayMechanism.java | 2 +- .../FixedCrawlDelayMechanism.java | 2 +- .../RandomCrawlDelayMechanism.java | 2 +- .../AdaptiveCrawlDelayMechanismTest.java | 2 +- .../FixedCrawlDelayMechanismTest.java | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) rename src/main/java/com/github/peterbencze/serritor/internal/{ => crawldelaymechanism}/AdaptiveCrawlDelayMechanism.java (97%) rename src/main/java/com/github/peterbencze/serritor/internal/{ => crawldelaymechanism}/CrawlDelayMechanism.java (92%) rename src/main/java/com/github/peterbencze/serritor/internal/{ => crawldelaymechanism}/FixedCrawlDelayMechanism.java (95%) rename src/main/java/com/github/peterbencze/serritor/internal/{ => crawldelaymechanism}/RandomCrawlDelayMechanism.java (96%) rename src/test/java/com/github/peterbencze/serritor/internal/{ => crawldelaymechanism}/AdaptiveCrawlDelayMechanismTest.java (97%) rename src/test/java/com/github/peterbencze/serritor/internal/{ => crawldelaymechanism}/FixedCrawlDelayMechanismTest.java (95%) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 83b0498..008f7d8 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -21,11 +21,11 @@ import com.github.peterbencze.serritor.api.event.PageLoadTimeoutEvent; import com.github.peterbencze.serritor.api.event.RequestErrorEvent; 
import com.github.peterbencze.serritor.api.event.RequestRedirectEvent; -import com.github.peterbencze.serritor.internal.AdaptiveCrawlDelayMechanism; -import com.github.peterbencze.serritor.internal.CrawlDelayMechanism; +import com.github.peterbencze.serritor.internal.crawldelaymechanism.AdaptiveCrawlDelayMechanism; +import com.github.peterbencze.serritor.internal.crawldelaymechanism.CrawlDelayMechanism; import com.github.peterbencze.serritor.internal.CrawlFrontier; -import com.github.peterbencze.serritor.internal.FixedCrawlDelayMechanism; -import com.github.peterbencze.serritor.internal.RandomCrawlDelayMechanism; +import com.github.peterbencze.serritor.internal.crawldelaymechanism.FixedCrawlDelayMechanism; +import com.github.peterbencze.serritor.internal.crawldelaymechanism.RandomCrawlDelayMechanism; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java similarity index 97% rename from src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java rename to src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java index 1c95712..6d3926d 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.github.peterbencze.serritor.internal; +package com.github.peterbencze.serritor.internal.crawldelaymechanism; import com.github.peterbencze.serritor.api.CrawlerConfiguration; import org.openqa.selenium.JavascriptExecutor; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java similarity index 92% rename from src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java rename to src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java index cbb4634..d788ece 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.github.peterbencze.serritor.internal; +package com.github.peterbencze.serritor.internal.crawldelaymechanism; /** * An interface which should be implemented by every crawl delay mechanism. 
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java similarity index 95% rename from src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java rename to src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java index 2dcfa95..f287e8a 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.github.peterbencze.serritor.internal; +package com.github.peterbencze.serritor.internal.crawldelaymechanism; import com.github.peterbencze.serritor.api.CrawlerConfiguration; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java similarity index 96% rename from src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java rename to src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java index 6353884..cd2b035 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.github.peterbencze.serritor.internal; +package com.github.peterbencze.serritor.internal.crawldelaymechanism; import com.github.peterbencze.serritor.api.CrawlerConfiguration; import java.util.concurrent.ThreadLocalRandom; diff --git a/src/test/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanismTest.java b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java similarity index 97% rename from src/test/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanismTest.java rename to src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java index aef99c0..98340d3 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanismTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.github.peterbencze.serritor.internal; +package com.github.peterbencze.serritor.internal.crawldelaymechanism; import com.github.peterbencze.serritor.api.CrawlerConfiguration; import java.time.Duration; diff --git a/src/test/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanismTest.java b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java similarity index 95% rename from src/test/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanismTest.java rename to src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java index 6ac7ed4..d0a96ce 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanismTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.github.peterbencze.serritor.internal; +package com.github.peterbencze.serritor.internal.crawldelaymechanism; import com.github.peterbencze.serritor.api.CrawlerConfiguration; import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder; From a76d4d4947850f7cfd810a92624769c2242331ba Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Mon, 4 Jun 2018 00:53:13 +0200 Subject: [PATCH 12/28] Add handling for JavaScript redirects --- .../peterbencze/serritor/api/BaseCrawler.java | 105 ++++++++++-------- 1 file changed, 61 insertions(+), 44 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 008f7d8..3be6c69 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -21,9 +21,9 @@ import com.github.peterbencze.serritor.api.event.PageLoadTimeoutEvent; import com.github.peterbencze.serritor.api.event.RequestErrorEvent; import com.github.peterbencze.serritor.api.event.RequestRedirectEvent; +import com.github.peterbencze.serritor.internal.CrawlFrontier; import com.github.peterbencze.serritor.internal.crawldelaymechanism.AdaptiveCrawlDelayMechanism; import com.github.peterbencze.serritor.internal.crawldelaymechanism.CrawlDelayMechanism; -import com.github.peterbencze.serritor.internal.CrawlFrontier; import com.github.peterbencze.serritor.internal.crawldelaymechanism.FixedCrawlDelayMechanism; import com.github.peterbencze.serritor.internal.crawldelaymechanism.RandomCrawlDelayMechanism; import java.io.IOException; @@ -205,8 +205,7 @@ private void run() { while (!stopCrawling && crawlFrontier.hasNextCandidate()) { CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate(); - URI candidateUrl = currentCandidate.getCandidateUrl(); - URI responseUrl = candidateUrl; + String candidateUrl = currentCandidate.getCandidateUrl().toString(); HttpClientContext context = HttpClientContext.create(); HttpResponse httpHeadResponse = null; boolean isUnsuccessfulRequest = false; @@ -223,32 +222,34 @@ private void run() { } if (!isUnsuccessfulRequest) { + String responseUrl = candidateUrl; List redirectLocations = context.getRedirectLocations(); if (redirectLocations != null) { - // If the request has been redirected, get the final URL - responseUrl = redirectLocations.get(redirectLocations.size() - 1); + // If the request was redirected, get the 
final URL + responseUrl = redirectLocations.get(redirectLocations.size() - 1).toString(); } if (!responseUrl.equals(candidateUrl)) { - // If the request has been redirected, a new crawl request should be created for the redirected URL - CrawlRequestBuilder builder = new CrawlRequestBuilder(responseUrl).setPriority(currentCandidate.getPriority()); - currentCandidate.getMetadata().ifPresent(builder::setMetadata); - CrawlRequest redirectedRequest = builder.build(); - - crawlFrontier.feedRequest(redirectedRequest, false); - onRequestRedirect(new RequestRedirectEvent(currentCandidate, redirectedRequest)); + // If the request was redirected, a new crawl request should be created for the redirected URL + handleRequestRedirect(currentCandidate, responseUrl); } else if (isContentHtml(httpHeadResponse)) { boolean isTimedOut = false; TimeoutException exception = null; try { // Open the URL in the browser - webDriver.get(candidateUrl.toString()); + webDriver.get(candidateUrl); } catch (TimeoutException exc) { isTimedOut = true; exception = exc; } + String loadedPageUrl = webDriver.getCurrentUrl(); + if (!loadedPageUrl.equals(candidateUrl)) { + // If the request was redirected (using JavaScript), a new crawl request should be created for the redirected URL + handleRequestRedirect(currentCandidate, loadedPageUrl); + } + if (!isTimedOut) { onPageLoad(new PageLoadEvent(currentCandidate, webDriver)); } else { @@ -266,6 +267,29 @@ private void run() { onStop(); } + /** + * Creates the crawl delay mechanism according to the configuration. + * + * @return the created crawl delay mechanism + */ + private CrawlDelayMechanism createCrawlDelayMechanism() { + switch (config.getCrawlDelayStrategy()) { + case FIXED: + return new FixedCrawlDelayMechanism(config); + case RANDOM: + return new RandomCrawlDelayMechanism(config); + case ADAPTIVE: + AdaptiveCrawlDelayMechanism adaptiveCrawlDelay = new AdaptiveCrawlDelayMechanism(config, (JavascriptExecutor) webDriver); + if (!adaptiveCrawlDelay.isBrowserCompatible()) { + throw new UnsupportedOperationException("The Navigation Timing API is not supported by the browser."); + } + + return adaptiveCrawlDelay; + } + + throw new IllegalArgumentException("Unsupported crawl delay strategy."); + } + /** * Sends an HTTP HEAD request to the given URL and returns the response. * @@ -274,8 +298,8 @@ private void run() { * request * @return the HTTP HEAD response */ - private HttpResponse getHttpHeadResponse(final URI destinationUrl, final HttpClientContext context) throws IOException { - HttpHead headRequest = new HttpHead(destinationUrl.toString()); + private HttpResponse getHttpHeadResponse(final String destinationUrl, final HttpClientContext context) throws IOException { + HttpHead headRequest = new HttpHead(destinationUrl); return httpClient.execute(headRequest, context); } @@ -292,38 +316,19 @@ private static boolean isContentHtml(final HttpResponse httpHeadResponse) { } /** - * Creates the crawl delay mechanism according to the configuration. + * Creates a crawl request for the redirected URL, feeds it to the crawler + * and calls the appropriate event callback. 
* - * @return the created crawl delay mechanism + * @param currentCrawlCandidate the current crawl candidate + * @param redirectedUrl the URL of the redirected request */ - private CrawlDelayMechanism createCrawlDelayMechanism() { - switch (config.getCrawlDelayStrategy()) { - case FIXED: - return new FixedCrawlDelayMechanism(config); - case RANDOM: - return new RandomCrawlDelayMechanism(config); - case ADAPTIVE: - AdaptiveCrawlDelayMechanism adaptiveCrawlDelay = new AdaptiveCrawlDelayMechanism(config, (JavascriptExecutor) webDriver); - if (!adaptiveCrawlDelay.isBrowserCompatible()) { - throw new UnsupportedOperationException("The Navigation Timing API is not supported by the browser."); - } - - return adaptiveCrawlDelay; - } + private void handleRequestRedirect(final CrawlCandidate currentCrawlCandidate, final String redirectedUrl) { + CrawlRequestBuilder builder = new CrawlRequestBuilder(redirectedUrl).setPriority(currentCrawlCandidate.getPriority()); + currentCrawlCandidate.getMetadata().ifPresent(builder::setMetadata); + CrawlRequest redirectedRequest = builder.build(); - throw new IllegalArgumentException("Unsupported crawl delay strategy."); - } - - /** - * Delays the next request. - */ - private void performDelay() { - try { - TimeUnit.MILLISECONDS.sleep(crawlDelayMechanism.getDelay()); - } catch (InterruptedException ex) { - Thread.currentThread().interrupt(); - stopCrawling = true; - } + crawlFrontier.feedRequest(redirectedRequest, false); + onRequestRedirect(new RequestRedirectEvent(currentCrawlCandidate, redirectedRequest)); } /** @@ -358,6 +363,18 @@ private static BasicClientCookie convertBrowserCookie(final Cookie browserCookie return clientCookie; } + /** + * Delays the next request. + */ + private void performDelay() { + try { + TimeUnit.MILLISECONDS.sleep(crawlDelayMechanism.getDelay()); + } catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + stopCrawling = true; + } + } + /** * Callback which gets called when the crawler is started. 
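To see how the redirect handling and the event callbacks touched in this patch surface to users of the library, a minimal subclass might look like the following. MyCrawler and the printed messages are invented for illustration; only the overridden callback signatures come from the API itself.

    import com.github.peterbencze.serritor.api.BaseCrawler;
    import com.github.peterbencze.serritor.api.CrawlerConfiguration;
    import com.github.peterbencze.serritor.api.event.PageLoadEvent;
    import com.github.peterbencze.serritor.api.event.RequestRedirectEvent;

    public final class MyCrawler extends BaseCrawler {

        public MyCrawler(final CrawlerConfiguration config) {
            super(config);
        }

        @Override
        protected void onPageLoad(final PageLoadEvent event) {
            // Called when a page has loaded successfully; the browser is exposed
            // through the event.
            System.out.println("Loaded: " + event.getWebDriver().getCurrentUrl());
        }

        @Override
        protected void onRequestRedirect(final RequestRedirectEvent event) {
            // After this patch, fired for both HTTP and JavaScript redirects.
            System.out.println("Request was redirected.");
        }
    }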
*/ From 9a3c1020bc4fd41ee870c0cb51675dc7d45cfa22 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Mon, 4 Jun 2018 23:51:24 +0200 Subject: [PATCH 13/28] Call page load timeout event callback in the catch clause --- .../peterbencze/serritor/api/BaseCrawler.java | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 3be6c69..da2be12 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -233,27 +233,19 @@ private void run() { // If the request was redirected, a new crawl request should be created for the redirected URL handleRequestRedirect(currentCandidate, responseUrl); } else if (isContentHtml(httpHeadResponse)) { - boolean isTimedOut = false; - TimeoutException exception = null; - try { - // Open the URL in the browser + // Open URL in browser webDriver.get(candidateUrl); - } catch (TimeoutException exc) { - isTimedOut = true; - exception = exc; + } catch (TimeoutException exception) { + onPageLoadTimeout(new PageLoadTimeoutEvent(currentCandidate, exception)); } - + String loadedPageUrl = webDriver.getCurrentUrl(); if (!loadedPageUrl.equals(candidateUrl)) { // If the request was redirected (using JavaScript), a new crawl request should be created for the redirected URL handleRequestRedirect(currentCandidate, loadedPageUrl); - } - - if (!isTimedOut) { - onPageLoad(new PageLoadEvent(currentCandidate, webDriver)); } else { - onPageLoadTimeout(new PageLoadTimeoutEvent(currentCandidate, exception)); + onPageLoad(new PageLoadEvent(currentCandidate, webDriver)); } } else { // URLs that point to non-HTML content should not be opened in the browser From c7887a3c19e7f47526a4e91082a82f00d0a907b9 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Tue, 5 Jun 2018 23:13:46 +0200 Subject: [PATCH 14/28] Fix NPE when the host is undefined --- .../com/github/peterbencze/serritor/api/helper/UrlFinder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java index 33946e2..eaafc4a 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -225,7 +225,7 @@ public UrlFinder build() { private static boolean isValidUrl(final String url) { try { return InternetDomainName.isValid(URI.create(url).getHost()); - } catch (IllegalArgumentException e) { + } catch (IllegalArgumentException | NullPointerException exc) { return false; } } From f878f8f8aa89460473493d4da1d2b3cd223309aa Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Tue, 5 Jun 2018 23:14:50 +0200 Subject: [PATCH 15/28] Update dependency and plugin versions --- pom.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 85de756..6501efe 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 com.github.peterbencze serritor - 1.3.2 + 1.4.0 jar Serritor @@ -59,12 +59,12 @@ org.seleniumhq.selenium htmlunit-driver - 2.30.0 + 2.31.0 com.google.guava guava - 25.0-jre + 25.1-jre junit @@ -98,7 +98,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 2.10.4 + 3.0.1 attach-javadoc From cfc83fd1ab0ed037ccf69898f57e7e7766c3220c Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Tue, 5 Jun 2018 
23:30:54 +0200 Subject: [PATCH 16/28] Rename getCandidateUrl to getRequestUrl --- .../peterbencze/serritor/api/BaseCrawler.java | 2 +- .../serritor/api/CrawlCandidate.java | 2 +- .../api/event/NonHtmlContentEvent.java | 2 +- .../serritor/internal/CrawlFrontier.java | 2 +- .../serritor/internal/CrawlFrontierTest.java | 24 +++++++++---------- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index da2be12..971b196 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -205,7 +205,7 @@ private void run() { while (!stopCrawling && crawlFrontier.hasNextCandidate()) { CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate(); - String candidateUrl = currentCandidate.getCandidateUrl().toString(); + String candidateUrl = currentCandidate.getRequestUrl().toString(); HttpClientContext context = HttpClientContext.create(); HttpResponse httpHeadResponse = null; boolean isUnsuccessfulRequest = false; diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java index c219c0e..48ed647 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java @@ -51,7 +51,7 @@ public URI getRefererUrl() { * * @return the URL of the request */ - public URI getCandidateUrl() { + public URI getRequestUrl() { return crawlRequest.getRequestUrl(); } diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java index 7fc4670..3fe89e6 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java @@ -45,6 +45,6 @@ public NonHtmlContentEvent(final CrawlCandidate crawlCandidate) { * downloading the file */ public void downloadFile(final File destination) throws IOException { - FileUtils.copyURLToFile(getCrawlCandidate().getCandidateUrl().toURL(), destination); + FileUtils.copyURLToFile(getCrawlCandidate().getRequestUrl().toURL(), destination); } } diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index af7c839..0c47bb8 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -116,7 +116,7 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { return; } - builder = new CrawlCandidateBuilder(request).setRefererUrl(currentCandidate.getCandidateUrl()) + builder = new CrawlCandidateBuilder(request).setRefererUrl(currentCandidate.getRequestUrl()) .setCrawlDepth(nextCrawlDepth); } else { builder = new CrawlCandidateBuilder(request); diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index 0413ffa..a477d51 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -182,7 +182,7 @@ 
public void testDisabledDuplicateRequestFiltering() { Assert.assertTrue(frontier.hasNextCandidate()); // Check if the URLs match - Assert.assertEquals(DUPLICATE_ROOT_URL_0, frontier.getNextCandidate().getCandidateUrl()); + Assert.assertEquals(DUPLICATE_ROOT_URL_0, frontier.getNextCandidate().getRequestUrl()); } @Test @@ -212,7 +212,7 @@ public void testDisabledOffsiteRequestFiltering() { Assert.assertTrue(frontier.hasNextCandidate()); // Check if the URLs match - Assert.assertEquals(OFFSITE_URL.toString(), frontier.getNextCandidate().getCandidateUrl().toString()); + Assert.assertEquals(OFFSITE_URL.toString(), frontier.getNextCandidate().getRequestUrl().toString()); } @Test @@ -221,7 +221,7 @@ public void testGetNextCandidateUsingBreadthFirstCrawlStrategy() { CrawlCandidate nextCandidate = frontier.getNextCandidate(); // Check the URL of this candidate, it should be root URL 1. - Assert.assertEquals(ROOT_URL_1, nextCandidate.getCandidateUrl()); + Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl()); // Check the crawl depth of this candidate, it should be 0 because it is a root URL. Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); @@ -236,7 +236,7 @@ public void testGetNextCandidateUsingBreadthFirstCrawlStrategy() { nextCandidate = frontier.getNextCandidate(); // Check the URL of this candidate, it should be root URL 0. - Assert.assertEquals(ROOT_URL_0, nextCandidate.getCandidateUrl()); + Assert.assertEquals(ROOT_URL_0, nextCandidate.getRequestUrl()); // Check the crawl depth of this candidate, it should be 0 again because it is also a root URL. Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); @@ -252,7 +252,7 @@ public void testGetNextCandidateUsingBreadthFirstCrawlStrategy() { nextCandidate = frontier.getNextCandidate(); // Check the URL of this candidate, it should be child URL 2. - Assert.assertEquals(CHILD_URL_2.toString(), nextCandidate.getCandidateUrl().toString()); + Assert.assertEquals(CHILD_URL_2.toString(), nextCandidate.getRequestUrl().toString()); // Check the crawl depth of this candidate, it should be 1 because it is a child URL that comes from root URL 1. Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); @@ -265,7 +265,7 @@ public void testGetNextCandidateUsingBreadthFirstCrawlStrategy() { nextCandidate = frontier.getNextCandidate(); // Check the URL of this request, it should be a child URL. - Assert.assertTrue(nextCandidate.getCandidateUrl().toString().contains(CHILD_URL_PATH)); + Assert.assertTrue(nextCandidate.getRequestUrl().toString().contains(CHILD_URL_PATH)); // Check the crawl depth of this candidate, it should be 1 again because it is a child URL that comes from root URL 0. Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); @@ -277,7 +277,7 @@ public void testGetNextCandidateUsingBreadthFirstCrawlStrategy() { nextCandidate = frontier.getNextCandidate(); // Check the URL of this candidate, it should be a child URL. - Assert.assertTrue(nextCandidate.getCandidateUrl().toString().contains(CHILD_URL_PATH)); + Assert.assertTrue(nextCandidate.getRequestUrl().toString().contains(CHILD_URL_PATH)); // Check the crawl depth of this candidate, it should be 1 again becaise it is another child URL that also comes from root URL 0. 
Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); @@ -301,7 +301,7 @@ public void testGetNextCandidateUsingDepthFirstCrawlStrategy() { CrawlCandidate nextCandidate = frontier.getNextCandidate(); // Check the URL of this candidate, it should be root URL 1 - Assert.assertEquals(ROOT_URL_1, nextCandidate.getCandidateUrl()); + Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl()); // Check the crawl depth of this candidate, it should be 0 because it is a root URL Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); @@ -317,7 +317,7 @@ public void testGetNextCandidateUsingDepthFirstCrawlStrategy() { nextCandidate = frontier.getNextCandidate(); // Check the URL of this candidate, it should be a child URL - Assert.assertTrue(nextCandidate.getCandidateUrl().toString().contains(CHILD_URL_PATH)); + Assert.assertTrue(nextCandidate.getRequestUrl().toString().contains(CHILD_URL_PATH)); // Check the crawl depth of this candidate, it should be 1 because it is a child URL that comes from root URL 1 Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); @@ -329,7 +329,7 @@ public void testGetNextCandidateUsingDepthFirstCrawlStrategy() { nextCandidate = frontier.getNextCandidate(); // Check the URL of this candidate, it should be root URL 0 - Assert.assertEquals(ROOT_URL_0, nextCandidate.getCandidateUrl()); + Assert.assertEquals(ROOT_URL_0, nextCandidate.getRequestUrl()); // Check the crawl depth of this candidate, it should be 0 again because it is also a root URL Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); @@ -345,7 +345,7 @@ public void testGetNextCandidateUsingDepthFirstCrawlStrategy() { nextCandidate = frontier.getNextCandidate(); // Check the URL of this candidate, it should be child URL 0 - Assert.assertEquals(CHILD_URL_0.toString(), nextCandidate.getCandidateUrl().toString()); + Assert.assertEquals(CHILD_URL_0.toString(), nextCandidate.getRequestUrl().toString()); // Check the crawl depth of this candidate, it should be 1 again because it is a child URL that comes from root URL 0 Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); @@ -357,7 +357,7 @@ public void testGetNextCandidateUsingDepthFirstCrawlStrategy() { nextCandidate = frontier.getNextCandidate(); // Check the URL of this candidate, it should be child URL 1 - Assert.assertEquals(CHILD_URL_1.toString(), nextCandidate.getCandidateUrl().toString()); + Assert.assertEquals(CHILD_URL_1.toString(), nextCandidate.getRequestUrl().toString()); // Check the crawl depth of this candidate, it should be 1 again becaise it is a child URL that also comes from root URL 0 Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); From 34eaaa7bdae0e42ae3a7b45cda21bdc2b2393b9f Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Mon, 11 Jun 2018 22:07:13 +0200 Subject: [PATCH 17/28] Add checkstyle plugin, reformat code to comply with the style guide --- checkstyle.xml | 255 ++++++++++++++++++ pom.xml | 23 ++ .../peterbencze/serritor/api/BaseCrawler.java | 95 ++++--- .../serritor/api/CrawlCandidate.java | 8 +- .../serritor/api/CrawlDelayStrategy.java | 10 +- .../serritor/api/CrawlRequest.java | 12 +- .../serritor/api/CrawlStrategy.java | 6 +- .../serritor/api/CrawlerConfiguration.java | 88 +++--- .../api/event/NonHtmlContentEvent.java | 8 +- .../serritor/api/event/PageLoadEvent.java | 5 +- .../api/event/PageLoadTimeoutEvent.java | 21 +- .../serritor/api/event/RequestErrorEvent.java | 5 +- 
.../api/event/RequestRedirectEvent.java | 8 +- .../serritor/api/helper/UrlFinder.java | 28 +- .../serritor/internal/CrawlDomain.java | 12 +- .../serritor/internal/CrawlFrontier.java | 55 ++-- .../serritor/internal/EventObject.java | 3 +- .../AdaptiveCrawlDelayMechanism.java | 38 +-- .../CrawlDelayMechanism.java | 3 +- .../FixedCrawlDelayMechanism.java | 10 +- .../RandomCrawlDelayMechanism.java | 13 +- .../serritor/api/helper/UrlFinderTest.java | 7 +- .../serritor/internal/CrawlDomainTest.java | 13 +- .../serritor/internal/CrawlFrontierTest.java | 156 +++-------- .../AdaptiveCrawlDelayMechanismTest.java | 15 +- .../FixedCrawlDelayMechanismTest.java | 7 +- 26 files changed, 573 insertions(+), 331 deletions(-) create mode 100644 checkstyle.xml diff --git a/checkstyle.xml b/checkstyle.xml new file mode 100644 index 0000000..52ef575 --- /dev/null +++ b/checkstyle.xml @@ -0,0 +1,255 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pom.xml b/pom.xml index 6501efe..d57489c 100644 --- a/pom.xml +++ b/pom.xml @@ -108,6 +108,29 @@ + + org.apache.maven.plugins + maven-checkstyle-plugin + 3.0.0 + + + com.puppycrawl.tools + checkstyle + 8.10.1 + + + + true + checkstyle.xml + + + + + check + + + + org.apache.maven.plugins maven-gpg-plugin diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 971b196..e200a84 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.api; import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder; @@ -50,8 +51,8 @@ import org.openqa.selenium.htmlunit.HtmlUnitDriver; /** - * Provides a skeletal implementation of a crawler to minimize the effort for - * users to implement their own. + * Provides a skeletal implementation of a crawler to minimize the effort for users to implement + * their own. * * @author Peter Bencze */ @@ -87,11 +88,9 @@ public final void start() { } /** - * Starts the crawler using the browser specified by the given - * WebDriver instance. + * Starts the crawler using the browser specified by the given WebDriver instance. * - * @param webDriver the WebDriver instance to control the - * browser + * @param webDriver the WebDriver instance to control the browser */ public final void start(final WebDriver webDriver) { start(webDriver, new CrawlFrontier(config)); @@ -100,8 +99,8 @@ public final void start(final WebDriver webDriver) { /** * Initializes and runs the crawler. 
* - * @param crawlFrontier the CrawlFrontier instance to be used - * by the crawler to manage crawl requests + * @param crawlFrontier the CrawlFrontier instance to be used by the crawler to + * manage crawl requests */ private void start(final WebDriver webDriver, final CrawlFrontier crawlFrontier) { try { @@ -133,7 +132,8 @@ private void start(final WebDriver webDriver, final CrawlFrontier crawlFrontier) */ public final void saveState(final OutputStream out) { // Check if the crawler has been started at least once, otherwise we have nothing to save - Validate.validState(crawlFrontier != null, "Cannot save state at this point. The crawler should be started first."); + Validate.validState(crawlFrontier != null, + "Cannot save state at this point. The crawler should be started first."); // Save the crawl frontier's current state SerializationUtils.serialize(crawlFrontier, out); @@ -152,9 +152,8 @@ public final void resumeState(final InputStream in) { * Resumes a previously saved state using the browser specified by the given * WebDriver instance. * - * @param webDriver the WebDriver instance to control the - * browser - * @param in the input stream from which the state should be loaded + * @param webDriver the WebDriver instance to control the browser + * @param in the input stream from which the state should be loaded */ public final void resumeState(final WebDriver webDriver, final InputStream in) { // Re-create crawl frontier from the saved state @@ -175,21 +174,22 @@ public final void stop() { } /** - * Feeds a crawl request to the crawler. The crawler should be running, - * otherwise the request has to be added as a crawl seed instead. + * Feeds a crawl request to the crawler. The crawler should be running, otherwise the request + * has to be added as a crawl seed instead. * * @param request the crawl request */ protected final void crawl(final CrawlRequest request) { Validate.notNull(request, "The request cannot be null."); - Validate.validState(!isStopped, "The crawler is not started. Maybe you meant to add this request as a crawl seed?"); + Validate.validState(!isStopped, + "The crawler is not started. Maybe you meant to add this request as a crawl seed?"); crawlFrontier.feedRequest(request, false); } /** - * Feeds multiple crawl requests to the crawler. The crawler should be - * running, otherwise the requests have to be added as crawl seeds instead. + * Feeds multiple crawl requests to the crawler. The crawler should be running, otherwise the + * requests have to be added as crawl seeds instead. * * @param requests the list of crawl requests */ @@ -210,11 +210,11 @@ private void run() { HttpResponse httpHeadResponse = null; boolean isUnsuccessfulRequest = false; - // Update the client's cookie store, so it will have the same state as the browser. 
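The saveState and resumeState pair reformatted above is what makes the serialized CrawlFrontier useful across runs; a usage sketch follows. The helper class and the crawler-state.bin file name are placeholders, and saveState requires the crawler to have been started at least once.

    import com.github.peterbencze.serritor.api.BaseCrawler;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.OutputStream;

    final class StatePersistenceSketch {

        // Saves the crawl frontier to disk, then resumes a later crawl from it.
        static void saveAndResume(final BaseCrawler crawler) throws IOException {
            try (OutputStream out = new FileOutputStream("crawler-state.bin")) {
                crawler.saveState(out);
            }
            try (InputStream in = new FileInputStream("crawler-state.bin")) {
                crawler.resumeState(in);
            }
        }
    }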
+ // Update the client's cookie store, so it will have the same state as the browser updateClientCookieStore(); try { - // Send an HTTP HEAD request to the current URL to determine its availability and content type + // Send an HTTP HEAD request to determine its availability and content type httpHeadResponse = getHttpHeadResponse(candidateUrl, context); } catch (IOException exception) { onRequestError(new RequestErrorEvent(currentCandidate, exception)); @@ -230,7 +230,7 @@ private void run() { } if (!responseUrl.equals(candidateUrl)) { - // If the request was redirected, a new crawl request should be created for the redirected URL + // Create a new crawl request for the redirected URL handleRequestRedirect(currentCandidate, responseUrl); } else if (isContentHtml(httpHeadResponse)) { try { @@ -239,10 +239,10 @@ private void run() { } catch (TimeoutException exception) { onPageLoadTimeout(new PageLoadTimeoutEvent(currentCandidate, exception)); } - + String loadedPageUrl = webDriver.getCurrentUrl(); if (!loadedPageUrl.equals(candidateUrl)) { - // If the request was redirected (using JavaScript), a new crawl request should be created for the redirected URL + // Create a new crawl request for the redirected URL (JavaScript redirect) handleRequestRedirect(currentCandidate, loadedPageUrl); } else { onPageLoad(new PageLoadEvent(currentCandidate, webDriver)); @@ -264,6 +264,7 @@ private void run() { * * @return the created crawl delay mechanism */ + @SuppressWarnings("checkstyle:MissingSwitchDefault") private CrawlDelayMechanism createCrawlDelayMechanism() { switch (config.getCrawlDelayStrategy()) { case FIXED: @@ -271,12 +272,14 @@ private CrawlDelayMechanism createCrawlDelayMechanism() { case RANDOM: return new RandomCrawlDelayMechanism(config); case ADAPTIVE: - AdaptiveCrawlDelayMechanism adaptiveCrawlDelay = new AdaptiveCrawlDelayMechanism(config, (JavascriptExecutor) webDriver); - if (!adaptiveCrawlDelay.isBrowserCompatible()) { - throw new UnsupportedOperationException("The Navigation Timing API is not supported by the browser."); + AdaptiveCrawlDelayMechanism mechanism + = new AdaptiveCrawlDelayMechanism(config, (JavascriptExecutor) webDriver); + if (!mechanism.isBrowserCompatible()) { + throw new UnsupportedOperationException("The Navigation Timing API is not " + + "supported by the browser."); } - return adaptiveCrawlDelay; + return mechanism; } throw new IllegalArgumentException("Unsupported crawl delay strategy."); @@ -286,11 +289,14 @@ private CrawlDelayMechanism createCrawlDelayMechanism() { * Sends an HTTP HEAD request to the given URL and returns the response. * * @param destinationUrl the destination URL - * @throws IOException if an error occurs while trying to fulfill the - * request + * * @return the HTTP HEAD response + * + * @throws IOException if an error occurs while trying to fulfill the request */ - private HttpResponse getHttpHeadResponse(final String destinationUrl, final HttpClientContext context) throws IOException { + private HttpResponse getHttpHeadResponse( + final String destinationUrl, + final HttpClientContext context) throws IOException { HttpHead headRequest = new HttpHead(destinationUrl); return httpClient.execute(headRequest, context); } @@ -299,8 +305,8 @@ private HttpResponse getHttpHeadResponse(final String destinationUrl, final Http * Indicates if the response's content type is HTML. 
* * @param httpHeadResponse the HTTP HEAD response - * @return true if the content type is HTML, false - * otherwise + * + * @return true if the content type is HTML, false otherwise */ private static boolean isContentHtml(final HttpResponse httpHeadResponse) { Header contentTypeHeader = httpHeadResponse.getFirstHeader("Content-Type"); @@ -308,14 +314,17 @@ private static boolean isContentHtml(final HttpResponse httpHeadResponse) { } /** - * Creates a crawl request for the redirected URL, feeds it to the crawler - * and calls the appropriate event callback. + * Creates a crawl request for the redirected URL, feeds it to the crawler and calls the + * appropriate event callback. * * @param currentCrawlCandidate the current crawl candidate - * @param redirectedUrl the URL of the redirected request + * @param redirectedUrl the URL of the redirected request */ - private void handleRequestRedirect(final CrawlCandidate currentCrawlCandidate, final String redirectedUrl) { - CrawlRequestBuilder builder = new CrawlRequestBuilder(redirectedUrl).setPriority(currentCrawlCandidate.getPriority()); + private void handleRequestRedirect( + final CrawlCandidate currentCrawlCandidate, + final String redirectedUrl) { + CrawlRequestBuilder builder = new CrawlRequestBuilder(redirectedUrl) + .setPriority(currentCrawlCandidate.getPriority()); currentCrawlCandidate.getMetadata().ifPresent(builder::setMetadata); CrawlRequest redirectedRequest = builder.build(); @@ -324,8 +333,8 @@ private void handleRequestRedirect(final CrawlCandidate currentCrawlCandidate, f } /** - * Adds all the browser cookies for the current domain to the HTTP client's - * cookie store, replacing any existing equivalent ones. + * Adds all the browser cookies for the current domain to the HTTP client's cookie store, + * replacing any existing equivalent ones. */ private void updateClientCookieStore() { webDriver.manage() @@ -339,10 +348,12 @@ private void updateClientCookieStore() { * Converts a browser cookie to a HTTP client one. * * @param browserCookie the browser cookie to be converted + * * @return the converted HTTP client cookie */ private static BasicClientCookie convertBrowserCookie(final Cookie browserCookie) { - BasicClientCookie clientCookie = new BasicClientCookie(browserCookie.getName(), browserCookie.getValue()); + BasicClientCookie clientCookie + = new BasicClientCookie(browserCookie.getName(), browserCookie.getValue()); clientCookie.setDomain(browserCookie.getDomain()); clientCookie.setPath(browserCookie.getPath()); clientCookie.setExpiryDate(browserCookie.getExpiry()); @@ -406,8 +417,8 @@ protected void onRequestRedirect(final RequestRedirectEvent event) { } /** - * Callback which gets called when the page does not load in the browser - * within the timeout period. + * Callback which gets called when the page does not load in the browser within the timeout + * period. * * @param event the PageLoadTimeoutEvent instance */ diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java index 48ed647..9b238d7 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package com.github.peterbencze.serritor.api; import com.google.common.net.InternetDomainName; @@ -104,8 +105,7 @@ public static final class CrawlCandidateBuilder { /** * Creates a {@link CrawlCandidateBuilder} instance. * - * @param request the CrawlRequest instance from which this - * candidate is built + * @param request the CrawlRequest instance from which this candidate is built */ public CrawlCandidateBuilder(final CrawlRequest request) { crawlRequest = request; @@ -115,6 +115,7 @@ public CrawlCandidateBuilder(final CrawlRequest request) { * Sets the referer URL. * * @param refererUrl the referer URL + * * @return the CrawlCandidateBuilder instance */ public CrawlCandidateBuilder setRefererUrl(final URI refererUrl) { @@ -126,6 +127,7 @@ public CrawlCandidateBuilder setRefererUrl(final URI refererUrl) { * Sets the crawl depth of the request. * * @param crawlDepth the crawl depth of the request + * * @return the CrawlCandidateBuilder instance */ public CrawlCandidateBuilder setCrawlDepth(final int crawlDepth) { diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java index a7bc47b..4a80d8b 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2018 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,16 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.api; /** - * Available crawl delay strategies which define how the delay between each - * request is determined. - * + * Available crawl delay strategies which define how the delay between each request is determined. + * * @author Peter Bencze */ public enum CrawlDelayStrategy { - + FIXED, ADAPTIVE, RANDOM diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java index 08038f2..f1c6e4a 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.api; import com.google.common.net.InternetDomainName; @@ -24,8 +25,8 @@ import org.apache.commons.lang3.Validate; /** - * Represents a crawl request that may be completed by the crawler. If request - * filtering is enabled, it could get filtered out. + * Represents a crawl request that may be completed by the crawler. If request filtering is enabled, + * it could get filtered out. * * @author Peter Bencze */ @@ -120,8 +121,8 @@ public CrawlRequestBuilder(final String requestUrl) { /** * Sets the priority of the request. * - * @param priority the priority of the request (higher number means - * higher priority) + * @param priority the priority of the request (higher number means higher priority) + * * @return the CrawlRequestBuilder instance */ public CrawlRequestBuilder setPriority(final int priority) { @@ -133,6 +134,7 @@ public CrawlRequestBuilder setPriority(final int priority) { * Sets the metadata associated with the request. 
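To make the builder API in this hunk concrete, here is a minimal sketch of constructing a prioritized crawl request. It assumes CrawlRequestBuilder is the nested builder of CrawlRequest, as the test code later in this series suggests; the URL is a placeholder.

import com.github.peterbencze.serritor.api.CrawlRequest;
import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;

public final class CrawlRequestSketch {

    public static void main(final String[] args) {
        // Higher numbers mean higher priority, as the builder's Javadoc states.
        CrawlRequest request = new CrawlRequestBuilder("https://example.com")
                .setPriority(1)
                .build();

        System.out.println(request.getRequestUrl());
    }
}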
* * @param metadata the metadata associated with the request + * * @return the CrawlRequestBuilder instance */ public CrawlRequestBuilder setMetadata(final Serializable metadata) { diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java index c505932..e449892 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,11 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.api; /** - * Available crawl strategies that define the order in which crawl requests are - * processed. + * Available crawl strategies that define the order in which crawl requests are processed. * * @author Peter Bencze */ diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java index bcc2d25..d5aef15 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java +++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.api; import com.github.peterbencze.serritor.internal.CrawlDomain; @@ -155,9 +156,12 @@ public static final class CrawlerConfigurationBuilder { private static final boolean FILTER_OFFSITE_REQUESTS_BY_DEFAULT = false; private static final int DEFAULT_MAX_CRAWL_DEPTH = 0; private static final CrawlDelayStrategy DEFAULT_CRAWL_DELAY = CrawlDelayStrategy.FIXED; - private static final long DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS = Duration.ZERO.toMillis(); - private static final long DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS = Duration.ofSeconds(1).toMillis(); - private static final long DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS = Duration.ofMinutes(1).toMillis(); + private static final long DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS + = Duration.ZERO.toMillis(); + private static final long DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS + = Duration.ofSeconds(1).toMillis(); + private static final long DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS + = Duration.ofMinutes(1).toMillis(); private final Set allowedCrawlDomains; private final Set crawlSeeds; @@ -192,26 +196,29 @@ public CrawlerConfigurationBuilder() { * Appends an internet domain to the list of allowed crawl domains. * * @param allowedCrawlDomain a well-formed internet domain name + * * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder addAllowedCrawlDomain(final String allowedCrawlDomain) { InternetDomainName domain = InternetDomainName.from(allowedCrawlDomain); - Validate.isTrue(domain.isUnderPublicSuffix(), String.format("The domain (\"%s\") is not under public suffix.", allowedCrawlDomain)); + Validate.isTrue(domain.isUnderPublicSuffix(), + String.format("The domain (\"%s\") is not under public suffix.", + allowedCrawlDomain)); allowedCrawlDomains.add(new CrawlDomain(domain)); return this; } /** - * Appends a list of internet domains to the list of allowed crawl - * domains. 
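The addAllowedCrawlDomain method above rejects names that are not under a public suffix. The following stand-alone snippet illustrates the Guava check it relies on; "example.com" is a placeholder domain.

import com.google.common.net.InternetDomainName;
import org.apache.commons.lang3.Validate;

public final class DomainValidationSketch {

    public static void main(final String[] args) {
        // "example.com" is under the public suffix "com", so it passes;
        // a bare suffix such as "com" would fail the isUnderPublicSuffix check.
        InternetDomainName domain = InternetDomainName.from("example.com");
        Validate.isTrue(domain.isUnderPublicSuffix(),
                String.format("The domain (\"%s\") is not under public suffix.", "example.com"));

        System.out.println(domain.topPrivateDomain());
    }
}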
+ * Appends a list of internet domains to the list of allowed crawl domains. + * + * @param allowedCrawlDomains a list of well-formed internet domain names * - * @param allowedCrawlDomains a list of well-formed internet domain - * names * @return the CrawlerConfigurationBuilder instance */ - public CrawlerConfigurationBuilder addAllowedCrawlDomains(final List allowedCrawlDomains) { + public CrawlerConfigurationBuilder addAllowedCrawlDomains( + final List allowedCrawlDomains) { allowedCrawlDomains.forEach(this::addAllowedCrawlDomain); return this; } @@ -220,6 +227,7 @@ public CrawlerConfigurationBuilder addAllowedCrawlDomains(final List all * Appends a crawl request to the set of crawl seeds. * * @param request the crawl request which represents a crawl seed + * * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder addCrawlSeed(final CrawlRequest request) { @@ -232,8 +240,8 @@ public CrawlerConfigurationBuilder addCrawlSeed(final CrawlRequest request) { /** * Appends a list of crawl requests to the set of crawl seeds. * - * @param requests the list of crawl requests which represent crawl - * seeds + * @param requests the list of crawl requests which represent crawl seeds + * * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder addCrawlSeeds(final List requests) { @@ -242,11 +250,12 @@ public CrawlerConfigurationBuilder addCrawlSeeds(final List reques } /** - * Sets the crawl strategy to be used by the crawler. Breadth-first - * strategy orders crawl requests by the lowest crawl depth, whereas - * depth-first orders them by the highest crawl depth. + * Sets the crawl strategy to be used by the crawler. Breadth-first strategy orders crawl + * requests by the lowest crawl depth, whereas depth-first orders them by the highest crawl + * depth. * * @param strategy the crawl strategy + * * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder setCrawlStrategy(final CrawlStrategy strategy) { @@ -259,11 +268,13 @@ public CrawlerConfigurationBuilder setCrawlStrategy(final CrawlStrategy strategy /** * Enables or disables duplicate request filtering. * - * @param filterDuplicateRequests true means enabled, - * false means disabled + * @param filterDuplicateRequests true means enabled, false means + * disabled + * * @return the CrawlerConfigurationBuilder instance */ - public CrawlerConfigurationBuilder setDuplicateRequestFiltering(final boolean filterDuplicateRequests) { + public CrawlerConfigurationBuilder setDuplicateRequestFiltering( + final boolean filterDuplicateRequests) { this.filterDuplicateRequests = filterDuplicateRequests; return this; } @@ -271,20 +282,22 @@ public CrawlerConfigurationBuilder setDuplicateRequestFiltering(final boolean fi /** * Enables or disables offsite request filtering. * - * @param filterOffsiteRequests true means enabled, - * false means disabled + * @param filterOffsiteRequests true means enabled, false means + * disabled + * * @return the CrawlerConfigurationBuilder instance */ - public CrawlerConfigurationBuilder setOffsiteRequestFiltering(final boolean filterOffsiteRequests) { + public CrawlerConfigurationBuilder setOffsiteRequestFiltering( + final boolean filterOffsiteRequests) { this.filterOffsiteRequests = filterOffsiteRequests; return this; } /** - * Sets the maximum crawl depth. It should be a non-negative - * number (0 means no limit). + * Sets the maximum crawl depth. It should be a non-negative number (0 means no limit). 
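Putting the builder methods from this hunk together, a configuration might be assembled roughly as follows. The nested-class imports and the final build() call are assumptions based on the builder pattern used throughout the project, not something visible in this hunk; the domain, seed URL, and depth are placeholders.

import com.github.peterbencze.serritor.api.CrawlRequest;
import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
import com.github.peterbencze.serritor.api.CrawlStrategy;
import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder;

public final class ConfigurationSketch {

    public static void main(final String[] args) {
        CrawlRequest seed = new CrawlRequestBuilder("https://example.com").build();

        CrawlerConfiguration config = new CrawlerConfigurationBuilder()
                .addAllowedCrawlDomain("example.com")   // offsite filtering uses this set
                .addCrawlSeed(seed)
                .setCrawlStrategy(CrawlStrategy.BREADTH_FIRST)
                .setOffsiteRequestFiltering(true)
                .setDuplicateRequestFiltering(true)
                .setMaximumCrawlDepth(2)                // 0 would mean no limit
                .build();

        System.out.println(config.getMaximumCrawlDepth());
    }
}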
* * @param maxCrawlDepth the maximum crawl depth + * * @return the CrawlerConfigurationBuilder instance */ public CrawlerConfigurationBuilder setMaximumCrawlDepth(final int maxCrawlDepth) { @@ -295,13 +308,15 @@ public CrawlerConfigurationBuilder setMaximumCrawlDepth(final int maxCrawlDepth) } /** - * Sets the crawl delay strategy to be used by the crawler. This - * strategy defines how the delay between each request is determined. + * Sets the crawl delay strategy to be used by the crawler. This strategy defines how the + * delay between each request is determined. * * @param strategy the crawl delay strategy + * * @return the CrawlerConfigurationBuilder instance */ - public CrawlerConfigurationBuilder setCrawlDelayStrategy(final CrawlDelayStrategy strategy) { + public CrawlerConfigurationBuilder setCrawlDelayStrategy( + final CrawlDelayStrategy strategy) { Validate.notNull(strategy, "The strategy cannot be null."); crawlDelayStrategy = strategy; @@ -312,9 +327,11 @@ public CrawlerConfigurationBuilder setCrawlDelayStrategy(final CrawlDelayStrateg * Sets the exact duration of delay between each request. * * @param fixedCrawlDelayDuration the duration of delay + * * @return the CrawlerConfigurationBuilder instance */ - public CrawlerConfigurationBuilder setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) { + public CrawlerConfigurationBuilder setFixedCrawlDelayDuration( + final Duration fixedCrawlDelayDuration) { Validate.notNull(fixedCrawlDelayDuration, "The duration cannot be null."); fixedCrawlDelayDurationInMillis = fixedCrawlDelayDuration.toMillis(); @@ -325,15 +342,19 @@ public CrawlerConfigurationBuilder setFixedCrawlDelayDuration(final Duration fix * Sets the minimum duration of delay between each request. * * @param minCrawlDelayDuration the minimum duration of delay + * * @return the CrawlerConfigurationBuilder instance */ - public CrawlerConfigurationBuilder setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) { + public CrawlerConfigurationBuilder setMinimumCrawlDelayDuration( + final Duration minCrawlDelayDuration) { Validate.notNull(minCrawlDelayDuration, "The duration cannot be null."); - Validate.isTrue(!minCrawlDelayDuration.isNegative(), "The minimum crawl delay cannot be negative."); + Validate.isTrue(!minCrawlDelayDuration.isNegative(), + "The minimum crawl delay cannot be negative."); long minDelayDurationInMillis = minCrawlDelayDuration.toMillis(); - Validate.isTrue(minDelayDurationInMillis < maxCrawlDelayDurationInMillis, "The minimum crawl delay should be less than the maximum."); + Validate.isTrue(minDelayDurationInMillis < maxCrawlDelayDurationInMillis, + "The minimum crawl delay should be less than the maximum."); minCrawlDelayDurationInMillis = minDelayDurationInMillis; return this; @@ -343,14 +364,17 @@ public CrawlerConfigurationBuilder setMinimumCrawlDelayDuration(final Duration m * Sets the maximum duration of delay between each request. 
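The delay-related setters above validate that the minimum stays below the maximum (defaults are one second and one minute), so when both limits change the call order matters. A hedged sketch with placeholder durations, again assuming a conventional build() method:

import com.github.peterbencze.serritor.api.CrawlDelayStrategy;
import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder;
import java.time.Duration;

public final class DelayConfigurationSketch {

    public static void main(final String[] args) {
        // Raising the minimum to 2s first is fine because the default maximum is 1 minute;
        // only then is the maximum lowered to 10s.
        CrawlerConfiguration config = new CrawlerConfigurationBuilder()
                .setCrawlDelayStrategy(CrawlDelayStrategy.RANDOM)
                .setMinimumCrawlDelayDuration(Duration.ofSeconds(2))
                .setMaximumCrawlDelayDuration(Duration.ofSeconds(10))
                .build();

        System.out.println(config.getMinimumCrawlDelayDurationInMillis());
    }
}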
* * @param maxCrawlDelayDuration the maximum duration of delay + * * @return the CrawlerConfigurationBuilder instance */ - public CrawlerConfigurationBuilder setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) { + public CrawlerConfigurationBuilder setMaximumCrawlDelayDuration( + final Duration maxCrawlDelayDuration) { Validate.notNull(maxCrawlDelayDuration, "The duration cannot be null."); long maxDelayDurationInMillis = maxCrawlDelayDuration.toMillis(); - Validate.isTrue(maxDelayDurationInMillis > minCrawlDelayDurationInMillis, "The maximum crawl delay should be higher than the minimum."); + Validate.isTrue(maxDelayDurationInMillis > minCrawlDelayDurationInMillis, + "The maximum crawl delay should be higher than the minimum."); maxCrawlDelayDurationInMillis = maxDelayDurationInMillis; return this; diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java index 3fe89e6..932df11 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.api.event; import com.github.peterbencze.serritor.api.CrawlCandidate; @@ -41,8 +42,9 @@ public NonHtmlContentEvent(final CrawlCandidate crawlCandidate) { * Downloads the file specified by the URL. * * @param destination the destination file - * @throws IOException if the URL cannot be opened or I/O error occurs while - * downloading the file + * + * @throws IOException if the URL cannot be opened or I/O error occurs while downloading the + * file */ public void downloadFile(final File destination) throws IOException { FileUtils.copyURLToFile(getCrawlCandidate().getRequestUrl().toURL(), destination); diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java index cd3726b..d83e394 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.api.event; import com.github.peterbencze.serritor.api.CrawlCandidate; @@ -32,7 +33,7 @@ public final class PageLoadEvent extends EventObject { * Creates a {@link PageLoadEvent} instance. 
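NonHtmlContentEvent.downloadFile, shown above, copies the resource behind the request URL to a local file. A small usage sketch; the destination path is a placeholder, and the crawler callback in which this would typically run is not part of this hunk.

import com.github.peterbencze.serritor.api.event.NonHtmlContentEvent;
import java.io.File;
import java.io.IOException;

final class DownloadSketch {

    private DownloadSketch() {
    }

    // Saves the non-HTML resource (e.g. a PDF) pointed to by the event's request URL.
    static void saveResource(final NonHtmlContentEvent event) throws IOException {
        event.downloadFile(new File("downloads/resource.bin"));
    }
}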
* * @param crawlCandidate the current crawl candidate - * @param webDriver the WebDriver to control the browser + * @param webDriver the WebDriver to control the browser */ public PageLoadEvent(final CrawlCandidate crawlCandidate, final WebDriver webDriver) { super(crawlCandidate); diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java index a2b88b8..e6c3e16 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2018 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.api.event; import com.github.peterbencze.serritor.api.CrawlCandidate; @@ -20,30 +21,30 @@ import org.openqa.selenium.TimeoutException; /** - * Event which gets delivered when a page does not load in the browser within - * the timeout period. + * Event which gets delivered when a page does not load in the browser within the timeout period. * * @author Peter Bencze */ public final class PageLoadTimeoutEvent extends EventObject { - + private final TimeoutException exception; - + /** * Creates a {@link PageLoadTimeoutEvent} instance. - * + * * @param crawlCandidate the current crawl candidate - * @param exception the thrown exception + * @param exception the thrown exception */ - public PageLoadTimeoutEvent(final CrawlCandidate crawlCandidate, final TimeoutException exception) { + public PageLoadTimeoutEvent(final CrawlCandidate crawlCandidate, + final TimeoutException exception) { super(crawlCandidate); - + this.exception = exception; } /** * Returns the thrown exception. - * + * * @return the thrown exception */ public TimeoutException getException() { diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java index 305840b..bbce9b3 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.api.event; import com.github.peterbencze.serritor.api.CrawlCandidate; @@ -32,7 +33,7 @@ public final class RequestErrorEvent extends EventObject { * Creates a {@link RequestErrorEvent} instance. 
* * @param crawlCandidate the current crawl candidate - * @param exception the thrown exception + * @param exception the thrown exception */ public RequestErrorEvent(final CrawlCandidate crawlCandidate, final IOException exception) { super(crawlCandidate); diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java index d142d0a..188ba3e 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java +++ b/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2018 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.api.event; import com.github.peterbencze.serritor.api.CrawlCandidate; @@ -31,10 +32,11 @@ public final class RequestRedirectEvent extends EventObject { /** * Creates a {@link RequestRedirectEvent} instance. * - * @param crawlCandidate the current crawl candidate + * @param crawlCandidate the current crawl candidate * @param redirectedCrawlRequest the crawl request for the redirected URL */ - public RequestRedirectEvent(final CrawlCandidate crawlCandidate, final CrawlRequest redirectedCrawlRequest) { + public RequestRedirectEvent(final CrawlCandidate crawlCandidate, + final CrawlRequest redirectedCrawlRequest) { super(crawlCandidate); this.redirectedCrawlRequest = redirectedCrawlRequest; diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java index eaafc4a..65ec266 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java +++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2018 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.api.helper; import com.github.peterbencze.serritor.api.event.PageLoadEvent; @@ -56,6 +57,7 @@ private UrlFinder(final UrlFinderBuilder builder) { * Returns a list of validated URLs found in the page's HTML source. * * @param event the PageLoadEvent instance + * * @return the list of found URLs */ public List findUrlsInPage(final PageLoadEvent event) { @@ -85,6 +87,7 @@ public List findUrlsInPage(final PageLoadEvent event) { * Returns a list of validated URLs found in the attribute's value. * * @param attributeValue the value of the attribute + * * @return the list of found URLs */ private List findUrlsInAttributeValue(final String attributeValue) { @@ -144,11 +147,11 @@ public UrlFinderBuilder(final List urlPatterns) { } /** - * Sets the locating mechanism used by the finder. Only elements matched - * by the locator will be considered when searching for URLs. + * Sets the locating mechanism used by the finder. Only elements matched by the locator will + * be considered when searching for URLs. 
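These events are delivered through protected callbacks on BaseCrawler (onRequestRedirect is visible earlier in this patch; the sketch assumes onPageLoad and onRequestError follow the same pattern and that the events expose their crawl candidate publicly). A minimal subclass that only logs:

import com.github.peterbencze.serritor.api.BaseCrawler;
import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import com.github.peterbencze.serritor.api.event.PageLoadEvent;
import com.github.peterbencze.serritor.api.event.RequestErrorEvent;
import com.github.peterbencze.serritor.api.event.RequestRedirectEvent;

public final class LoggingCrawler extends BaseCrawler {

    public LoggingCrawler(final CrawlerConfiguration config) {
        super(config);
    }

    @Override
    protected void onPageLoad(final PageLoadEvent event) {
        // Called when the browser successfully loaded an HTML page.
        System.out.println("Loaded: " + event.getCrawlCandidate().getRequestUrl());
    }

    @Override
    protected void onRequestRedirect(final RequestRedirectEvent event) {
        System.out.println("Redirected: " + event.getCrawlCandidate().getRequestUrl());
    }

    @Override
    protected void onRequestError(final RequestErrorEvent event) {
        System.err.println("Request failed: " + event.getCrawlCandidate().getRequestUrl());
    }
}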
+ * + * @param locatingMechanism the By locating mechanism instance * - * @param locatingMechanism the By locating mechanism - * instance * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setLocatingMechanism(final By locatingMechanism) { @@ -156,11 +159,11 @@ public UrlFinderBuilder setLocatingMechanism(final By locatingMechanism) { } /** - * Sets the locating mechanisms used by the finder. Only elements - * matched by the locators will be considered when searching for URLs. + * Sets the locating mechanisms used by the finder. Only elements matched by the locators + * will be considered when searching for URLs. + * + * @param locatingMechanisms the list of By locating mechanism instances * - * @param locatingMechanisms the list of By locating - * mechanism instances * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setLocatingMechanisms(final List locatingMechanisms) { @@ -174,6 +177,7 @@ public UrlFinderBuilder setLocatingMechanisms(final List locatingMechanisms) * Sets the list of attribute names to search for URLs. * * @param attributes the list of attribute names + * * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setAttributes(final List attributes) { @@ -187,6 +191,7 @@ public UrlFinderBuilder setAttributes(final List attributes) { * Sets the attribute name to search for URLs. * * @param attribute the attribute name + * * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setAttribute(final String attribute) { @@ -197,6 +202,7 @@ public UrlFinderBuilder setAttribute(final String attribute) { * Sets a predicate to be used for validating found URLs. * * @param validator the validator predicate + * * @return the UrlFinderBuilder instance */ public UrlFinderBuilder setValidator(final Predicate validator) { @@ -219,8 +225,8 @@ public UrlFinder build() { * The default URL validator function. * * @param url the URL to validate - * @return true if the URL is valid, false - * otherwise + * + * @return true if the URL is valid, false otherwise */ private static boolean isValidUrl(final String url) { try { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java index 7fb007e..3fec9fa 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2018 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.internal; import com.google.common.collect.ImmutableList; @@ -38,10 +39,11 @@ public CrawlDomain(final InternetDomainName domain) { } /** - * Indicates if two CrawlDomain instances are equal. - * Crawl domains with the same domain name are considered equal. + * Indicates if two CrawlDomain instances are equal. Crawl domains with the same + * domain name are considered equal. * * @param obj a CrawlDomain instance + * * @return true if equal, false otherwise */ @Override @@ -59,8 +61,7 @@ public boolean equals(final Object obj) { } /** - * Calculates the hash code from the individual components of the domain - * name. + * Calculates the hash code from the individual components of the domain name. 
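A hedged usage sketch for the UrlFinder builder shown above: it assumes the builder is a nested class of UrlFinder and that the URL patterns are java.util.regex.Pattern instances; the pattern, tag name, and attribute are placeholders.

import com.github.peterbencze.serritor.api.event.PageLoadEvent;
import com.github.peterbencze.serritor.api.helper.UrlFinder;
import com.github.peterbencze.serritor.api.helper.UrlFinder.UrlFinderBuilder;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import org.openqa.selenium.By;

final class UrlFinderSketch {

    // Only anchor elements are inspected, and only "href" values matching the pattern
    // (and passing the URL validator) are returned.
    private static final UrlFinder URL_FINDER =
            new UrlFinderBuilder(Arrays.asList(Pattern.compile("https?://example\\.com/.*")))
                    .setLocatingMechanism(By.tagName("a"))
                    .setAttribute("href")
                    .build();

    private UrlFinderSketch() {
    }

    static List<String> extractLinks(final PageLoadEvent event) {
        return URL_FINDER.findUrlsInPage(event);
    }
}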
* * @return the hash code for the crawl domain */ @@ -73,6 +74,7 @@ public int hashCode() { * Indicates if this crawl domain contains the specific internet domain. * * @param domain an immutable well-formed internet domain name + * * @return true if belongs, false otherwise */ public boolean contains(final InternetDomainName domain) { diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index 0c47bb8..f18207b 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.internal; import com.github.peterbencze.serritor.api.CrawlCandidate; @@ -54,15 +55,10 @@ public final class CrawlFrontier implements Serializable { */ public CrawlFrontier(final CrawlerConfiguration config) { this.config = config; - allowedCrawlDomains = config.getAllowedCrawlDomains(); - urlFingerprints = new HashSet<>(); - - // Construct a priority queue according to the crawl strategy specified in the configuration candidates = createPriorityQueue(); - // Feed initial crawl requests (seeds) config.getCrawlSeeds() .forEach((CrawlRequest request) -> { feedRequest(request, true); @@ -72,13 +68,11 @@ public CrawlFrontier(final CrawlerConfiguration config) { /** * Feeds a crawl request to the frontier. * - * @param request the crawl request + * @param request the crawl request * @param isCrawlSeed indicates if the request is a crawl seed */ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { if (config.isOffsiteRequestFilteringEnabled()) { - // Check if the request's domain is in the allowed crawl domains - boolean inCrawlDomain = false; for (CrawlDomain allowedCrawlDomain : allowedCrawlDomains) { @@ -94,8 +88,6 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { } if (config.isDuplicateRequestFilteringEnabled()) { - // Check if the URL has already been crawled - String urlFingerprint = createFingerprintForUrl(request.getRequestUrl()); if (urlFingerprints.contains(urlFingerprint)) { @@ -111,26 +103,24 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { int crawlDepthLimit = config.getMaximumCrawlDepth(); int nextCrawlDepth = currentCandidate.getCrawlDepth() + 1; - // If a crawl depth limit is set, check if the candidate's crawl depth is less than or equal to the limit if (crawlDepthLimit != 0 && nextCrawlDepth > crawlDepthLimit) { return; } - builder = new CrawlCandidateBuilder(request).setRefererUrl(currentCandidate.getRequestUrl()) + builder = new CrawlCandidateBuilder(request) + .setRefererUrl(currentCandidate.getRequestUrl()) .setCrawlDepth(nextCrawlDepth); } else { builder = new CrawlCandidateBuilder(request); } - // Finally, add constructed candidate to the queue candidates.add(builder.build()); } /** * Indicates if there are any candidates left in the queue. 
* - * @return true if there are candidates in the queue, - * false otherwise + * @return true if there are candidates in the queue, false otherwise */ public boolean hasNextCandidate() { return !candidates.isEmpty(); @@ -147,23 +137,21 @@ public CrawlCandidate getNextCandidate() { } /** - * Creates the fingerprint of the given URL. + * Creates the fingerprint of the given URL. If the URL contains query parameters, it sorts + * them. This way URLs with different order of query parameters get the same fingerprint. * * @param url the URL for which the fingerprint is created + * * @return the fingerprint of the URL */ private static String createFingerprintForUrl(final URI url) { - // We start off with the host only StringBuilder truncatedUrl = new StringBuilder(url.getHost()); - // If there is a path in the URL, we append it after the host String path = url.getPath(); if (path != null && !"/".equals(path)) { truncatedUrl.append(path); } - // If there are any query params, we sort and append them to what we got so far - // This is required in order to detect already crawled URLs with different order of query params String query = url.getQuery(); if (query != null) { truncatedUrl.append("?"); @@ -180,20 +168,29 @@ private static String createFingerprintForUrl(final URI url) { } /** - * Creates a priority queue using the strategy specified in the - * configuration. + * Creates a priority queue using the strategy specified in the configuration. * - * @return the priority queue using the strategy specified in the - * configuration + * @return the priority queue using the strategy specified in the configuration */ + @SuppressWarnings("checkstyle:MissingSwitchDefault") private PriorityQueue createPriorityQueue() { + Function crawlDepthGetter + = (Function & Serializable) CrawlCandidate::getCrawlDepth; + Function priorityGetter + = (Function & Serializable) CrawlCandidate::getPriority; + switch (config.getCrawlStrategy()) { case BREADTH_FIRST: - return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth) - .thenComparing((Function & Serializable) CrawlCandidate::getPriority, Comparator.reverseOrder())); + Comparator breadthFirstComparator = Comparator.comparing(crawlDepthGetter) + .thenComparing(priorityGetter, Comparator.reverseOrder()); + + return new PriorityQueue<>(breadthFirstComparator); case DEPTH_FIRST: - return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth, Comparator.reverseOrder()) - .thenComparing((Function & Serializable) CrawlCandidate::getPriority, Comparator.reverseOrder())); + Comparator depthFirstComparator + = Comparator.comparing(crawlDepthGetter, Comparator.reverseOrder()) + .thenComparing(priorityGetter, Comparator.reverseOrder()); + + return new PriorityQueue<>(depthFirstComparator); } throw new IllegalArgumentException("Unsupported crawl strategy."); diff --git a/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java b/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java index 89d6c33..05e5898 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
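The fingerprinting hunk above only shows part of the normalization. Based on its Javadoc (query parameters are sorted so that differently ordered parameters yield the same fingerprint) and the visible host/path handling, the normalized form could be produced as below. The split/sort/join details are an assumption, and the hashing step the real method presumably applies afterwards is omitted.

import java.net.URI;
import java.util.Arrays;

final class UrlFingerprintSketch {

    private UrlFingerprintSketch() {
    }

    // Builds the normalized form used for duplicate detection: host, then the path
    // (unless it is just "/"), then the query parameters in sorted order.
    static String normalizedForm(final URI url) {
        StringBuilder truncatedUrl = new StringBuilder(url.getHost());

        String path = url.getPath();
        if (path != null && !"/".equals(path)) {
            truncatedUrl.append(path);
        }

        String query = url.getQuery();
        if (query != null) {
            truncatedUrl.append("?");

            String[] queryParams = query.split("&");
            Arrays.sort(queryParams);
            truncatedUrl.append(String.join("&", queryParams));
        }

        return truncatedUrl.toString();
    }

    public static void main(final String[] args) {
        // Both variants normalize to "example.com?a=1&b=2".
        System.out.println(normalizedForm(URI.create("http://example.com?b=2&a=1")));
        System.out.println(normalizedForm(URI.create("https://example.com/?a=1&b=2")));
    }
}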
*/ + package com.github.peterbencze.serritor.internal; import com.github.peterbencze.serritor.api.CrawlCandidate; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java index 6d3926d..13e3484 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2018 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,20 +13,25 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.internal.crawldelaymechanism; import com.github.peterbencze.serritor.api.CrawlerConfiguration; import org.openqa.selenium.JavascriptExecutor; /** - * A crawl delay mechanism, in which case the delay corresponds to the page - * loading time, if it is between the specified range, otherwise the minimum or - * maximum duration is used. + * A crawl delay mechanism, in which case the delay corresponds to the page loading time, if it is + * between the specified range, otherwise the minimum or maximum duration is used. * * @author Peter Bencze */ public final class AdaptiveCrawlDelayMechanism implements CrawlDelayMechanism { + private static final String BROWSER_COMPATIBILITY_JS = "return ('performance' in window) && " + + "('timing' in window.performance)"; + private static final String DELAY_CALCULATION_JS = "return performance.timing.loadEventEnd - " + + "performance.timing.navigationStart;"; + private final long minDelayInMillis; private final long maxDelayInMillis; private final JavascriptExecutor jsExecutor; @@ -34,12 +39,13 @@ public final class AdaptiveCrawlDelayMechanism implements CrawlDelayMechanism { /** * Creates an {@link AdaptiveCrawlDelayMechanism} instance. * - * @param config the crawler configuration which specifies the minimum and - * maximum delay - * @param jsExecutor the {@link org.openqa.selenium.WebDriver} instance - * which is capable of executing JavaScript + * @param config the crawler configuration which specifies the minimum and maximum delay + * @param jsExecutor the {@link org.openqa.selenium.WebDriver} instance which is capable of + * executing JavaScript */ - public AdaptiveCrawlDelayMechanism(final CrawlerConfiguration config, final JavascriptExecutor jsExecutor) { + public AdaptiveCrawlDelayMechanism( + final CrawlerConfiguration config, + final JavascriptExecutor jsExecutor) { minDelayInMillis = config.getMinimumCrawlDelayDurationInMillis(); maxDelayInMillis = config.getMaximumCrawlDelayDurationInMillis(); this.jsExecutor = jsExecutor; @@ -48,24 +54,22 @@ public AdaptiveCrawlDelayMechanism(final CrawlerConfiguration config, final Java /** * Checks if the browser supports the Navigation Timing API. * - * @return true if the browser is compatible, - * false otherwise + * @return true if the browser is compatible, false otherwise */ public boolean isBrowserCompatible() { - return (boolean) jsExecutor.executeScript("return ('performance' in window) && ('timing' in window.performance)"); + return (boolean) jsExecutor.executeScript(BROWSER_COMPATIBILITY_JS); } /** - * Calculates the page loading time and returns the delay accordingly, - * between the specified min-max range. 
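The two scripts extracted into constants above can be exercised directly against any JavascriptExecutor-capable driver. A sketch using the HtmlUnit driver from the pom follows; note that HtmlUnit may well report the API as unsupported, which is precisely the case isBrowserCompatible guards against. The URL is a placeholder.

import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;

public final class NavigationTimingSketch {

    public static void main(final String[] args) {
        WebDriver driver = new HtmlUnitDriver(true);
        try {
            driver.get("https://example.com");

            JavascriptExecutor jsExecutor = (JavascriptExecutor) driver;

            // Same scripts the adaptive mechanism uses: first check support, then read
            // how long the last page load took according to the Navigation Timing API.
            boolean supported = (boolean) jsExecutor.executeScript(
                    "return ('performance' in window) && ('timing' in window.performance)");
            if (supported) {
                long loadTimeMillis = (long) jsExecutor.executeScript(
                        "return performance.timing.loadEventEnd - "
                                + "performance.timing.navigationStart;");
                System.out.println("Page load took " + loadTimeMillis + " ms");
            }
        } finally {
            driver.quit();
        }
    }
}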
If the calculated delay is smaller - * than the minimum, it returns the minimum delay. If the calculated delay - * is higher than the maximum, it returns the maximum delay. + * Calculates the page loading time and returns the delay accordingly, between the specified + * min-max range. If the calculated delay is smaller than the minimum, it returns the minimum + * delay. If the calculated delay is higher than the maximum, it returns the maximum delay. * * @return the delay in milliseconds */ @Override public long getDelay() { - long delayInMillis = (long) jsExecutor.executeScript("return performance.timing.loadEventEnd - performance.timing.navigationStart;"); + long delayInMillis = (long) jsExecutor.executeScript(DELAY_CALCULATION_JS); if (delayInMillis < minDelayInMillis) { return minDelayInMillis; diff --git a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java index d788ece..4f1d34d 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2018 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.internal.crawldelaymechanism; /** diff --git a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java index f287e8a..9713f8b 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2018 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,13 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.internal.crawldelaymechanism; import com.github.peterbencze.serritor.api.CrawlerConfiguration; /** - * A crawl delay mechanism, in which case the delay is constant and equals to - * the duration specified in the configuration. + * A crawl delay mechanism, in which case the delay is constant and equals to the duration specified + * in the configuration. * * @author Peter Bencze */ @@ -30,8 +31,7 @@ public final class FixedCrawlDelayMechanism implements CrawlDelayMechanism { /** * Creates a {@link FixedCrawlDelayMechanism} instance. 
* - * @param config the crawler configuration which specifies the fixed delay - * duration + * @param config the crawler configuration which specifies the fixed delay duration */ public FixedCrawlDelayMechanism(final CrawlerConfiguration config) { this.delayInMillis = config.getFixedCrawlDelayDurationInMillis(); diff --git a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java index cd2b035..a457da3 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2018 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,14 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.internal.crawldelaymechanism; import com.github.peterbencze.serritor.api.CrawlerConfiguration; import java.util.concurrent.ThreadLocalRandom; /** - * A crawl delay mechanism in which case the duration is randomized between the - * specified minimum and maximum range. + * A crawl delay mechanism in which case the duration is randomized between the specified minimum + * and maximum range. * * @author Peter Bencze */ @@ -32,8 +33,7 @@ public final class RandomCrawlDelayMechanism implements CrawlDelayMechanism { /** * Creates a {@link RandomCrawlDelayMechanism} instance. * - * @param config the crawler configuration which specifies the minimum and - * maximum delay. + * @param config the crawler configuration which specifies the minimum and maximum delay. */ public RandomCrawlDelayMechanism(final CrawlerConfiguration config) { lowerLimit = config.getMinimumCrawlDelayDurationInMillis(); @@ -41,8 +41,7 @@ public RandomCrawlDelayMechanism(final CrawlerConfiguration config) { } /** - * Returns a random delay between the minimum and maximum range specified in - * the configuration. + * Returns a random delay between the minimum and maximum range specified in the configuration. * * @return the delay in milliseconds */ diff --git a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java index 1758b17..d33da36 100644 --- a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java +++ b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2018 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
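The random mechanism's getDelay body is not visible in this hunk; one way to produce a random delay with ThreadLocalRandom (which the class uses) is sketched below. Treat the +1 upper bound, which makes the range inclusive, as an assumption about the intended behaviour.

import java.util.concurrent.ThreadLocalRandom;

final class RandomDelaySketch {

    private RandomDelaySketch() {
    }

    // Returns a uniformly random delay in the inclusive range [minMillis, maxMillis].
    // ThreadLocalRandom#nextLong treats the upper bound as exclusive, hence the +1.
    static long randomDelay(final long minMillis, final long maxMillis) {
        return ThreadLocalRandom.current().nextLong(minMillis, maxMillis + 1);
    }

    public static void main(final String[] args) {
        System.out.println(randomDelay(1_000L, 10_000L));
    }
}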
*/ + package com.github.peterbencze.serritor.api.helper; import com.github.peterbencze.serritor.api.event.PageLoadEvent; @@ -69,7 +70,9 @@ public void initialize() { Mockito.when(mockedElementWithInvalidDomain.getAttribute(Mockito.eq(ATTRIBUTE))) .thenReturn(URL_WITH_INVALID_DOMAIN); - List elementList = Arrays.asList(mockedElementWithValidUrl, mockedElementWithInvalidUrlFormat, mockedElementWithInvalidDomain); + List elementList + = Arrays.asList(mockedElementWithValidUrl, mockedElementWithInvalidUrlFormat, + mockedElementWithInvalidDomain); Mockito.when(mockedDriver.findElements(By.tagName(TAG_NAME))) .thenReturn(elementList); diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java index 81d390d..4bdb829 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2018 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.internal; import com.google.common.net.InternetDomainName; @@ -37,13 +38,8 @@ public final class CrawlDomainTest { @Test public void testEquals() { - // A crawl domain should be equal with itself Assert.assertEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_0); - - // Crawl domains with the same domain should be equal Assert.assertEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_1); - - // Crawl domains with different domains should not be equal Assert.assertNotEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_2); } @@ -54,13 +50,8 @@ public void testHashCode() { @Test public void testContains() { - // A crawl domain should contain its own domain Assert.assertTrue(CRAWL_DOMAIN_0.contains(DOMAIN)); - - // A crawl domain should contain its own domain's subdomain Assert.assertTrue(CRAWL_DOMAIN_0.contains(SUBDOMAIN)); - - // A crawl domain should not contain a domain different from its own domain Assert.assertFalse(CRAWL_DOMAIN_2.contains(DOMAIN)); } } diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java index a477d51..6ddf172 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package com.github.peterbencze.serritor.internal; import com.github.peterbencze.serritor.api.CrawlCandidate; @@ -40,11 +41,14 @@ public final class CrawlFrontierTest { // Allowed crawl domains private static final String ALLOWED_CRAWL_DOMAIN_0 = "root-url-0.com"; private static final String ALLOWED_CRAWL_DOMAIN_1 = "root-url-1.com"; - private static final List ALLOWED_CRAWL_DOMAINS = Arrays.asList(ALLOWED_CRAWL_DOMAIN_0, ALLOWED_CRAWL_DOMAIN_1); + private static final List ALLOWED_CRAWL_DOMAINS + = Arrays.asList(ALLOWED_CRAWL_DOMAIN_0, ALLOWED_CRAWL_DOMAIN_1); // Root URLs - private static final URI ROOT_URL_0 = URI.create("http://root-url-0.com?param1=foo¶m2=bar#fragment"); - private static final URI DUPLICATE_ROOT_URL_0 = URI.create("https://root-url-0.com?param2=bar¶m1=foo"); + private static final URI ROOT_URL_0 + = URI.create("http://root-url-0.com?param1=foo¶m2=bar#fragment"); + private static final URI DUPLICATE_ROOT_URL_0 + = URI.create("https://root-url-0.com?param2=bar¶m1=foo"); private static final URI ROOT_URL_1 = URI.create("http://root-url-1.com"); // Root URL crawl depth @@ -55,18 +59,25 @@ public final class CrawlFrontierTest { private static final int ROOT_URL_1_PRIORITY = 1; // Root URL crawl requests - private static final CrawlRequest ROOT_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_0).setPriority(ROOT_URL_0_PRIORITY).build(); - private static final CrawlRequest DUPLICATE_ROOT_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(DUPLICATE_ROOT_URL_0).build(); - private static final CrawlRequest ROOT_URL_1_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_1).setPriority(ROOT_URL_1_PRIORITY).build(); - private static final List CRAWL_SEEDS = Arrays.asList(ROOT_URL_0_CRAWL_REQUEST, ROOT_URL_1_CRAWL_REQUEST); + private static final CrawlRequest ROOT_URL_0_CRAWL_REQUEST + = new CrawlRequestBuilder(ROOT_URL_0).setPriority(ROOT_URL_0_PRIORITY).build(); + private static final CrawlRequest DUPLICATE_ROOT_URL_0_CRAWL_REQUEST + = new CrawlRequestBuilder(DUPLICATE_ROOT_URL_0).build(); + private static final CrawlRequest ROOT_URL_1_CRAWL_REQUEST + = new CrawlRequestBuilder(ROOT_URL_1).setPriority(ROOT_URL_1_PRIORITY).build(); + private static final List CRAWL_SEEDS + = Arrays.asList(ROOT_URL_0_CRAWL_REQUEST, ROOT_URL_1_CRAWL_REQUEST); // Child URL path private static final String CHILD_URL_PATH = "/child"; // Child URLs - private static final URI CHILD_URL_0 = URI.create(String.format("http://root-url-0.com%s-0", CHILD_URL_PATH)); - private static final URI CHILD_URL_1 = URI.create(String.format("http://root-url-0.com%s-1", CHILD_URL_PATH)); - private static final URI CHILD_URL_2 = URI.create(String.format("http://root-url-1.com%s-0", CHILD_URL_PATH)); + private static final URI CHILD_URL_0 + = URI.create(String.format("http://root-url-0.com%s-0", CHILD_URL_PATH)); + private static final URI CHILD_URL_1 + = URI.create(String.format("http://root-url-0.com%s-1", CHILD_URL_PATH)); + private static final URI CHILD_URL_2 + = URI.create(String.format("http://root-url-1.com%s-0", CHILD_URL_PATH)); // Child URL crawl depth private static final int CHILD_URL_CRAWL_DEPTH = 1; @@ -76,10 +87,13 @@ public final class CrawlFrontierTest { private static final int CHILD_URL_1_PRIORITY = CHILD_URL_0_PRIORITY; private static final int CHILD_URL_2_PRIORITY = 1; - // Child URL crawl requests - private static final CrawlRequest CHILD_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_0).setPriority(CHILD_URL_0_PRIORITY).build(); - private static final CrawlRequest CHILD_URL_1_CRAWL_REQUEST 
= new CrawlRequestBuilder(CHILD_URL_1).setPriority(CHILD_URL_1_PRIORITY).build(); - private static final CrawlRequest CHILD_URL_2_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_2).setPriority(CHILD_URL_2_PRIORITY).build(); + // Child URL crawl requests + private static final CrawlRequest CHILD_URL_0_CRAWL_REQUEST + = new CrawlRequestBuilder(CHILD_URL_0).setPriority(CHILD_URL_0_PRIORITY).build(); + private static final CrawlRequest CHILD_URL_1_CRAWL_REQUEST + = new CrawlRequestBuilder(CHILD_URL_1).setPriority(CHILD_URL_1_PRIORITY).build(); + private static final CrawlRequest CHILD_URL_2_CRAWL_REQUEST + = new CrawlRequestBuilder(CHILD_URL_2).setPriority(CHILD_URL_2_PRIORITY).build(); // Offsite URL private static final URI OFFSITE_URL = URI.create("http://offsite-url.com"); @@ -88,7 +102,8 @@ public final class CrawlFrontierTest { private static final int OFFSITE_URL_PRIORITY = 0; // Offsite URL crawl request - private static final CrawlRequest OFFSITE_URL_CRAWL_REQUEST = new CrawlRequestBuilder(OFFSITE_URL).setPriority(OFFSITE_URL_PRIORITY).build(); + private static final CrawlRequest OFFSITE_URL_CRAWL_REQUEST + = new CrawlRequestBuilder(OFFSITE_URL).setPriority(OFFSITE_URL_PRIORITY).build(); // Max crawl depth private static final int MAX_CRAWL_DEPTH = 1; @@ -108,38 +123,27 @@ public void initialize() { @Test public void testHasNextCandidateWithCandidatesInQueue() { - // Check if there are any candidates in the queue, the method should return true Assert.assertTrue(frontier.hasNextCandidate()); - // Get the next candidate from the queue frontier.getNextCandidate(); - // Check if there are any candidates in the queue, the method should return true again Assert.assertTrue(frontier.hasNextCandidate()); - // Get the next candidate from the queue frontier.getNextCandidate(); - // Check if there are any candidates in the queue, the method should return false at this point Assert.assertFalse(frontier.hasNextCandidate()); - // Feed child crawl requests frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); - // Check if there are any candidates in the queue, the method should return true Assert.assertTrue(frontier.hasNextCandidate()); - // Get the next candidate from the queue frontier.getNextCandidate(); - // Check if there are any candidates in the queue, the method should return true once again Assert.assertTrue(frontier.hasNextCandidate()); - // Get the next candidate from the queue frontier.getNextCandidate(); - // Finally, check if there are any candidates in the queue, the method should return false at this point Assert.assertFalse(frontier.hasNextCandidate()); } @@ -148,10 +152,9 @@ public void testHasNextCandidateWithEmptyQueue() { Mockito.when(config.getCrawlSeeds()) .thenReturn(Collections.EMPTY_SET); - // Create frontier without any crawl seeds + // Create crawl frontier without crawl seeds frontier = new CrawlFrontier(config); - // Check if there are any candidates in the queue, the method should return false Assert.assertFalse(frontier.hasNextCandidate()); } @@ -159,10 +162,8 @@ public void testHasNextCandidateWithEmptyQueue() { public void testEnabledDuplicateRequestFiltering() { clearCrawlCandidateQueue(); - // Feed a duplicate crawl request frontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, false); - // Check if the candidate was added to the queue, the method should return false Assert.assertFalse(frontier.hasNextCandidate()); } @@ -172,16 +173,11 @@ public void testDisabledDuplicateRequestFiltering() { 
Mockito.when(config.isDuplicateRequestFilteringEnabled()) .thenReturn(false); - // Clear the crawl candidate queue of the frontier clearCrawlCandidateQueue(); - // Feed a duplicate crawl request frontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, true); - // Check if the candidates was added to the queue, the method should return true Assert.assertTrue(frontier.hasNextCandidate()); - - // Check if the URLs match Assert.assertEquals(DUPLICATE_ROOT_URL_0, frontier.getNextCandidate().getRequestUrl()); } @@ -189,10 +185,8 @@ public void testDisabledDuplicateRequestFiltering() { public void testEnabledOffsiteRequestFiltering() { clearCrawlCandidateQueue(); - // Feed an offsite request frontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST, false); - // Check if the candidate was added to the queue, the method should return false Assert.assertFalse(frontier.hasNextCandidate()); } @@ -202,90 +196,53 @@ public void testDisabledOffsiteRequestFiltering() { Mockito.when(config.isOffsiteRequestFilteringEnabled()) .thenReturn(false); - // Clear the crawl candidate queue of the frontier clearCrawlCandidateQueue(); - // Feed an offsite request frontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST, false); - // Check if the candidates was added to the queue, the method should return true Assert.assertTrue(frontier.hasNextCandidate()); - - // Check if the URLs match - Assert.assertEquals(OFFSITE_URL.toString(), frontier.getNextCandidate().getRequestUrl().toString()); + Assert.assertEquals(OFFSITE_URL.toString(), + frontier.getNextCandidate().getRequestUrl().toString()); } @Test public void testGetNextCandidateUsingBreadthFirstCrawlStrategy() { - // Get the crawl candidate of root URL 1. CrawlCandidate nextCandidate = frontier.getNextCandidate(); - // Check the URL of this candidate, it should be root URL 1. Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl()); - - // Check the crawl depth of this candidate, it should be 0 because it is a root URL. Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - - // Check the priority of this candidate, it should be 1. Assert.assertEquals(ROOT_URL_1_PRIORITY, nextCandidate.getPriority()); - // Feed a child request that come from root URL 1. frontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false); - // Get the crawl candidate of root URL 0. nextCandidate = frontier.getNextCandidate(); - // Check the URL of this candidate, it should be root URL 0. Assert.assertEquals(ROOT_URL_0, nextCandidate.getRequestUrl()); - - // Check the crawl depth of this candidate, it should be 0 again because it is also a root URL. Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - - // Check the priority of this candidate, it should be 0. Assert.assertEquals(ROOT_URL_0_PRIORITY, nextCandidate.getPriority()); - // Feed 2 child requests that come from root URL 0. frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); - // Get the crawl candidate of child URL 2. nextCandidate = frontier.getNextCandidate(); - // Check the URL of this candidate, it should be child URL 2. Assert.assertEquals(CHILD_URL_2.toString(), nextCandidate.getRequestUrl().toString()); - - // Check the crawl depth of this candidate, it should be 1 because it is a child URL that comes from root URL 1. Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - - // Check the priority of this candidate, it should be 1. 
Assert.assertEquals(CHILD_URL_2_PRIORITY, nextCandidate.getPriority()); - // Get the crawl candidate of a child URL. - // Note: a priority queue does not ensure FIFO order when elements have the same depth and priority + // a priority queue doesn't ensure FIFO order when elements have the same depth and priority nextCandidate = frontier.getNextCandidate(); - // Check the URL of this request, it should be a child URL. Assert.assertTrue(nextCandidate.getRequestUrl().toString().contains(CHILD_URL_PATH)); - - // Check the crawl depth of this candidate, it should be 1 again because it is a child URL that comes from root URL 0. Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - // Get the priority of this candidate int previousChildCandidatePriority = nextCandidate.getPriority(); - // Get the crawl candidate of the next child URL. nextCandidate = frontier.getNextCandidate(); - // Check the URL of this candidate, it should be a child URL. Assert.assertTrue(nextCandidate.getRequestUrl().toString().contains(CHILD_URL_PATH)); - - // Check the crawl depth of this candidate, it should be 1 again becaise it is another child URL that also comes from root URL 0. Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - - // Compare the priority of this candidate to the previous candidate's priority. Assert.assertEquals(previousChildCandidatePriority, nextCandidate.getPriority()); - - // There should be no more candidates left at this point. Assert.assertFalse(frontier.hasNextCandidate()); } @@ -297,75 +254,41 @@ public void testGetNextCandidateUsingDepthFirstCrawlStrategy() { // Create frontier with depth-first crawl strategy frontier = new CrawlFrontier(config); - // Get the crawl candidate of root URL 1 CrawlCandidate nextCandidate = frontier.getNextCandidate(); - // Check the URL of this candidate, it should be root URL 1 Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl()); - - // Check the crawl depth of this candidate, it should be 0 because it is a root URL Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - - // Check the priority of this candidate, it should be 1 Assert.assertEquals(ROOT_URL_1_PRIORITY, nextCandidate.getPriority()); - // Feed a child request that comes from root URL 1 frontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false); - // Get the crawl candidate of a child URL - // Note: a priority queue does not ensure FIFO order when elements have the same depth and priority + // a priority queue doesn't ensure FIFO order when elements have the same depth and priority nextCandidate = frontier.getNextCandidate(); - // Check the URL of this candidate, it should be a child URL Assert.assertTrue(nextCandidate.getRequestUrl().toString().contains(CHILD_URL_PATH)); - - // Check the crawl depth of this candidate, it should be 1 because it is a child URL that comes from root URL 1 Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - - // Check the priority of this candidate, it should be 1 Assert.assertEquals(CHILD_URL_2_PRIORITY, nextCandidate.getPriority()); - // Get the crawl candidate of root URL 0. 
nextCandidate = frontier.getNextCandidate(); - // Check the URL of this candidate, it should be root URL 0 Assert.assertEquals(ROOT_URL_0, nextCandidate.getRequestUrl()); - - // Check the crawl depth of this candidate, it should be 0 again because it is also a root URL Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - - // Check the priority of this candidate, it should be 0 Assert.assertEquals(ROOT_URL_0_PRIORITY, nextCandidate.getPriority()); - // Feed 2 child requests that come from root URL 0 frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); - // Get the crawl candidate of child URL 0 nextCandidate = frontier.getNextCandidate(); - // Check the URL of this candidate, it should be child URL 0 Assert.assertEquals(CHILD_URL_0.toString(), nextCandidate.getRequestUrl().toString()); - - // Check the crawl depth of this candidate, it should be 1 again because it is a child URL that comes from root URL 0 Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - - // Check the priority of this candidate, it should be 0 Assert.assertEquals(CHILD_URL_0_PRIORITY, nextCandidate.getPriority()); - // Get the crawl candidate of child URL 1 nextCandidate = frontier.getNextCandidate(); - // Check the URL of this candidate, it should be child URL 1 Assert.assertEquals(CHILD_URL_1.toString(), nextCandidate.getRequestUrl().toString()); - - // Check the crawl depth of this candidate, it should be 1 again becaise it is a child URL that also comes from root URL 0 Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth()); - - // Check the priority of this candidate, it should be 0 Assert.assertEquals(CHILD_URL_1_PRIORITY, nextCandidate.getPriority()); - - // There should be no more candidates left at this point Assert.assertFalse(frontier.hasNextCandidate()); } @@ -374,27 +297,20 @@ public void testCrawlDepthLimitation() { Mockito.when(config.getMaximumCrawlDepth()) .thenReturn(MAX_CRAWL_DEPTH); - // Clear the crawl candidate queue of the frontier clearCrawlCandidateQueue(); - // Feed a child request, its crawl depth will be 1 frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false); - // Get the crawl candidate of the previously added child URL CrawlCandidate nextCandidate = frontier.getNextCandidate(); - // Check its crawl depth, it should be less than or equal to the limit Assert.assertTrue(nextCandidate.getCrawlDepth() <= MAX_CRAWL_DEPTH); - // Feed another child request, its crawl depth will be 2 which is above the limit frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false); - // There should be no more candidates at this point Assert.assertFalse(frontier.hasNextCandidate()); } private void clearCrawlCandidateQueue() { - // Loop until there are no remaining candidates in the queue while (frontier.hasNextCandidate()) { frontier.getNextCandidate(); } diff --git a/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java index 98340d3..166df00 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2018 Peter Bencze. 
* * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package com.github.peterbencze.serritor.internal.crawldelaymechanism; import com.github.peterbencze.serritor.api.CrawlerConfiguration; @@ -55,31 +56,27 @@ public void initialize() { @Test public void testDelayLowerThanMinimum() { - // Return a delay which is lower than the predefined minimum Mockito.when(mockedJsExecutor.executeScript(Mockito.anyString())) .thenReturn(LOWER_DELAY_DURATION_IN_MILLIS); - // The minimum delay should be returned - Assert.assertEquals(mockedConfig.getMinimumCrawlDelayDurationInMillis(), crawlDelayMechanism.getDelay()); + Assert.assertEquals(mockedConfig.getMinimumCrawlDelayDurationInMillis(), + crawlDelayMechanism.getDelay()); } @Test public void testDelayHigherThanMaximum() { - // Return a delay which is higher than the predefined maximum Mockito.when(mockedJsExecutor.executeScript(Mockito.anyString())) .thenReturn(HIGHER_DELAY_DURATION_IN_MILLIS); - // The maximum delay should be returned - Assert.assertEquals(mockedConfig.getMaximumCrawlDelayDurationInMillis(), crawlDelayMechanism.getDelay()); + Assert.assertEquals(mockedConfig.getMaximumCrawlDelayDurationInMillis(), + crawlDelayMechanism.getDelay()); } @Test public void testDelayBetweenRange() { - // Return an in range delay Mockito.when(mockedJsExecutor.executeScript(Mockito.anyString())) .thenReturn(IN_RANGE_DELAY_DURATION_IN_MILLIS); - // The in range delay should be returned Assert.assertEquals(IN_RANGE_DELAY_DURATION_IN_MILLIS, crawlDelayMechanism.getDelay()); } } diff --git a/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java index d0a96ce..535f5f4 100644 --- a/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java +++ b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java @@ -1,4 +1,4 @@ -/* +/* * Copyright 2018 Peter Bencze. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package com.github.peterbencze.serritor.internal.crawldelaymechanism; import com.github.peterbencze.serritor.api.CrawlerConfiguration; @@ -40,7 +41,7 @@ public void initialize() { @Test public void testGetDelay() { - // The delay should be the same as in the configuration - Assert.assertEquals(config.getFixedCrawlDelayDurationInMillis(), crawlDelayMechanism.getDelay()); + Assert.assertEquals(config.getFixedCrawlDelayDurationInMillis(), + crawlDelayMechanism.getDelay()); } } From 24e76a09a2c728699b367309d9852852f7d11b31 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Tue, 12 Jun 2018 23:44:15 +0200 Subject: [PATCH 18/28] Publish Javadoc to GitHub Pages --- pom.xml | 55 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/pom.xml b/pom.xml index d57489c..8b9e42d 100644 --- a/pom.xml +++ b/pom.xml @@ -5,18 +5,18 @@ serritor 1.4.0 jar - + Serritor An open source web crawler framework built upon Selenium and written in Java https://github.com/peterbencze/serritor - + Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0 - + Peter Bencze @@ -26,13 +26,13 @@ - + scm:git:git://github.com/peterbencze/serritor.git scm:git:https://github.com/peterbencze/serritor.git https://github.com/peterbencze/serritor/tree/master - + ossrh @@ -43,13 +43,13 @@ https://oss.sonatype.org/service/local/staging/deploy/maven2/ - + UTF-8 1.8 1.8 - + org.seleniumhq.selenium @@ -79,7 +79,7 @@ test - + @@ -106,6 +106,12 @@ jar + + + javadoc + + site + @@ -156,6 +162,37 @@ true + + org.apache.maven.plugins + maven-site-plugin + 3.7.1 + + true + true + + + + org.apache.maven.plugins + maven-scm-publish-plugin + 3.0.0 + + github + ${project.scm.developerConnection} + gh-pages + Update Javadoc via Maven + ${project.reporting.outputDirectory}/apidocs + UTF-8 + true + + + + + publish-scm + + site + + + - \ No newline at end of file + From ce1ad97987fecefe004773142a4d7990270c3d51 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Wed, 13 Jun 2018 01:04:16 +0200 Subject: [PATCH 19/28] Fix fingerprint creation for URL with single slash path --- .../com/github/peterbencze/serritor/internal/CrawlFrontier.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index f18207b..c2c337c 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -148,7 +148,7 @@ private static String createFingerprintForUrl(final URI url) { StringBuilder truncatedUrl = new StringBuilder(url.getHost()); String path = url.getPath(); - if (path != null && !"/".equals(path)) { + if (path != null) { truncatedUrl.append(path); } From 0ef0eab7b09c2507639eb4f8a902517f9e7f5dd5 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Wed, 13 Jun 2018 23:31:03 +0200 Subject: [PATCH 20/28] Add logging --- .../peterbencze/serritor/api/BaseCrawler.java | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index e200a84..1ac57e4 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -33,6 +33,8 @@ import java.net.URI; import java.util.List; import 
java.util.concurrent.TimeUnit; +import java.util.logging.Level; +import java.util.logging.Logger; import org.apache.commons.lang3.SerializationUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.Validate; @@ -58,6 +60,8 @@ */ public abstract class BaseCrawler { + private static final Logger LOGGER = Logger.getLogger(BaseCrawler.class.getName()); + private final CrawlerConfiguration config; private boolean isStopped; @@ -382,6 +386,7 @@ private void performDelay() { * Callback which gets called when the crawler is started. */ protected void onStart() { + LOGGER.info("onStart"); } /** @@ -390,6 +395,7 @@ protected void onStart() { * @param event the PageLoadEvent instance */ protected void onPageLoad(final PageLoadEvent event) { + LOGGER.log(Level.INFO, "onPageLoad: {0}", event.getCrawlCandidate().getRequestUrl()); } /** @@ -398,6 +404,7 @@ protected void onPageLoad(final PageLoadEvent event) { * @param event the NonHtmlContentEvent instance */ protected void onNonHtmlContent(final NonHtmlContentEvent event) { + LOGGER.log(Level.INFO, "onNonHtmlContent: {0}", event.getCrawlCandidate().getRequestUrl()); } /** @@ -406,6 +413,7 @@ protected void onNonHtmlContent(final NonHtmlContentEvent event) { * @param event the RequestErrorEvent instance */ protected void onRequestError(final RequestErrorEvent event) { + LOGGER.log(Level.INFO, "onRequestError: {0}", event.getCrawlCandidate().getRequestUrl()); } /** @@ -414,6 +422,11 @@ protected void onRequestError(final RequestErrorEvent event) { * @param event the RequestRedirectEvent instance */ protected void onRequestRedirect(final RequestRedirectEvent event) { + LOGGER.log(Level.INFO, "onRequestRedirect: {0} -> {1}", + new Object[]{ + event.getCrawlCandidate().getRequestUrl(), + event.getRedirectedCrawlRequest().getRequestUrl() + }); } /** @@ -423,11 +436,13 @@ protected void onRequestRedirect(final RequestRedirectEvent event) { * @param event the PageLoadTimeoutEvent instance */ protected void onPageLoadTimeout(final PageLoadTimeoutEvent event) { + LOGGER.log(Level.INFO, "onPageLoadTimeout: {0}", event.getCrawlCandidate().getRequestUrl()); } /** * Callback which gets called when the crawler is stopped. */ protected void onStop() { + LOGGER.info("onStop"); } } From 1560599948116e528ad0570cc57f10423657273b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Bencze?= Date: Wed, 13 Jun 2018 23:55:09 +0200 Subject: [PATCH 21/28] Update README --- README.md | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 1bf7e74..0cc726d 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,37 @@ Serritor ======== -Serritor is an open source web crawler framework built upon [Selenium](http://www.seleniumhq.org/) and written in Java. Crawling dynamic web pages is no longer a problem! +Serritor is an open source web crawler framework built upon [Selenium](http://www.seleniumhq.org/) and written in Java. It can be used to crawl dynamic web pages that use JavaScript. 
-## Installation -### Using Maven +## Using Serritor in your build +### Maven Add the following dependency to your pom.xml: ```xml com.github.peterbencze serritor - 1.3.1 + 1.4.0 ``` -### Without Maven +### Gradle + +Add the following dependency to your build.gradle: +```groovy +compile group: 'com.github.peterbencze', name: 'serritor', version: '1.4.0' +``` + +### Manual dependencies The standalone JAR files are available on the [releases](https://github.com/peterbencze/serritor/releases) page. ## Documentation -See the [Wiki](https://github.com/peterbencze/serritor/wiki) page. +* The [Wiki](https://github.com/peterbencze/serritor/wiki) contains usage information and examples +* The Javadoc is available [here](https://peterbencze.github.io/serritor/) ## Quickstart -_BaseCrawler_ provides a skeletal implementation of a crawler to minimize the effort to create your own. First, create a class that extends _BaseCrawler_. In this class, you can implement the behavior of your crawler. There are callbacks available for every stage of crawling. Below you can find an example: +The _BaseCrawler_ abstract class provides a skeletal implementation of a crawler to minimize the effort to create your own. The extending class should define the logic of the crawler. Below you can find a simple example that is enough to get you started: ```java public class MyCrawler extends BaseCrawler { @@ -37,31 +45,24 @@ public class MyCrawler extends BaseCrawler { } @Override - protected void onResponseComplete(final HtmlResponse response) { + protected void onPageLoad(final PageLoadEvent event) { // Crawl every URL that match the given pattern - urlFinder.findUrlsInResponse(response) + urlFinder.findUrlsInPage(event) .stream() .map(CrawlRequestBuilder::new) .map(CrawlRequestBuilder::build) .forEach(this::crawl); - } - - @Override - protected void onNonHtmlResponse(final NonHtmlResponse response) { - System.out.println("Received a non-HTML response from: " + response.getCrawlRequest().getRequestUrl()); - } - - @Override - protected void onUnsuccessfulRequest(final UnsuccessfulRequest request) { - System.out.println("Could not get response from: " + request.getCrawlRequest().getRequestUrl()); + + // ... 
} } ``` By default, the crawler uses [HtmlUnit headless browser](http://htmlunit.sourceforge.net/): ```java -public static void main(String[] args) { +public static void main(final String[] args) { // Create the configuration - CrawlerConfiguration config = new CrawlerConfigurationBuilder().setOffsiteRequestFiltering(true) + CrawlerConfiguration config = new CrawlerConfigurationBuilder() + .setOffsiteRequestFiltering(true) .addAllowedCrawlDomain("example.com") .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build()) .build(); @@ -73,11 +74,12 @@ public static void main(String[] args) { crawler.start(); } ``` -Of course, you can also use any other browsers by specifying a corresponding _WebDriver_ instance: +Of course, you can also use any other browsers by specifying a corresponding `WebDriver` instance: ```java -public static void main(String[] args) { +public static void main(final String[] args) { // Create the configuration - CrawlerConfiguration config = new CrawlerConfigurationBuilder().setOffsiteRequestFiltering(true) + CrawlerConfiguration config = new CrawlerConfigurationBuilder() + .setOffsiteRequestFiltering(true) .addAllowedCrawlDomain("example.com") .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build()) .build(); @@ -90,7 +92,7 @@ public static void main(String[] args) { } ``` -That's it! In just a few lines you can make a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the _WebDriver_ instance, so you can use all the features that are provided by Selenium. +That's it! In just a few lines you can create a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the `WebDriver` instance, so you can use all the features that are provided by Selenium. ## License The source code of Serritor is made available under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). From 23a18129ce7f579ac02010bd0788191ab068cd4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Bencze?= Date: Sat, 16 Jun 2018 18:12:24 +0200 Subject: [PATCH 22/28] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0cc726d..a3b4f4d 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ The standalone JAR files are available on the [releases](https://github.com/pete * The Javadoc is available [here](https://peterbencze.github.io/serritor/) ## Quickstart -The _BaseCrawler_ abstract class provides a skeletal implementation of a crawler to minimize the effort to create your own. The extending class should define the logic of the crawler. Below you can find a simple example that is enough to get you started: +The `BaseCrawler` abstract class provides a skeletal implementation of a crawler to minimize the effort to create your own. The extending class should define the logic of the crawler. 
Below you can find a simple example that is enough to get you started: ```java public class MyCrawler extends BaseCrawler { From 7337ef8b3d6eef3d9f3dd889550a0530bee5ed66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Bencze?= Date: Sun, 17 Jun 2018 00:48:57 +0200 Subject: [PATCH 23/28] Update README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a3b4f4d..b4f25c8 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,9 @@ The standalone JAR files are available on the [releases](https://github.com/pete * The Javadoc is available [here](https://peterbencze.github.io/serritor/) ## Quickstart -The `BaseCrawler` abstract class provides a skeletal implementation of a crawler to minimize the effort to create your own. The extending class should define the logic of the crawler. Below you can find a simple example that is enough to get you started: +The `BaseCrawler` abstract class provides a skeletal implementation of a crawler to minimize the effort to create your own. The extending class should define the logic of the crawler. + +Below you can find a simple example that is enough to get you started: ```java public class MyCrawler extends BaseCrawler { From 6e5d3bc9ec0dbfed0b12efcea71b711881d2735d Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Tue, 19 Jun 2018 22:03:27 +0200 Subject: [PATCH 24/28] Remove unnecessary field --- .../github/peterbencze/serritor/internal/CrawlFrontier.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java index c2c337c..d3fb6e0 100644 --- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java +++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java @@ -40,10 +40,7 @@ public final class CrawlFrontier implements Serializable { private final CrawlerConfiguration config; - - private final Set allowedCrawlDomains; private final Set urlFingerprints; - private final Queue candidates; private CrawlCandidate currentCandidate; @@ -55,7 +52,6 @@ public final class CrawlFrontier implements Serializable { */ public CrawlFrontier(final CrawlerConfiguration config) { this.config = config; - allowedCrawlDomains = config.getAllowedCrawlDomains(); urlFingerprints = new HashSet<>(); candidates = createPriorityQueue(); @@ -75,7 +71,7 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) { if (config.isOffsiteRequestFilteringEnabled()) { boolean inCrawlDomain = false; - for (CrawlDomain allowedCrawlDomain : allowedCrawlDomains) { + for (CrawlDomain allowedCrawlDomain : config.getAllowedCrawlDomains()) { if (allowedCrawlDomain.contains(request.getDomain())) { inCrawlDomain = true; break; From 3a727e10d76988a6307b569c87714b8156c22d7b Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 21 Jun 2018 00:48:45 +0200 Subject: [PATCH 25/28] Fix possible inconsistent state when resuming crawls --- .../peterbencze/serritor/api/BaseCrawler.java | 73 ++++++++++++------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 1ac57e4..76470c3 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -30,7 +30,9 @@ import java.io.IOException; import 
java.io.InputStream; import java.io.OutputStream; +import java.io.Serializable; import java.net.URI; +import java.util.HashMap; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.logging.Level; @@ -62,15 +64,15 @@ public abstract class BaseCrawler { private static final Logger LOGGER = Logger.getLogger(BaseCrawler.class.getName()); - private final CrawlerConfiguration config; - - private boolean isStopped; - private boolean stopCrawling; + private CrawlerConfiguration config; + private CrawlFrontier crawlFrontier; private BasicCookieStore cookieStore; private HttpClient httpClient; private WebDriver webDriver; - private CrawlFrontier crawlFrontier; private CrawlDelayMechanism crawlDelayMechanism; + private boolean isStopped; + private boolean isStopping; + private boolean canSaveState; /** * Base constructor of all crawlers. @@ -82,6 +84,9 @@ protected BaseCrawler(final CrawlerConfiguration config) { // Indicate that the crawler is not running isStopped = true; + + // Cannot save state until the crawler has been started at least once + canSaveState = false; } /** @@ -97,34 +102,38 @@ public final void start() { * @param webDriver the WebDriver instance to control the browser */ public final void start(final WebDriver webDriver) { - start(webDriver, new CrawlFrontier(config)); + start(webDriver, false); } /** - * Initializes and runs the crawler. + * Performs initialization and runs the crawler. * - * @param crawlFrontier the CrawlFrontier instance to be used by the crawler to - * manage crawl requests + * @param isResuming indicates if a previously saved state is to be resumed */ - private void start(final WebDriver webDriver, final CrawlFrontier crawlFrontier) { + private void start(final WebDriver webDriver, final boolean isResuming) { try { - Validate.validState(isStopped, "The crawler is already started."); + Validate.validState(isStopped, "The crawler is already running."); + + this.webDriver = Validate.notNull(webDriver, "The webdriver cannot be null."); + + if (!isResuming) { + cookieStore = new BasicCookieStore(); + crawlFrontier = new CrawlFrontier(config); + } - isStopped = false; - cookieStore = new BasicCookieStore(); httpClient = HttpClientBuilder.create() .setDefaultCookieStore(cookieStore) .build(); - this.webDriver = Validate.notNull(webDriver, "The webdriver cannot be null."); - this.crawlFrontier = crawlFrontier; crawlDelayMechanism = createCrawlDelayMechanism(); + isStopped = false; + canSaveState = true; run(); } finally { // Always close the browser webDriver.quit(); - stopCrawling = false; + isStopping = false; isStopped = true; } } @@ -135,12 +144,15 @@ private void start(final WebDriver webDriver, final CrawlFrontier crawlFrontier) * @param out the output stream */ public final void saveState(final OutputStream out) { - // Check if the crawler has been started at least once, otherwise we have nothing to save - Validate.validState(crawlFrontier != null, - "Cannot save state at this point. The crawler should be started first."); + Validate.validState(canSaveState, + "Cannot save state at this point. 
The crawler should be started at least once."); + + HashMap, Serializable> stateObjects = new HashMap<>(); + stateObjects.put(config.getClass(), config); + stateObjects.put(crawlFrontier.getClass(), crawlFrontier); + stateObjects.put(cookieStore.getClass(), cookieStore); - // Save the crawl frontier's current state - SerializationUtils.serialize(crawlFrontier, out); + SerializationUtils.serialize(stateObjects, out); } /** @@ -160,10 +172,15 @@ public final void resumeState(final InputStream in) { * @param in the input stream from which the state should be loaded */ public final void resumeState(final WebDriver webDriver, final InputStream in) { - // Re-create crawl frontier from the saved state - CrawlFrontier deserializedCrawlFrontier = SerializationUtils.deserialize(in); + HashMap, Serializable> stateObjects + = SerializationUtils.deserialize(in); + + config = (CrawlerConfiguration) stateObjects.get(CrawlerConfiguration.class); + crawlFrontier = (CrawlFrontier) stateObjects.get(CrawlFrontier.class); + cookieStore = (BasicCookieStore) stateObjects.get(BasicCookieStore.class); - start(webDriver, deserializedCrawlFrontier); + // Resume crawling + start(webDriver, true); } /** @@ -171,10 +188,10 @@ public final void resumeState(final WebDriver webDriver, final InputStream in) { */ public final void stop() { Validate.validState(!isStopped, "The crawler is not started."); - Validate.validState(!stopCrawling, "The stop method has already been called."); + Validate.validState(!isStopping, "The stop method has already been called."); // Indicate that the crawling should be stopped - stopCrawling = true; + isStopping = true; } /** @@ -207,7 +224,7 @@ protected final void crawl(final List requests) { private void run() { onStart(); - while (!stopCrawling && crawlFrontier.hasNextCandidate()) { + while (!isStopping && crawlFrontier.hasNextCandidate()) { CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate(); String candidateUrl = currentCandidate.getRequestUrl().toString(); HttpClientContext context = HttpClientContext.create(); @@ -378,7 +395,7 @@ private void performDelay() { TimeUnit.MILLISECONDS.sleep(crawlDelayMechanism.getDelay()); } catch (InterruptedException ex) { Thread.currentThread().interrupt(); - stopCrawling = true; + isStopping = true; } } From a7c6c018e6db851f8e5915536fdd8194bc2b0224 Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Thu, 21 Jun 2018 22:06:50 +0200 Subject: [PATCH 26/28] Fix incorrect event handling --- .../peterbencze/serritor/api/BaseCrawler.java | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 76470c3..75fd471 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -254,19 +254,24 @@ private void run() { // Create a new crawl request for the redirected URL handleRequestRedirect(currentCandidate, responseUrl); } else if (isContentHtml(httpHeadResponse)) { + boolean isTimedOut = false; + try { // Open URL in browser webDriver.get(candidateUrl); } catch (TimeoutException exception) { + isTimedOut = true; onPageLoadTimeout(new PageLoadTimeoutEvent(currentCandidate, exception)); } - String loadedPageUrl = webDriver.getCurrentUrl(); - if (!loadedPageUrl.equals(candidateUrl)) { - // Create a new crawl request for the redirected URL (JavaScript redirect) - 
handleRequestRedirect(currentCandidate, loadedPageUrl); - } else { - onPageLoad(new PageLoadEvent(currentCandidate, webDriver)); + if (!isTimedOut) { + String loadedPageUrl = webDriver.getCurrentUrl(); + if (!loadedPageUrl.equals(candidateUrl)) { + // Create a new crawl request for the redirected URL (JS redirect) + handleRequestRedirect(currentCandidate, loadedPageUrl); + } else { + onPageLoad(new PageLoadEvent(currentCandidate, webDriver)); + } } } else { // URLs that point to non-HTML content should not be opened in the browser From 107c4159a1ad369836713081fcb8278d3b9b12e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Bencze?= Date: Thu, 21 Jun 2018 23:39:03 +0200 Subject: [PATCH 27/28] Update README --- README.md | 52 ++++++++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index b4f25c8..66d4da1 100644 --- a/README.md +++ b/README.md @@ -61,37 +61,33 @@ public class MyCrawler extends BaseCrawler { ``` By default, the crawler uses [HtmlUnit headless browser](http://htmlunit.sourceforge.net/): ```java -public static void main(final String[] args) { - // Create the configuration - CrawlerConfiguration config = new CrawlerConfigurationBuilder() - .setOffsiteRequestFiltering(true) - .addAllowedCrawlDomain("example.com") - .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build()) - .build(); - - // Create the crawler using the configuration above - MyCrawler crawler = new MyCrawler(config); - - // Start it - crawler.start(); -} +// Create the configuration +CrawlerConfiguration config = new CrawlerConfigurationBuilder() + .setOffsiteRequestFiltering(true) + .addAllowedCrawlDomain("example.com") + .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build()) + .build(); + +// Create the crawler using the configuration above +MyCrawler crawler = new MyCrawler(config); + +// Start it +crawler.start(); ``` Of course, you can also use any other browsers by specifying a corresponding `WebDriver` instance: ```java -public static void main(final String[] args) { - // Create the configuration - CrawlerConfiguration config = new CrawlerConfigurationBuilder() - .setOffsiteRequestFiltering(true) - .addAllowedCrawlDomain("example.com") - .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build()) - .build(); - - // Create the crawler using the configuration above - MyCrawler crawler = new MyCrawler(config); - - // Start it - crawler.start(new ChromeDriver()); -} +// Create the configuration +CrawlerConfiguration config = new CrawlerConfigurationBuilder() + .setOffsiteRequestFiltering(true) + .addAllowedCrawlDomain("example.com") + .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build()) + .build(); + +// Create the crawler using the configuration above +MyCrawler crawler = new MyCrawler(config); + +// Start it +crawler.start(new ChromeDriver()); ``` That's it! In just a few lines you can create a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the `WebDriver` instance, so you can use all the features that are provided by Selenium. 
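The event-handling fix in patch 26 means a page-load timeout now fires only the dedicated callback; onPageLoad and the JavaScript-redirect handling are skipped for that candidate. For illustration, a minimal sketch of a crawler subclass that reacts to these events, using only the callback signatures shown in patch 20 (the class name is illustrative, and the Serritor imports are omitted as in the README examples):

```java
public class TimeoutAwareCrawler extends BaseCrawler {

    public TimeoutAwareCrawler(final CrawlerConfiguration config) {
        super(config);
    }

    @Override
    protected void onPageLoadTimeout(final PageLoadTimeoutEvent event) {
        // After patch 26, this is the only callback invoked when the browser times out on a page.
        System.err.println("Page load timed out: " + event.getCrawlCandidate().getRequestUrl());
    }

    @Override
    protected void onRequestRedirect(final RequestRedirectEvent event) {
        // Fired for both HTTP and JavaScript redirects; the event also carries the crawl
        // request that was created for the redirected URL.
        System.out.println(event.getCrawlCandidate().getRequestUrl()
                + " -> " + event.getRedirectedCrawlRequest().getRequestUrl());
    }
}
```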
From 2264bdd4444b595d07b281e18ad3d8796f59d9ab Mon Sep 17 00:00:00 2001 From: Peter Bencze Date: Sat, 23 Jun 2018 15:40:08 +0200 Subject: [PATCH 28/28] Change exception message --- .../java/com/github/peterbencze/serritor/api/BaseCrawler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java index 75fd471..75bb6d2 100644 --- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java +++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java @@ -188,7 +188,7 @@ public final void resumeState(final WebDriver webDriver, final InputStream in) { */ public final void stop() { Validate.validState(!isStopped, "The crawler is not started."); - Validate.validState(!isStopping, "The stop method has already been called."); + Validate.validState(!isStopping, "The crawler is already stopping."); // Indicate that the crawling should be stopped isStopping = true;
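Taken together, patches 25–28 persist the crawler's state as a single serialized map holding the configuration, the crawl frontier, and the cookie store, guarded by the canSaveState flag. Below is a minimal sketch of how the save/resume methods shown above might be used; the file name and try-with-resources handling are illustrative, MyCrawler is the crawler class from the README example, and the Serritor imports are omitted as in the README:

```java
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

public final class SaveAndResumeExample {

    public static void main(final String[] args) throws IOException {
        CrawlerConfiguration config = new CrawlerConfigurationBuilder()
                .setOffsiteRequestFiltering(true)
                .addAllowedCrawlDomain("example.com")
                .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build())
                .build();

        MyCrawler crawler = new MyCrawler(config);
        crawler.start();

        // Serialize the configuration, crawl frontier and cookie store once the crawl has run.
        try (OutputStream out = new FileOutputStream("crawl-state.ser")) {
            crawler.saveState(out);
        }

        // Later (possibly in a new process), restore the saved state and continue crawling;
        // a browser can also be supplied via resumeState(webDriver, in).
        try (InputStream in = new FileInputStream("crawl-state.ser")) {
            crawler.resumeState(in);
        }
    }
}
```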