diff --git a/README.md b/README.md
index 1bf7e74..66d4da1 100644
--- a/README.md
+++ b/README.md
@@ -1,29 +1,39 @@
Serritor
========
-Serritor is an open source web crawler framework built upon [Selenium](http://www.seleniumhq.org/) and written in Java. Crawling dynamic web pages is no longer a problem!
+Serritor is an open source web crawler framework built upon [Selenium](http://www.seleniumhq.org/) and written in Java. It can be used to crawl dynamic web pages that use JavaScript.
-## Installation
-### Using Maven
+## Using Serritor in your build
+### Maven
Add the following dependency to your pom.xml:
```xml
<dependency>
    <groupId>com.github.peterbencze</groupId>
    <artifactId>serritor</artifactId>
-    <version>1.3.1</version>
+    <version>1.4.0</version>
</dependency>
```
-### Without Maven
+### Gradle
+
+Add the following dependency to your build.gradle:
+```groovy
+compile group: 'com.github.peterbencze', name: 'serritor', version: '1.4.0'
+```
+
+### Manual dependencies
The standalone JAR files are available on the [releases](https://github.com/peterbencze/serritor/releases) page.
## Documentation
-See the [Wiki](https://github.com/peterbencze/serritor/wiki) page.
+* The [Wiki](https://github.com/peterbencze/serritor/wiki) contains usage information and examples
+* The Javadoc is available [here](https://peterbencze.github.io/serritor/)
## Quickstart
-_BaseCrawler_ provides a skeletal implementation of a crawler to minimize the effort to create your own. First, create a class that extends _BaseCrawler_. In this class, you can implement the behavior of your crawler. There are callbacks available for every stage of crawling. Below you can find an example:
+The `BaseCrawler` abstract class provides a skeletal implementation of a crawler to minimize the effort to create your own. The extending class should define the logic of the crawler.
+
+Below you can find a simple example that is enough to get you started:
```java
public class MyCrawler extends BaseCrawler {
@@ -37,60 +47,50 @@ public class MyCrawler extends BaseCrawler {
}
@Override
- protected void onResponseComplete(final HtmlResponse response) {
+ protected void onPageLoad(final PageLoadEvent event) {
        // Crawl every URL that matches the given pattern
- urlFinder.findUrlsInResponse(response)
+ urlFinder.findUrlsInPage(event)
.stream()
.map(CrawlRequestBuilder::new)
.map(CrawlRequestBuilder::build)
.forEach(this::crawl);
- }
-
- @Override
- protected void onNonHtmlResponse(final NonHtmlResponse response) {
- System.out.println("Received a non-HTML response from: " + response.getCrawlRequest().getRequestUrl());
- }
-
- @Override
- protected void onUnsuccessfulRequest(final UnsuccessfulRequest request) {
- System.out.println("Could not get response from: " + request.getCrawlRequest().getRequestUrl());
+
+ // ...
}
}
```
By default, the crawler uses [HtmlUnit headless browser](http://htmlunit.sourceforge.net/):
```java
-public static void main(String[] args) {
- // Create the configuration
- CrawlerConfiguration config = new CrawlerConfigurationBuilder().setOffsiteRequestFiltering(true)
- .addAllowedCrawlDomain("example.com")
- .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build())
- .build();
-
- // Create the crawler using the configuration above
- MyCrawler crawler = new MyCrawler(config);
-
- // Start it
- crawler.start();
-}
+// Create the configuration
+CrawlerConfiguration config = new CrawlerConfigurationBuilder()
+ .setOffsiteRequestFiltering(true)
+ .addAllowedCrawlDomain("example.com")
+ .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build())
+ .build();
+
+// Create the crawler using the configuration above
+MyCrawler crawler = new MyCrawler(config);
+
+// Start it
+crawler.start();
```
-Of course, you can also use any other browsers by specifying a corresponding _WebDriver_ instance:
+Of course, you can also use any other browser by specifying a corresponding `WebDriver` instance:
```java
-public static void main(String[] args) {
- // Create the configuration
- CrawlerConfiguration config = new CrawlerConfigurationBuilder().setOffsiteRequestFiltering(true)
- .addAllowedCrawlDomain("example.com")
- .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build())
- .build();
-
- // Create the crawler using the configuration above
- MyCrawler crawler = new MyCrawler(config);
-
- // Start it
- crawler.start(new ChromeDriver());
-}
+// Create the configuration
+CrawlerConfiguration config = new CrawlerConfigurationBuilder()
+ .setOffsiteRequestFiltering(true)
+ .addAllowedCrawlDomain("example.com")
+ .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build())
+ .build();
+
+// Create the crawler using the configuration above
+MyCrawler crawler = new MyCrawler(config);
+
+// Start it
+crawler.start(new ChromeDriver());
```
-That's it! In just a few lines you can make a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the _WebDriver_ instance, so you can use all the features that are provided by Selenium.
+That's it! In just a few lines you can create a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the `WebDriver` instance, so you can use all the features that are provided by Selenium.
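
The crawler's state (the configuration, the crawl frontier and the cookie store) can also be saved and resumed later. Below is a minimal sketch of how this could look; the file name and the use of `java.nio.file` are only illustrative:
```java
// Run the crawler, then persist its state (the crawler must have been started at least once)
MyCrawler crawler = new MyCrawler(config);
crawler.start();

try (OutputStream out = Files.newOutputStream(Paths.get("crawler-state.bin"))) {
    crawler.saveState(out);
}

// Later, resume from the saved state (the HtmlUnit headless browser is used by default)
MyCrawler resumedCrawler = new MyCrawler(config);
try (InputStream in = Files.newInputStream(Paths.get("crawler-state.bin"))) {
    resumedCrawler.resumeState(in);
}
```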
## License
The source code of Serritor is made available under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0).
diff --git a/checkstyle.xml b/checkstyle.xml
new file mode 100644
index 0000000..52ef575
--- /dev/null
+++ b/checkstyle.xml
@@ -0,0 +1,255 @@
diff --git a/pom.xml b/pom.xml
index 7467534..8b9e42d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,20 +3,20 @@
4.0.0
com.github.peterbencze
serritor
- 1.3.1
+ 1.4.0
jar
-
+
Serritor
An open source web crawler framework built upon Selenium and written in Java
https://github.com/peterbencze/serritor
-
+
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0
-
+
Peter Bencze
@@ -26,13 +26,13 @@
-
+
scm:git:git://github.com/peterbencze/serritor.git
scm:git:https://github.com/peterbencze/serritor.git
https://github.com/peterbencze/serritor/tree/master
-
+
ossrh
@@ -43,28 +43,28 @@
https://oss.sonatype.org/service/local/staging/deploy/maven2/
-
+
UTF-8
1.8
1.8
-
+
org.seleniumhq.selenium
selenium-java
- 3.11.0
+ 3.12.0
org.seleniumhq.selenium
htmlunit-driver
- 2.29.3
+ 2.31.0
com.google.guava
guava
- 24.1-jre
+ 25.1-jre
junit
@@ -75,11 +75,11 @@
org.mockito
mockito-core
- 2.18.0
+ 2.18.3
test
-
+
@@ -98,7 +98,7 @@
org.apache.maven.plugins
maven-javadoc-plugin
- 2.10.4
+ 3.0.1
attach-javadoc
@@ -106,6 +106,35 @@
jar
+
+
+ javadoc
+
+ site
+
+
+
+
+ org.apache.maven.plugins
+ maven-checkstyle-plugin
+ 3.0.0
+
+
+ com.puppycrawl.tools
+ checkstyle
+ 8.10.1
+
+
+
+ true
+ checkstyle.xml
+
+
+
+
+ check
+
+
@@ -133,6 +162,37 @@
true
+
+ org.apache.maven.plugins
+ maven-site-plugin
+ 3.7.1
+
+ true
+ true
+
+
+
+ org.apache.maven.plugins
+ maven-scm-publish-plugin
+ 3.0.0
+
+ github
+ ${project.scm.developerConnection}
+ gh-pages
+ Update Javadoc via Maven
+ ${project.reporting.outputDirectory}/apidocs
+ UTF-8
+ true
+
+
+
+
+ publish-scm
+
+ site
+
+
+
-
\ No newline at end of file
+
diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java
index a35f72e..75bb6d2 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2017 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,67 +13,80 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package com.github.peterbencze.serritor.api;
import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
-import com.github.peterbencze.serritor.api.HtmlResponse.HtmlResponseBuilder;
-import com.github.peterbencze.serritor.api.NonHtmlResponse.NonHtmlResponseBuilder;
-import com.github.peterbencze.serritor.api.UnsuccessfulRequest.UnsuccessfulRequestBuilder;
-import com.github.peterbencze.serritor.internal.AdaptiveCrawlDelayMechanism;
-import com.github.peterbencze.serritor.internal.CrawlCandidate;
-import com.github.peterbencze.serritor.internal.CrawlDelayMechanism;
+import com.github.peterbencze.serritor.api.event.NonHtmlContentEvent;
+import com.github.peterbencze.serritor.api.event.PageLoadEvent;
+import com.github.peterbencze.serritor.api.event.PageLoadTimeoutEvent;
+import com.github.peterbencze.serritor.api.event.RequestErrorEvent;
+import com.github.peterbencze.serritor.api.event.RequestRedirectEvent;
import com.github.peterbencze.serritor.internal.CrawlFrontier;
-import com.github.peterbencze.serritor.internal.FixedCrawlDelayMechanism;
-import com.github.peterbencze.serritor.internal.RandomCrawlDelayMechanism;
+import com.github.peterbencze.serritor.internal.crawldelaymechanism.AdaptiveCrawlDelayMechanism;
+import com.github.peterbencze.serritor.internal.crawldelaymechanism.CrawlDelayMechanism;
+import com.github.peterbencze.serritor.internal.crawldelaymechanism.FixedCrawlDelayMechanism;
+import com.github.peterbencze.serritor.internal.crawldelaymechanism.RandomCrawlDelayMechanism;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.io.Serializable;
import java.net.URI;
+import java.util.HashMap;
import java.util.List;
import java.util.concurrent.TimeUnit;
+import java.util.logging.Level;
+import java.util.logging.Logger;
import org.apache.commons.lang3.SerializationUtils;
+import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.client.protocol.HttpClientContext;
+import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.impl.cookie.BasicClientCookie;
+import org.openqa.selenium.Cookie;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.TimeoutException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
/**
- * Provides a skeletal implementation of a crawler to minimize the effort for
- * users to implement their own.
+ * Provides a skeletal implementation of a crawler to minimize the effort for users to implement
+ * their own.
*
* @author Peter Bencze
*/
public abstract class BaseCrawler {
- private final CrawlerConfiguration config;
-
- // Indicates if the crawler is currently running or not
- private boolean isStopped;
+ private static final Logger LOGGER = Logger.getLogger(BaseCrawler.class.getName());
- // Indicates if the crawling should be stopped (used for cancelling the loop in the run method)
- private boolean stopCrawling;
-
- // Used for sending HTTP HEAD requests and receiving associate responses
+ private CrawlerConfiguration config;
+ private CrawlFrontier crawlFrontier;
+ private BasicCookieStore cookieStore;
private HttpClient httpClient;
-
private WebDriver webDriver;
-
- private CrawlFrontier crawlFrontier;
-
private CrawlDelayMechanism crawlDelayMechanism;
+ private boolean isStopped;
+ private boolean isStopping;
+ private boolean canSaveState;
+ /**
+ * Base constructor of all crawlers.
+ *
+ * @param config the configuration of the crawler
+ */
protected BaseCrawler(final CrawlerConfiguration config) {
this.config = config;
// Indicate that the crawler is not running
isStopped = true;
+
+        // Cannot save state until the crawler has been started at least once
+ canSaveState = false;
}
/**
@@ -84,77 +97,90 @@ public final void start() {
}
/**
- * Starts the crawler using the browser specified by the
- * WebDriver
instance.
+ * Starts the crawler using the browser specified by the given WebDriver
instance.
*
- * @param driver The WebDriver
instance that will be used by
- * the crawler
+ * @param webDriver the WebDriver
instance to control the browser
*/
- public final void start(final WebDriver driver) {
- start(driver, new CrawlFrontier(config));
+ public final void start(final WebDriver webDriver) {
+ start(webDriver, false);
}
/**
- * Constructs all the necessary objects and runs the crawler.
+ * Performs initialization and runs the crawler.
*
- * @param frontierToUse The CrawlFrontier
instance to be used
- * by the crawler.
+ * @param isResuming indicates if a previously saved state is to be resumed
*/
- private void start(final WebDriver driver, final CrawlFrontier frontierToUse) {
+ private void start(final WebDriver webDriver, final boolean isResuming) {
try {
- Validate.validState(isStopped, "The crawler is already started.");
+ Validate.validState(isStopped, "The crawler is already running.");
- isStopped = false;
- httpClient = HttpClientBuilder.create().build();
- webDriver = Validate.notNull(driver, "The webdriver cannot be null.");
- crawlFrontier = frontierToUse;
+ this.webDriver = Validate.notNull(webDriver, "The webdriver cannot be null.");
+
+ if (!isResuming) {
+ cookieStore = new BasicCookieStore();
+ crawlFrontier = new CrawlFrontier(config);
+ }
+
+ httpClient = HttpClientBuilder.create()
+ .setDefaultCookieStore(cookieStore)
+ .build();
crawlDelayMechanism = createCrawlDelayMechanism();
+ isStopped = false;
+ canSaveState = true;
run();
} finally {
// Always close the browser
webDriver.quit();
- stopCrawling = false;
+ isStopping = false;
isStopped = true;
}
}
/**
- * Saves the current state of the crawler to the specified output stream.
+ * Saves the current state of the crawler to the given output stream.
*
- * @param out The OutputStream
instance to use
+ * @param out the output stream
*/
public final void saveState(final OutputStream out) {
- // Check if the crawler has been started at least once, otherwise we have nothing to save
- Validate.validState(crawlFrontier != null, "Cannot save state at this point. The crawler should be started first.");
+ Validate.validState(canSaveState,
+ "Cannot save state at this point. The crawler should be started at least once.");
- // Save the crawl frontier's current state
- SerializationUtils.serialize(crawlFrontier, out);
+        HashMap<Class<? extends Serializable>, Serializable> stateObjects = new HashMap<>();
+ stateObjects.put(config.getClass(), config);
+ stateObjects.put(crawlFrontier.getClass(), crawlFrontier);
+ stateObjects.put(cookieStore.getClass(), cookieStore);
+
+ SerializationUtils.serialize(stateObjects, out);
}
/**
* Resumes a previously saved state using HtmlUnit headless browser.
*
- * @param in The InputStream
instance to use
+ * @param in the input stream from which the state should be loaded
*/
public final void resumeState(final InputStream in) {
resumeState(new HtmlUnitDriver(true), in);
}
/**
- * Resumes a previously saved state using the browser specified by the
- * WebDriver instance.
+ * Resumes a previously saved state using the browser specified by the given
+ * WebDriver
instance.
*
- * @param driver The WebDriver
instance to be used by the
- * crawler
- * @param in The InputStream
instance to use
+ * @param webDriver the WebDriver
instance to control the browser
+ * @param in the input stream from which the state should be loaded
*/
- public final void resumeState(final WebDriver driver, final InputStream in) {
- // Re-create crawl frontier from the saved state
- CrawlFrontier frontierToUse = SerializationUtils.deserialize(in);
+ public final void resumeState(final WebDriver webDriver, final InputStream in) {
+        HashMap<Class<? extends Serializable>, Serializable> stateObjects
+ = SerializationUtils.deserialize(in);
+
+ config = (CrawlerConfiguration) stateObjects.get(CrawlerConfiguration.class);
+ crawlFrontier = (CrawlFrontier) stateObjects.get(CrawlFrontier.class);
+ cookieStore = (BasicCookieStore) stateObjects.get(BasicCookieStore.class);
- start(driver, frontierToUse);
+ // Resume crawling
+ start(webDriver, true);
}
/**
@@ -162,34 +188,31 @@ public final void resumeState(final WebDriver driver, final InputStream in) {
*/
public final void stop() {
Validate.validState(!isStopped, "The crawler is not started.");
- Validate.validState(!stopCrawling, "The stop method has already been called.");
+ Validate.validState(!isStopping, "The crawler is already stopping.");
// Indicate that the crawling should be stopped
- stopCrawling = true;
+ isStopping = true;
}
/**
- * Passes a crawl request to the crawl frontier. The crawler must be
- * running, otherwise use
- * {@link CrawlerConfiguration.CrawlerConfigurationBuilder#addCrawlSeed(com.github.peterbencze.serritor.api.CrawlRequest)}
- * for adding crawl seeds.
+ * Feeds a crawl request to the crawler. The crawler should be running, otherwise the request
+ * has to be added as a crawl seed instead.
*
- * @param request The CrawlRequest
instance
+ * @param request the crawl request
*/
protected final void crawl(final CrawlRequest request) {
Validate.notNull(request, "The request cannot be null.");
- Validate.validState(!isStopped, "The crawler is not started. Maybe you meant to add this request as a crawl seed?");
+ Validate.validState(!isStopped,
+ "The crawler is not started. Maybe you meant to add this request as a crawl seed?");
crawlFrontier.feedRequest(request, false);
}
/**
- * Passes multiple crawl requests to the crawl frontier. The crawler must be
- * running, otherwise use
- * {@link CrawlerConfiguration.CrawlerConfigurationBuilder#addCrawlSeeds(java.util.List)}
- * for adding crawl seeds.
+ * Feeds multiple crawl requests to the crawler. The crawler should be running, otherwise the
+ * requests have to be added as crawl seeds instead.
*
- * @param requests The list of CrawlRequest
instances
+ * @param requests the list of crawl requests
*/
    protected final void crawl(final List<CrawlRequest> requests) {
requests.forEach(this::crawl);
@@ -199,132 +222,174 @@ protected final void crawl(final List requests) {
* Defines the workflow of the crawler.
*/
private void run() {
- onBegin();
+ onStart();
- while (!stopCrawling && crawlFrontier.hasNextCandidate()) {
- // Get the next crawl candidate from the queue
+ while (!isStopping && crawlFrontier.hasNextCandidate()) {
CrawlCandidate currentCandidate = crawlFrontier.getNextCandidate();
+ String candidateUrl = currentCandidate.getRequestUrl().toString();
+ HttpClientContext context = HttpClientContext.create();
+ HttpResponse httpHeadResponse = null;
+ boolean isUnsuccessfulRequest = false;
- URI currentCandidateUrl = currentCandidate.getCandidateUrl();
- String currentRequestUrlAsString = currentCandidateUrl.toString();
-
- HttpHeadResponse httpHeadResponse;
- URI responseUrl = currentCandidateUrl;
+ // Update the client's cookie store, so it will have the same state as the browser
+ updateClientCookieStore();
try {
- HttpClientContext context = HttpClientContext.create();
-
- // Send an HTTP HEAD request to the current URL to determine its availability and content type
- httpHeadResponse = getHttpHeadResponse(currentCandidateUrl, context);
+ // Send an HTTP HEAD request to determine its availability and content type
+ httpHeadResponse = getHttpHeadResponse(candidateUrl, context);
+ } catch (IOException exception) {
+ onRequestError(new RequestErrorEvent(currentCandidate, exception));
+ isUnsuccessfulRequest = true;
+ }
- // If the request has been redirected, get the final URL
+ if (!isUnsuccessfulRequest) {
+ String responseUrl = candidateUrl;
                List<URI> redirectLocations = context.getRedirectLocations();
if (redirectLocations != null) {
- responseUrl = redirectLocations.get(redirectLocations.size() - 1);
+ // If the request was redirected, get the final URL
+ responseUrl = redirectLocations.get(redirectLocations.size() - 1).toString();
}
- } catch (IOException ex) {
- UnsuccessfulRequest unsuccessfulRequest = new UnsuccessfulRequestBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
- currentCandidate.getCrawlRequest())
- .setException(ex)
- .build();
-
- onUnsuccessfulRequest(unsuccessfulRequest);
- continue;
- }
-
- // If the request has been redirected, a new crawl request should be created for the redirected URL
- if (!responseUrl.toString().equals(currentRequestUrlAsString)) {
- CrawlRequest redirectedCrawlRequest = new CrawlRequestBuilder(responseUrl).setPriority(currentCandidate.getPriority()).build();
- crawlFrontier.feedRequest(redirectedCrawlRequest, false);
- continue;
+ if (!responseUrl.equals(candidateUrl)) {
+ // Create a new crawl request for the redirected URL
+ handleRequestRedirect(currentCandidate, responseUrl);
+ } else if (isContentHtml(httpHeadResponse)) {
+ boolean isTimedOut = false;
+
+ try {
+ // Open URL in browser
+ webDriver.get(candidateUrl);
+ } catch (TimeoutException exception) {
+ isTimedOut = true;
+ onPageLoadTimeout(new PageLoadTimeoutEvent(currentCandidate, exception));
+ }
+
+ if (!isTimedOut) {
+ String loadedPageUrl = webDriver.getCurrentUrl();
+ if (!loadedPageUrl.equals(candidateUrl)) {
+ // Create a new crawl request for the redirected URL (JS redirect)
+ handleRequestRedirect(currentCandidate, loadedPageUrl);
+ } else {
+ onPageLoad(new PageLoadEvent(currentCandidate, webDriver));
+ }
+ }
+ } else {
+ // URLs that point to non-HTML content should not be opened in the browser
+ onNonHtmlContent(new NonHtmlContentEvent(currentCandidate));
+ }
}
- // Check if the content of the response is HTML
- if (isContentHtml(httpHeadResponse)) {
- boolean timedOut = false;
-
- try {
- // Open the URL in the browser
- webDriver.get(currentRequestUrlAsString);
- } catch (TimeoutException ex) {
- timedOut = true;
- }
+ performDelay();
+ }
- HtmlResponse htmlResponse = new HtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
- currentCandidate.getCrawlRequest())
- .setHttpHeadResponse(httpHeadResponse)
- .setWebDriver(webDriver)
- .build();
+ onStop();
+ }
- // Check if the request has timed out
- if (!timedOut) {
- onResponseComplete(htmlResponse);
- } else {
- onResponseTimeout(htmlResponse);
+ /**
+ * Creates the crawl delay mechanism according to the configuration.
+ *
+ * @return the created crawl delay mechanism
+ */
+ @SuppressWarnings("checkstyle:MissingSwitchDefault")
+ private CrawlDelayMechanism createCrawlDelayMechanism() {
+ switch (config.getCrawlDelayStrategy()) {
+ case FIXED:
+ return new FixedCrawlDelayMechanism(config);
+ case RANDOM:
+ return new RandomCrawlDelayMechanism(config);
+ case ADAPTIVE:
+ AdaptiveCrawlDelayMechanism mechanism
+ = new AdaptiveCrawlDelayMechanism(config, (JavascriptExecutor) webDriver);
+ if (!mechanism.isBrowserCompatible()) {
+ throw new UnsupportedOperationException("The Navigation Timing API is not "
+ + "supported by the browser.");
}
- } else {
- // URLs that point to non-HTML content should not be opened in the browser
-
- NonHtmlResponse nonHtmlResponse = new NonHtmlResponseBuilder(currentCandidate.getRefererUrl(), currentCandidate.getCrawlDepth(),
- currentCandidate.getCrawlRequest())
- .setHttpHeadResponse(httpHeadResponse)
- .build();
-
- onNonHtmlResponse(nonHtmlResponse);
- }
- performDelay();
+ return mechanism;
}
- onFinish();
+ throw new IllegalArgumentException("Unsupported crawl delay strategy.");
}
/**
- * Returns a HTTP HEAD response for the given URL.
+ * Sends an HTTP HEAD request to the given URL and returns the response.
+ *
+ * @param destinationUrl the destination URL
*
- * @param destinationUrl The URL to crawl
- * @return The HTTP HEAD response
+ * @return the HTTP HEAD response
+ *
+ * @throws IOException if an error occurs while trying to fulfill the request
*/
- private HttpHeadResponse getHttpHeadResponse(final URI destinationUrl, final HttpClientContext context) throws IOException {
- HttpHead headRequest = new HttpHead(destinationUrl.toString());
- HttpResponse response = httpClient.execute(headRequest, context);
- return new HttpHeadResponse(response);
+ private HttpResponse getHttpHeadResponse(
+ final String destinationUrl,
+ final HttpClientContext context) throws IOException {
+ HttpHead headRequest = new HttpHead(destinationUrl);
+ return httpClient.execute(headRequest, context);
}
/**
- * Indicates if the content of the response is HTML or not.
+ * Indicates if the response's content type is HTML.
+ *
+ * @param httpHeadResponse the HTTP HEAD response
*
- * @param httpHeadResponse The HTTP HEAD response
- * @return true
if the content is HTML, false
- * otherwise
+ * @return true
if the content type is HTML, false
otherwise
*/
- private static boolean isContentHtml(final HttpHeadResponse httpHeadResponse) {
+ private static boolean isContentHtml(final HttpResponse httpHeadResponse) {
Header contentTypeHeader = httpHeadResponse.getFirstHeader("Content-Type");
return contentTypeHeader != null && contentTypeHeader.getValue().contains("text/html");
}
/**
- * Constructs the crawl delay mechanism specified in the configuration.
+ * Creates a crawl request for the redirected URL, feeds it to the crawler and calls the
+ * appropriate event callback.
*
- * @return The crawl delay mechanism
+ * @param currentCrawlCandidate the current crawl candidate
+ * @param redirectedUrl the URL of the redirected request
*/
- private CrawlDelayMechanism createCrawlDelayMechanism() {
- switch (config.getCrawlDelayStrategy()) {
- case FIXED:
- return new FixedCrawlDelayMechanism(config);
- case RANDOM:
- return new RandomCrawlDelayMechanism(config);
- case ADAPTIVE:
- AdaptiveCrawlDelayMechanism adaptiveCrawlDelay = new AdaptiveCrawlDelayMechanism(config, (JavascriptExecutor) webDriver);
- if (!adaptiveCrawlDelay.isBrowserCompatible()) {
- throw new UnsupportedOperationException("The Navigation Timing API is not supported by the browser.");
- }
+ private void handleRequestRedirect(
+ final CrawlCandidate currentCrawlCandidate,
+ final String redirectedUrl) {
+ CrawlRequestBuilder builder = new CrawlRequestBuilder(redirectedUrl)
+ .setPriority(currentCrawlCandidate.getPriority());
+ currentCrawlCandidate.getMetadata().ifPresent(builder::setMetadata);
+ CrawlRequest redirectedRequest = builder.build();
+
+ crawlFrontier.feedRequest(redirectedRequest, false);
+ onRequestRedirect(new RequestRedirectEvent(currentCrawlCandidate, redirectedRequest));
+ }
- return adaptiveCrawlDelay;
+ /**
+ * Adds all the browser cookies for the current domain to the HTTP client's cookie store,
+ * replacing any existing equivalent ones.
+ */
+ private void updateClientCookieStore() {
+ webDriver.manage()
+ .getCookies()
+ .stream()
+ .map(BaseCrawler::convertBrowserCookie)
+ .forEach(cookieStore::addCookie);
+ }
+
+ /**
+     * Converts a browser cookie to an HTTP client one.
+ *
+ * @param browserCookie the browser cookie to be converted
+ *
+ * @return the converted HTTP client cookie
+ */
+ private static BasicClientCookie convertBrowserCookie(final Cookie browserCookie) {
+ BasicClientCookie clientCookie
+ = new BasicClientCookie(browserCookie.getName(), browserCookie.getValue());
+ clientCookie.setDomain(browserCookie.getDomain());
+ clientCookie.setPath(browserCookie.getPath());
+ clientCookie.setExpiryDate(browserCookie.getExpiry());
+ clientCookie.setSecure(browserCookie.isSecure());
+
+ if (browserCookie.isHttpOnly()) {
+ clientCookie.setAttribute("httponly", StringUtils.EMPTY);
}
- throw new IllegalArgumentException("Unsupported crawl delay strategy.");
+ return clientCookie;
}
/**
@@ -335,54 +400,71 @@ private void performDelay() {
TimeUnit.MILLISECONDS.sleep(crawlDelayMechanism.getDelay());
} catch (InterruptedException ex) {
Thread.currentThread().interrupt();
- stopCrawling = true;
+ isStopping = true;
}
}
/**
- * Called when the crawler is about to begin its operation.
+ * Callback which gets called when the crawler is started.
+ */
+ protected void onStart() {
+ LOGGER.info("onStart");
+ }
+
+ /**
+ * Callback which gets called when the browser loads the page.
+ *
+ * @param event the PageLoadEvent
instance
*/
- protected void onBegin() {
+ protected void onPageLoad(final PageLoadEvent event) {
+ LOGGER.log(Level.INFO, "onPageLoad: {0}", event.getCrawlCandidate().getRequestUrl());
}
/**
- * Called after the browser loads the given URL.
+ * Callback which gets called when the content type is not HTML.
*
- * @param response The HTML response
+ * @param event the NonHtmlContentEvent
instance
*/
- protected void onResponseComplete(final HtmlResponse response) {
+ protected void onNonHtmlContent(final NonHtmlContentEvent event) {
+ LOGGER.log(Level.INFO, "onNonHtmlContent: {0}", event.getCrawlCandidate().getRequestUrl());
}
/**
- * Called when the loading of the given URL times out in the browser. Use
- * this callback with caution: the page might be half-loaded or not loaded
- * at all.
+ * Callback which gets called when a request error occurs.
*
- * @param response The HTML response
+ * @param event the RequestErrorEvent
instance
*/
- protected void onResponseTimeout(final HtmlResponse response) {
+ protected void onRequestError(final RequestErrorEvent event) {
+ LOGGER.log(Level.INFO, "onRequestError: {0}", event.getCrawlCandidate().getRequestUrl());
}
/**
- * Called when getting a non-HTML response.
+ * Callback which gets called when a request is redirected.
*
- * @param response The non-HTML response
+ * @param event the RequestRedirectEvent
instance
*/
- protected void onNonHtmlResponse(final NonHtmlResponse response) {
+ protected void onRequestRedirect(final RequestRedirectEvent event) {
+ LOGGER.log(Level.INFO, "onRequestRedirect: {0} -> {1}",
+ new Object[]{
+ event.getCrawlCandidate().getRequestUrl(),
+ event.getRedirectedCrawlRequest().getRequestUrl()
+ });
}
/**
- * Called when an exception occurs while sending an initial HEAD request to
- * the given URL.
+ * Callback which gets called when the page does not load in the browser within the timeout
+ * period.
*
- * @param request The unsuccessful request
+ * @param event the PageLoadTimeoutEvent
instance
*/
- protected void onUnsuccessfulRequest(final UnsuccessfulRequest request) {
+ protected void onPageLoadTimeout(final PageLoadTimeoutEvent event) {
+ LOGGER.log(Level.INFO, "onPageLoadTimeout: {0}", event.getCrawlCandidate().getRequestUrl());
}
/**
- * Called when the crawler successfully finishes its operation.
+ * Callback which gets called when the crawler is stopped.
*/
- protected void onFinish() {
+ protected void onStop() {
+ LOGGER.info("onStop");
}
}
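For illustration, a crawler subclass that overrides the event callbacks defined above might look like the sketch below. Only the callback signatures are taken from this class; the class name and the logging bodies are placeholders:
```java
import com.github.peterbencze.serritor.api.BaseCrawler;
import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import com.github.peterbencze.serritor.api.event.PageLoadEvent;
import com.github.peterbencze.serritor.api.event.PageLoadTimeoutEvent;
import com.github.peterbencze.serritor.api.event.RequestErrorEvent;
import com.github.peterbencze.serritor.api.event.RequestRedirectEvent;

public class LoggingCrawler extends BaseCrawler {

    public LoggingCrawler(final CrawlerConfiguration config) {
        super(config);
    }

    @Override
    protected void onPageLoad(final PageLoadEvent event) {
        // The page has been loaded in the browser; inspect it here
        System.out.println("Loaded: " + event.getCrawlCandidate().getRequestUrl());
    }

    @Override
    protected void onRequestRedirect(final RequestRedirectEvent event) {
        System.out.println("Redirected: " + event.getCrawlCandidate().getRequestUrl()
                + " -> " + event.getRedirectedCrawlRequest().getRequestUrl());
    }

    @Override
    protected void onPageLoadTimeout(final PageLoadTimeoutEvent event) {
        System.out.println("Timed out: " + event.getCrawlCandidate().getRequestUrl());
    }

    @Override
    protected void onRequestError(final RequestErrorEvent event) {
        System.out.println("Request error: " + event.getCrawlCandidate().getRequestUrl());
    }
}
```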
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java
similarity index 57%
rename from src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java
rename to src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java
index b5041b9..9b238d7 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlCandidate.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlCandidate.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2017 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,16 +13,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package com.github.peterbencze.serritor.internal;
-import com.github.peterbencze.serritor.api.CrawlRequest;
+package com.github.peterbencze.serritor.api;
+
import com.google.common.net.InternetDomainName;
import java.io.Serializable;
import java.net.URI;
+import java.util.Optional;
/**
- * Represents a candidate for crawling that will be surely processed by the
- * crawler.
+ * Represents a candidate for crawling.
*
* @author Peter Bencze
*/
@@ -32,66 +32,69 @@ public final class CrawlCandidate implements Serializable {
private final int crawlDepth;
private final CrawlRequest crawlRequest;
- public CrawlCandidate(final CrawlCandidateBuilder builder) {
+ private CrawlCandidate(final CrawlCandidateBuilder builder) {
this.crawlRequest = builder.crawlRequest;
this.refererUrl = builder.refererUrl;
this.crawlDepth = builder.crawlDepth;
}
/**
- * Returns the referer's URL.
+ * Returns the referer URL.
*
- * @return The URL of the referer
+ * @return the URL of the referer
*/
public URI getRefererUrl() {
return refererUrl;
}
/**
- * Returns the candidate's URL.
+ * Returns the request URL.
*
- * @return The URL of the candidate
+ * @return the URL of the request
*/
- public URI getCandidateUrl() {
+ public URI getRequestUrl() {
return crawlRequest.getRequestUrl();
}
/**
- * Returns the domain of the candidate's URL.
+ * Returns the domain of the request URL.
*
- * @return The domain of the candidate URL
+ * @return the domain of the request URL
*/
public InternetDomainName getDomain() {
return crawlRequest.getDomain();
}
/**
- * Returns the crawl depth of the candidate.
+ * Returns the crawl depth of the request.
*
- * @return The crawl depth
+ * @return the crawl depth of the request
*/
public int getCrawlDepth() {
return crawlDepth;
}
/**
- * Returns the priority of the candidate.
+ * Returns the priority of the request.
*
- * @return The priority
+ * @return the priority of the request
*/
public int getPriority() {
return crawlRequest.getPriority();
}
/**
- * Returns the crawl request from which this candidate was constructed.
+ * Returns the metadata associated with the request.
*
- * @return The CrawlRequest
instance
+ * @return the metadata associated with the request
*/
- public CrawlRequest getCrawlRequest() {
- return crawlRequest;
+    public Optional<Serializable> getMetadata() {
+ return crawlRequest.getMetadata();
}
+ /**
+ * Builds {@link CrawlCandidate} instances.
+ */
public static final class CrawlCandidateBuilder {
private final CrawlRequest crawlRequest;
@@ -99,20 +102,44 @@ public static final class CrawlCandidateBuilder {
private URI refererUrl;
private int crawlDepth;
+ /**
+ * Creates a {@link CrawlCandidateBuilder} instance.
+ *
+ * @param request the CrawlRequest
instance from which this candidate is built
+ */
public CrawlCandidateBuilder(final CrawlRequest request) {
crawlRequest = request;
}
+ /**
+ * Sets the referer URL.
+ *
+ * @param refererUrl the referer URL
+ *
+ * @return the CrawlCandidateBuilder
instance
+ */
public CrawlCandidateBuilder setRefererUrl(final URI refererUrl) {
this.refererUrl = refererUrl;
return this;
}
+ /**
+ * Sets the crawl depth of the request.
+ *
+ * @param crawlDepth the crawl depth of the request
+ *
+ * @return the CrawlCandidateBuilder
instance
+ */
public CrawlCandidateBuilder setCrawlDepth(final int crawlDepth) {
this.crawlDepth = crawlDepth;
return this;
}
+ /**
+ * Builds the configured CrawlCandidate
instance.
+ *
+ * @return the configured CrawlCandidate
instance
+ */
public CrawlCandidate build() {
return new CrawlCandidate(this);
}
diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java
index 0c10e7b..4a80d8b 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlDelayStrategy.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2018 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,15 +13,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package com.github.peterbencze.serritor.api;
/**
- * Available crawl delay strategies that can be used by the crawler.
- *
+ * Available crawl delay strategies which define how the delay between each request is determined.
+ *
* @author Peter Bencze
*/
public enum CrawlDelayStrategy {
-
+
FIXED,
ADAPTIVE,
RANDOM
diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
index 4188a54..f1c6e4a 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlRequest.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2017 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package com.github.peterbencze.serritor.api;
import com.google.common.net.InternetDomainName;
@@ -21,11 +22,11 @@
import java.io.Serializable;
import java.net.URI;
import java.util.Optional;
+import org.apache.commons.lang3.Validate;
/**
- * Represents a crawl request that might be processed by the crawler in the
- * future. The reason why it is not sure that it will be processed is because it
- * might get filtered out by one of the enabled filters.
+ * Represents a crawl request that may be processed by the crawler. If request filtering is enabled,
+ * it could get filtered out.
*
* @author Peter Bencze
*/
@@ -34,7 +35,7 @@ public final class CrawlRequest implements Serializable {
private final URI requestUrl;
private final int priority;
private final Serializable metadata;
-
+
private transient InternetDomainName domain;
private CrawlRequest(final CrawlRequestBuilder builder) {
@@ -45,57 +46,58 @@ private CrawlRequest(final CrawlRequestBuilder builder) {
}
/**
- * Returns the request's URL.
+ * Returns the request URL.
*
- * @return The URL of the request
+ * @return the request URL
*/
public URI getRequestUrl() {
return requestUrl;
}
/**
- * Returns the domain of the request's URL.
+ * Returns the domain of the request URL.
*
- * @return The domain of the request URL
+ * @return the domain of the request URL
*/
public InternetDomainName getDomain() {
return domain;
}
/**
- * Returns the request's priority.
+ * Returns the priority of the request.
*
- * @return The priority of the request
+ * @return the priority of the request
*/
public int getPriority() {
return priority;
}
/**
- * Returns metadata associated with the request.
+ * Returns the metadata associated with the request.
*
- * @return The request's metadata
+ * @return the metadata associated with the request
*/
    public Optional<Serializable> getMetadata() {
return Optional.ofNullable(metadata);
}
+ /**
+ * Builds {@link CrawlRequest} instances.
+ */
public static final class CrawlRequestBuilder {
private static final int DEFAULT_PRIORITY = 0;
private final URI requestUrl;
private final InternetDomainName domain;
-
+
private int priority;
private Serializable metadata;
/**
- * Constructs a CrawlRequestBuilder
instance that can be
- * used to create CrawRequest instances.
+ * Creates a {@link CrawlRequestBuilder} instance.
*
- * @param requestUrl The request's URL given as a URL
- * instance
+ * @param requestUrl the request URL
*/
public CrawlRequestBuilder(final URI requestUrl) {
this.requestUrl = requestUrl;
@@ -108,22 +110,20 @@ public CrawlRequestBuilder(final URI requestUrl) {
}
/**
- * Constructs a CrawlRequestBuilder
instance that can be
- * used to create CrawRequest
instances.
+ * Creates a {@link CrawlRequestBuilder} instance.
*
- * @param requestUrl The request's URL given as a String
- * instance
+ * @param requestUrl the request URL
*/
public CrawlRequestBuilder(final String requestUrl) {
this(URI.create(requestUrl));
}
/**
- * Sets the request's priority.
+ * Sets the priority of the request.
+ *
+ * @param priority the priority of the request (higher number means higher priority)
*
- * @param priority The priority of the request (higher number means
- * higher priority)
- * @return The CrawlRequestBuilder
instance
+ * @return the CrawlRequestBuilder
instance
*/
public CrawlRequestBuilder setPriority(final int priority) {
this.priority = priority;
@@ -131,21 +131,21 @@ public CrawlRequestBuilder setPriority(final int priority) {
}
/**
- * Sets additional metadata for the request which can be later accessed
- * when the crawler processed the request.
+ * Sets the metadata associated with the request.
*
- * @param metadata The metadata associated with the request
- * @return The CrawlRequestBuilder
instance
+ * @param metadata the metadata associated with the request
+ *
+ * @return the CrawlRequestBuilder
instance
*/
public CrawlRequestBuilder setMetadata(final Serializable metadata) {
- this.metadata = metadata;
+ this.metadata = Validate.notNull(metadata, "The metadata cannot be null.");
return this;
}
/**
* Builds the configured CrawlRequest
instance.
*
- * @return The configured CrawlRequest
instance
+ * @return the configured CrawlRequest
instance
*/
public CrawlRequest build() {
return new CrawlRequest(this);
@@ -154,7 +154,7 @@ public CrawlRequest build() {
private void readObject(final ObjectInputStream in) throws IOException, ClassNotFoundException {
in.defaultReadObject();
-
+
domain = InternetDomainName.from(requestUrl.getHost());
}
}
diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java
index c88435b..e449892 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlStrategy.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2017 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,10 +13,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package com.github.peterbencze.serritor.api;
/**
- * Available strategies that can be used while crawling.
+ * Available crawl strategies that define the order in which crawl requests are processed.
*
* @author Peter Bencze
*/
diff --git a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java
index 8cdaa71..d5aef15 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/CrawlerConfiguration.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2017 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package com.github.peterbencze.serritor.api;
import com.github.peterbencze.serritor.internal.CrawlDomain;
@@ -25,7 +26,7 @@
import org.apache.commons.lang3.Validate;
/**
- * This class contains the settings of the crawler.
+ * Contains the settings of the crawler.
*
* @author Peter Bencze
*/
@@ -58,7 +59,7 @@ private CrawlerConfiguration(final CrawlerConfigurationBuilder builder) {
/**
* Returns the set of allowed crawl domains.
*
- * @return The set of allowed crawl domains
+ * @return the set of allowed crawl domains
*/
    public Set<CrawlDomain> getAllowedCrawlDomains() {
return allowedCrawlDomains;
@@ -67,7 +68,7 @@ public Set getAllowedCrawlDomains() {
/**
* Returns the set of crawl seeds.
*
- * @return The set of crawl seeds
+ * @return the set of crawl seeds
*/
    public Set<CrawlRequest> getCrawlSeeds() {
return crawlSeeds;
@@ -76,14 +77,14 @@ public Set getCrawlSeeds() {
/**
* Returns the crawl strategy of the crawler.
*
- * @return The crawl strategy
+ * @return the crawl strategy of the crawler
*/
public CrawlStrategy getCrawlStrategy() {
return crawlStrategy;
}
/**
- * Indicates if duplicate request filtering is enabled or not.
+ * Indicates if duplicate request filtering is enabled.
*
* @return true
if enabled, false
otherwise
*/
@@ -92,7 +93,7 @@ public boolean isDuplicateRequestFilteringEnabled() {
}
/**
- * Indicates if offsite request filtering is enabled or not.
+ * Indicates if offsite request filtering is enabled.
*
* @return true
if enabled, false
otherwise
*/
@@ -101,18 +102,18 @@ public boolean isOffsiteRequestFilteringEnabled() {
}
/**
- * Returns the maximum possible crawl depth.
+ * Returns the maximum crawl depth.
*
- * @return The maximum crawl depth
+ * @return the maximum crawl depth
*/
public int getMaximumCrawlDepth() {
return maxCrawlDepth;
}
/**
- * Returns the crawl delay strategy used by the crawler.
+ * Returns the crawl delay strategy of the crawler.
*
- * @return The crawl delay strategy
+ * @return the crawl delay strategy of the crawler
*/
public CrawlDelayStrategy getCrawlDelayStrategy() {
return crawlDelayStrategy;
@@ -121,7 +122,7 @@ public CrawlDelayStrategy getCrawlDelayStrategy() {
/**
* Returns the exact duration of delay between each request.
*
- * @return The duration of delay in milliseconds
+ * @return the duration of delay in milliseconds
*/
public long getFixedCrawlDelayDurationInMillis() {
return fixedCrawlDelayDurationInMillis;
@@ -130,7 +131,7 @@ public long getFixedCrawlDelayDurationInMillis() {
/**
* Returns the minimum duration of delay between each request.
*
- * @return The minimum duration of delay in milliseconds
+ * @return the minimum duration of delay in milliseconds
*/
public long getMinimumCrawlDelayDurationInMillis() {
return minCrawlDelayDurationInMillis;
@@ -139,12 +140,15 @@ public long getMinimumCrawlDelayDurationInMillis() {
/**
* Returns the maximum duration of delay between each request.
*
- * @return The maximum duration of delay in milliseconds
+ * @return the maximum duration of delay in milliseconds
*/
public long getMaximumCrawlDelayDurationInMillis() {
return maxCrawlDelayDurationInMillis;
}
+ /**
+ * Builds {@link CrawlerConfiguration} instances.
+ */
public static final class CrawlerConfigurationBuilder {
private static final CrawlStrategy DEFAULT_CRAWL_STRATEGY = CrawlStrategy.BREADTH_FIRST;
@@ -152,9 +156,12 @@ public static final class CrawlerConfigurationBuilder {
private static final boolean FILTER_OFFSITE_REQUESTS_BY_DEFAULT = false;
private static final int DEFAULT_MAX_CRAWL_DEPTH = 0;
private static final CrawlDelayStrategy DEFAULT_CRAWL_DELAY = CrawlDelayStrategy.FIXED;
- private static final long DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS = Duration.ZERO.toMillis();
- private static final long DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS = Duration.ofSeconds(1).toMillis();
- private static final long DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS = Duration.ofMinutes(1).toMillis();
+ private static final long DEFAULT_FIXED_CRAWL_DELAY_IN_MILLIS
+ = Duration.ZERO.toMillis();
+ private static final long DEFAULT_MIN_CRAWL_DELAY_IN_MILLIS
+ = Duration.ofSeconds(1).toMillis();
+ private static final long DEFAULT_MAX_CRAWL_DELAY_IN_MILLIS
+ = Duration.ofMinutes(1).toMillis();
        private final Set<CrawlDomain> allowedCrawlDomains;
        private final Set<CrawlRequest> crawlSeeds;
@@ -168,6 +175,9 @@ public static final class CrawlerConfigurationBuilder {
private long minCrawlDelayDurationInMillis;
private long maxCrawlDelayDurationInMillis;
+ /**
+ * Creates a {@link CrawlerConfigurationBuilder} instance.
+ */
public CrawlerConfigurationBuilder() {
// Initialize with default values
allowedCrawlDomains = new HashSet<>();
@@ -185,27 +195,30 @@ public CrawlerConfigurationBuilder() {
/**
* Appends an internet domain to the list of allowed crawl domains.
*
- * @param allowedCrawlDomain A well-formed internet domain name
- * @return The CrawlerConfigurationBuilder
instance
+ * @param allowedCrawlDomain a well-formed internet domain name
+ *
+ * @return the CrawlerConfigurationBuilder
instance
*/
public CrawlerConfigurationBuilder addAllowedCrawlDomain(final String allowedCrawlDomain) {
InternetDomainName domain = InternetDomainName.from(allowedCrawlDomain);
- Validate.isTrue(domain.isUnderPublicSuffix(), String.format("The domain (\"%s\") is not under public suffix.", allowedCrawlDomain));
+ Validate.isTrue(domain.isUnderPublicSuffix(),
+ String.format("The domain (\"%s\") is not under public suffix.",
+ allowedCrawlDomain));
allowedCrawlDomains.add(new CrawlDomain(domain));
return this;
}
/**
- * Appends a list of internet domains to the list of allowed crawl
- * domains.
+ * Appends a list of internet domains to the list of allowed crawl domains.
+ *
+ * @param allowedCrawlDomains a list of well-formed internet domain names
*
- * @param allowedCrawlDomains A list of well-formed internet domain
- * names
- * @return The CrawlerConfigurationBuilder
instance
+ * @return the CrawlerConfigurationBuilder
instance
*/
- public CrawlerConfigurationBuilder addAllowedCrawlDomains(final List allowedCrawlDomains) {
+ public CrawlerConfigurationBuilder addAllowedCrawlDomains(
+            final List<String> allowedCrawlDomains) {
allowedCrawlDomains.forEach(this::addAllowedCrawlDomain);
return this;
}
@@ -213,9 +226,9 @@ public CrawlerConfigurationBuilder addAllowedCrawlDomains(final List all
/**
* Appends a crawl request to the set of crawl seeds.
*
- * @param request The CrawlRequest
instance which
- * represents the crawl seed
- * @return The CrawlerConfigurationBuilder
instance
+ * @param request the crawl request which represents a crawl seed
+ *
+ * @return the CrawlerConfigurationBuilder
instance
*/
public CrawlerConfigurationBuilder addCrawlSeed(final CrawlRequest request) {
Validate.notNull(request, "The request cannot be null.");
@@ -227,9 +240,9 @@ public CrawlerConfigurationBuilder addCrawlSeed(final CrawlRequest request) {
/**
* Appends a list of crawl requests to the set of crawl seeds.
*
- * @param requests The list of CrawlRequest
instances which
- * represent the crawl seeds
- * @return The CrawlerConfigurationBuilder
instance
+ * @param requests the list of crawl requests which represent crawl seeds
+ *
+ * @return the CrawlerConfigurationBuilder
instance
*/
        public CrawlerConfigurationBuilder addCrawlSeeds(final List<CrawlRequest> requests) {
requests.forEach(this::addCrawlSeed);
@@ -237,12 +250,13 @@ public CrawlerConfigurationBuilder addCrawlSeeds(final List reques
}
/**
- * Sets the crawl strategy to be used by the crawler. Breadth-first
- * strategy orders crawl requests by the lowest crawl depth, whereas
- * depth-first orders them by the highest crawl depth.
+ * Sets the crawl strategy to be used by the crawler. Breadth-first strategy orders crawl
+ * requests by the lowest crawl depth, whereas depth-first orders them by the highest crawl
+ * depth.
*
- * @param strategy The crawl strategy
- * @return The CrawlerConfigurationBuilder
instance
+ * @param strategy the crawl strategy
+ *
+ * @return the CrawlerConfigurationBuilder
instance
*/
public CrawlerConfigurationBuilder setCrawlStrategy(final CrawlStrategy strategy) {
Validate.notNull(strategy, "The strategy cannot be null.");
@@ -254,11 +268,13 @@ public CrawlerConfigurationBuilder setCrawlStrategy(final CrawlStrategy strategy
/**
* Enables or disables duplicate request filtering.
*
- * @param filterDuplicateRequests true
means enabled,
- * false
means disabled
- * @return The CrawlerConfigurationBuilder
instance
+ * @param filterDuplicateRequests true
means enabled, false
means
+ * disabled
+ *
+ * @return the CrawlerConfigurationBuilder
instance
*/
- public CrawlerConfigurationBuilder setDuplicateRequestFiltering(final boolean filterDuplicateRequests) {
+ public CrawlerConfigurationBuilder setDuplicateRequestFiltering(
+ final boolean filterDuplicateRequests) {
this.filterDuplicateRequests = filterDuplicateRequests;
return this;
}
@@ -266,21 +282,23 @@ public CrawlerConfigurationBuilder setDuplicateRequestFiltering(final boolean fi
/**
* Enables or disables offsite request filtering.
*
- * @param filterOffsiteRequests true
means enabled,
- * false
means disabled
- * @return The CrawlerConfigurationBuilder
instance
+ * @param filterOffsiteRequests true
means enabled, false
means
+ * disabled
+ *
+ * @return the CrawlerConfigurationBuilder
instance
*/
- public CrawlerConfigurationBuilder setOffsiteRequestFiltering(final boolean filterOffsiteRequests) {
+ public CrawlerConfigurationBuilder setOffsiteRequestFiltering(
+ final boolean filterOffsiteRequests) {
this.filterOffsiteRequests = filterOffsiteRequests;
return this;
}
/**
- * Sets the maximum possible crawl depth. It should be a non-negative
- * number where 0 means there is no limit.
+ * Sets the maximum crawl depth. It should be a non-negative number (0 means no limit).
+ *
+ * @param maxCrawlDepth the maximum crawl depth
*
- * @param maxCrawlDepth The maximum crawl depth
- * @return The CrawlerConfigurationBuilder
instance
+ * @return the CrawlerConfigurationBuilder
instance
*/
public CrawlerConfigurationBuilder setMaximumCrawlDepth(final int maxCrawlDepth) {
Validate.isTrue(maxCrawlDepth >= 0, "The maximum crawl depth cannot be negative.");
@@ -290,12 +308,15 @@ public CrawlerConfigurationBuilder setMaximumCrawlDepth(final int maxCrawlDepth)
}
/**
- * Sets the crawl delay strategy to be used by the crawler.
+ * Sets the crawl delay strategy to be used by the crawler. This strategy defines how the
+ * delay between each request is determined.
*
- * @param strategy The crawl delay strategy
- * @return The CrawlerConfigurationBuilder
instance
+ * @param strategy the crawl delay strategy
+ *
+ * @return the CrawlerConfigurationBuilder
instance
*/
- public CrawlerConfigurationBuilder setCrawlDelayStrategy(final CrawlDelayStrategy strategy) {
+ public CrawlerConfigurationBuilder setCrawlDelayStrategy(
+ final CrawlDelayStrategy strategy) {
Validate.notNull(strategy, "The strategy cannot be null.");
crawlDelayStrategy = strategy;
@@ -305,10 +326,12 @@ public CrawlerConfigurationBuilder setCrawlDelayStrategy(final CrawlDelayStrateg
/**
* Sets the exact duration of delay between each request.
*
- * @param fixedCrawlDelayDuration The duration of delay
- * @return The CrawlerConfigurationBuilder
instance
+ * @param fixedCrawlDelayDuration the duration of delay
+ *
+ * @return the CrawlerConfigurationBuilder
instance
*/
- public CrawlerConfigurationBuilder setFixedCrawlDelayDuration(final Duration fixedCrawlDelayDuration) {
+ public CrawlerConfigurationBuilder setFixedCrawlDelayDuration(
+ final Duration fixedCrawlDelayDuration) {
Validate.notNull(fixedCrawlDelayDuration, "The duration cannot be null.");
fixedCrawlDelayDurationInMillis = fixedCrawlDelayDuration.toMillis();
@@ -318,16 +341,20 @@ public CrawlerConfigurationBuilder setFixedCrawlDelayDuration(final Duration fix
/**
* Sets the minimum duration of delay between each request.
*
- * @param minCrawlDelayDuration The minimum duration of delay
- * @return The CrawlerConfigurationBuilder
instance
+ * @param minCrawlDelayDuration the minimum duration of delay
+ *
+ * @return the CrawlerConfigurationBuilder
instance
*/
- public CrawlerConfigurationBuilder setMinimumCrawlDelayDuration(final Duration minCrawlDelayDuration) {
+ public CrawlerConfigurationBuilder setMinimumCrawlDelayDuration(
+ final Duration minCrawlDelayDuration) {
Validate.notNull(minCrawlDelayDuration, "The duration cannot be null.");
- Validate.isTrue(!minCrawlDelayDuration.isNegative(), "The minimum crawl delay cannot be negative.");
+ Validate.isTrue(!minCrawlDelayDuration.isNegative(),
+ "The minimum crawl delay cannot be negative.");
long minDelayDurationInMillis = minCrawlDelayDuration.toMillis();
- Validate.isTrue(minDelayDurationInMillis < maxCrawlDelayDurationInMillis, "The minimum crawl delay should be less than the maximum.");
+ Validate.isTrue(minDelayDurationInMillis < maxCrawlDelayDurationInMillis,
+ "The minimum crawl delay should be less than the maximum.");
minCrawlDelayDurationInMillis = minDelayDurationInMillis;
return this;
@@ -336,15 +363,18 @@ public CrawlerConfigurationBuilder setMinimumCrawlDelayDuration(final Duration m
/**
* Sets the maximum duration of delay between each request.
*
- * @param maxCrawlDelayDuration The maximum duration of delay
- * @return The CrawlerConfigurationBuilder
instance
+ * @param maxCrawlDelayDuration the maximum duration of delay
+ *
+ * @return the CrawlerConfigurationBuilder
instance
*/
- public CrawlerConfigurationBuilder setMaximumCrawlDelayDuration(final Duration maxCrawlDelayDuration) {
+ public CrawlerConfigurationBuilder setMaximumCrawlDelayDuration(
+ final Duration maxCrawlDelayDuration) {
Validate.notNull(maxCrawlDelayDuration, "The duration cannot be null.");
long maxDelayDurationInMillis = maxCrawlDelayDuration.toMillis();
- Validate.isTrue(maxDelayDurationInMillis > minCrawlDelayDurationInMillis, "The maximum crawl delay should be higher than the minimum.");
+ Validate.isTrue(maxDelayDurationInMillis > minCrawlDelayDurationInMillis,
+ "The maximum crawl delay should be higher than the minimum.");
maxCrawlDelayDurationInMillis = maxDelayDurationInMillis;
return this;
@@ -353,7 +383,7 @@ public CrawlerConfigurationBuilder setMaximumCrawlDelayDuration(final Duration m
/**
* Builds the configured CrawlerConfiguration
instance.
*
- * @return The configured CrawlerConfiguration
instance
+ * @return the configured CrawlerConfiguration
instance
*/
public CrawlerConfiguration build() {
return new CrawlerConfiguration(this);
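To illustrate the builder methods documented above, a configuration that crawls breadth-first with a random one-to-five-second delay between requests might be assembled as in the sketch below; all values are examples only:
```java
CrawlerConfiguration config = new CrawlerConfigurationBuilder()
        .addAllowedCrawlDomain("example.com")
        .addCrawlSeed(new CrawlRequestBuilder("http://example.com").build())
        .setCrawlStrategy(CrawlStrategy.BREADTH_FIRST)
        .setOffsiteRequestFiltering(true)
        .setDuplicateRequestFiltering(true)
        .setMaximumCrawlDepth(3)
        .setCrawlDelayStrategy(CrawlDelayStrategy.RANDOM)
        .setMinimumCrawlDelayDuration(Duration.ofSeconds(1))  // must stay below the maximum
        .setMaximumCrawlDelayDuration(Duration.ofSeconds(5))
        .build();
```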
diff --git a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java
deleted file mode 100644
index 442d493..0000000
--- a/src/main/java/com/github/peterbencze/serritor/api/HtmlResponse.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright 2017 Peter Bencze.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.github.peterbencze.serritor.api;
-
-import com.github.peterbencze.serritor.internal.CallbackParameter;
-import java.net.URI;
-import org.openqa.selenium.WebDriver;
-
-/**
- * Represents an HTML response.
- *
- * @author Peter Bencze
- */
-public final class HtmlResponse extends CallbackParameter {
-
- private final HttpHeadResponse httpHeadResponse;
- private final WebDriver webDriver;
-
- private HtmlResponse(final HtmlResponseBuilder builder) {
- super(builder);
-
- httpHeadResponse = builder.httpHeadResponse;
- webDriver = builder.webDriver;
- }
-
- /**
- * Returns the HTTP HEAD response.
- *
- * @return The HTTP HEAD response
- */
- public HttpHeadResponse getHttpHeadResponse() {
- return httpHeadResponse;
- }
-
- /**
- * Returns the WebDriver instance for the browser.
- *
- * @return The WebDriver instance
- */
- public WebDriver getWebDriver() {
- return webDriver;
- }
-
- public static final class HtmlResponseBuilder extends CallbackParameterBuilder {
-
- private HttpHeadResponse httpHeadResponse;
- private WebDriver webDriver;
-
- public HtmlResponseBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) {
- super(refererUrl, crawlDepth, crawlRequest);
- }
-
- public HtmlResponseBuilder setHttpHeadResponse(final HttpHeadResponse httpHeadResponse) {
- this.httpHeadResponse = httpHeadResponse;
- return this;
- }
-
- public HtmlResponseBuilder setWebDriver(final WebDriver webDriver) {
- this.webDriver = webDriver;
- return this;
- }
-
- @Override
- public HtmlResponse build() {
- return new HtmlResponse(this);
- }
- }
-}
diff --git a/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java b/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java
deleted file mode 100644
index 93f2aed..0000000
--- a/src/main/java/com/github/peterbencze/serritor/api/HttpHeadResponse.java
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright 2017 Peter Bencze.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.github.peterbencze.serritor.api;
-
-import java.util.Locale;
-import org.apache.http.Header;
-import org.apache.http.HeaderIterator;
-import org.apache.http.HttpResponse;
-import org.apache.http.ProtocolVersion;
-import org.apache.http.StatusLine;
-
-/**
- * Represents a response of a HTTP HEAD request.
- *
- * @author Peter Bencze
- */
-public final class HttpHeadResponse {
-
- private final HttpResponse response;
-
- public HttpHeadResponse(final HttpResponse response) {
- this.response = response;
- }
-
- /**
- * Checks if a certain header is present in this message.
- *
- * @param name The name of the header
- * @return true if present, false otherwise
- */
- public boolean containsHeader(final String name) {
- return response.containsHeader(name);
- }
-
- /**
- * Returns all the headers of this response.
- *
- * @return The array of headers
- */
- public Header[] getAllHeaders() {
- return response.getAllHeaders();
- }
-
- /**
- * Returns the first header with a specified name of this response.
- *
- * @param name The name of the header
- * @return The first header with the specified name
- */
- public Header getFirstHeader(final String name) {
- return response.getFirstHeader(name);
- }
-
- /**
- * Returns all the headers with a specified name of this response.
- *
- * @param name The name of the headers
- * @return The array of headers
- */
- public Header[] getHeaders(final String name) {
- return response.getHeaders(name);
- }
-
- /**
- * Returns the last header with a specified name of this response.
- *
- * @param name The name of the header
- * @return The last header with a specified name
- */
- public Header getLastHeader(final String name) {
- return response.getLastHeader(name);
- }
-
- /**
- * Returns the protocol version this response is compatible with.
- *
- * @return The compatible protocol version
- */
- public ProtocolVersion getProtocolVersion() {
- return response.getProtocolVersion();
- }
-
- /**
- * Returns an iterator of all the headers.
- *
- * @return An iterator of all the headers
- */
- public HeaderIterator headerIterator() {
- return response.headerIterator();
- }
-
- /**
- * Returns an iterator of the headers with a given name.
- *
- * @param name The name of the headers
- * @return An iterator of the headers with a given name
- */
- public HeaderIterator headerIterator(final String name) {
- return response.headerIterator(name);
- }
-
- /**
- * Obtains the locale of this response.
- *
- * @return The locale of this response
- */
- public Locale getLocale() {
- return response.getLocale();
- }
-
- /**
- * Obtains the status line of this response.
- *
- * @return The status line of this response
- */
- public StatusLine getStatusLine() {
- return response.getStatusLine();
- }
-}
diff --git a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java b/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java
deleted file mode 100644
index 0d3e6cf..0000000
--- a/src/main/java/com/github/peterbencze/serritor/api/NonHtmlResponse.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright 2017 Peter Bencze.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.github.peterbencze.serritor.api;
-
-import com.github.peterbencze.serritor.internal.CallbackParameter;
-import java.io.File;
-import java.io.IOException;
-import java.net.URI;
-import org.apache.commons.io.FileUtils;
-
-/**
- * Represents a non-HTML response.
- *
- * @author Peter Bencze
- */
-public final class NonHtmlResponse extends CallbackParameter {
-
- private final HttpHeadResponse httpHeadResponse;
-
- private NonHtmlResponse(final NonHtmlResponseBuilder builder) {
- super(builder);
-
- httpHeadResponse = builder.httpHeadResponse;
- }
-
- /**
- * Returns the HTTP HEAD response.
- *
- * @return The HTTP HEAD response
- */
- public HttpHeadResponse getHttpHeadResponse() {
- return httpHeadResponse;
- }
-
- /**
- * Downloads the file specified by the request URL.
- *
- * @param destination The destination File instance
- * @throws IOException If the URL cannot be opened or I/O error occurs while downloading the file
- */
- public void downloadFile(final File destination) throws IOException {
- FileUtils.copyURLToFile(getCrawlRequest().getRequestUrl().toURL(), destination);
- }
-
- public static final class NonHtmlResponseBuilder extends CallbackParameterBuilder {
-
- private HttpHeadResponse httpHeadResponse;
-
- public NonHtmlResponseBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) {
- super(refererUrl, crawlDepth, crawlRequest);
- }
-
- public NonHtmlResponseBuilder setHttpHeadResponse(final HttpHeadResponse httpHeadResponse) {
- this.httpHeadResponse = httpHeadResponse;
- return this;
- }
-
- @Override
- public NonHtmlResponse build() {
- return new NonHtmlResponse(this);
- }
- }
-}
diff --git a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java b/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
deleted file mode 100644
index 7d379d5..0000000
--- a/src/main/java/com/github/peterbencze/serritor/api/UnsuccessfulRequest.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright 2017 Peter Bencze.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.github.peterbencze.serritor.api;
-
-import com.github.peterbencze.serritor.internal.CallbackParameter;
-import java.io.IOException;
-import java.net.URI;
-
-/**
- * Represents an unsuccessful request.
- *
- * @author Peter Bencze
- */
-public final class UnsuccessfulRequest extends CallbackParameter {
-
- private final IOException exception;
-
- private UnsuccessfulRequest(final UnsuccessfulRequestBuilder builder) {
- super(builder);
-
- exception = builder.exception;
- }
-
- /**
- * Returns the exception that was thrown while trying to fulfill the
- * request.
- *
- * @return The IOException instance
- */
- public IOException getException() {
- return exception;
- }
-
- public static final class UnsuccessfulRequestBuilder extends CallbackParameterBuilder {
-
- private IOException exception;
-
- public UnsuccessfulRequestBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) {
- super(refererUrl, crawlDepth, crawlRequest);
- }
-
- public UnsuccessfulRequestBuilder setException(final IOException exception) {
- this.exception = exception;
- return this;
- }
-
- @Override
- public UnsuccessfulRequest build() {
- return new UnsuccessfulRequest(this);
- }
- }
-}
diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java
new file mode 100644
index 0000000..932df11
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2017 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.peterbencze.serritor.api.event;
+
+import com.github.peterbencze.serritor.api.CrawlCandidate;
+import com.github.peterbencze.serritor.internal.EventObject;
+import java.io.File;
+import java.io.IOException;
+import org.apache.commons.io.FileUtils;
+
+/**
+ * Event which gets delivered when the content type is not HTML.
+ *
+ * @author Peter Bencze
+ */
+public final class NonHtmlContentEvent extends EventObject {
+
+ /**
+ * Creates a {@link NonHtmlContentEvent} instance.
+ *
+ * @param crawlCandidate the current crawl candidate
+ */
+ public NonHtmlContentEvent(final CrawlCandidate crawlCandidate) {
+ super(crawlCandidate);
+ }
+
+ /**
+ * Downloads the file specified by the URL.
+ *
+ * @param destination the destination file
+ *
+ * @throws IOException if the URL cannot be opened or I/O error occurs while downloading the
+ * file
+ */
+ public void downloadFile(final File destination) throws IOException {
+ FileUtils.copyURLToFile(getCrawlCandidate().getRequestUrl().toURL(), destination);
+ }
+}
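
A minimal sketch of how this event might be consumed in a `BaseCrawler` subclass. The `onNonHtmlContent` callback name and the import locations of `BaseCrawler`/`CrawlerConfiguration` are assumptions, since this diff only introduces the event class itself.

```java
import com.github.peterbencze.serritor.api.BaseCrawler;
import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import com.github.peterbencze.serritor.api.event.NonHtmlContentEvent;
import java.io.File;
import java.io.IOException;

public final class DownloadingCrawler extends BaseCrawler {

    public DownloadingCrawler(final CrawlerConfiguration config) {
        super(config);
    }

    @Override
    protected void onNonHtmlContent(final NonHtmlContentEvent event) { // callback name assumed
        try {
            // Persist the non-HTML resource (e.g. a PDF) to a local file
            event.downloadFile(new File("downloads", "latest-download"));
        } catch (IOException e) {
            System.err.println("Could not download " + event.getCrawlCandidate().getRequestUrl());
        }
    }
}
```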
diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java
new file mode 100644
index 0000000..d83e394
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2017 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.peterbencze.serritor.api.event;
+
+import com.github.peterbencze.serritor.api.CrawlCandidate;
+import com.github.peterbencze.serritor.internal.EventObject;
+import org.openqa.selenium.WebDriver;
+
+/**
+ * Event which gets delivered when the browser loads the page.
+ *
+ * @author Peter Bencze
+ */
+public final class PageLoadEvent extends EventObject {
+
+ private final WebDriver webDriver;
+
+ /**
+ * Creates a {@link PageLoadEvent} instance.
+ *
+ * @param crawlCandidate the current crawl candidate
+ * @param webDriver the WebDriver to control the browser
+ */
+ public PageLoadEvent(final CrawlCandidate crawlCandidate, final WebDriver webDriver) {
+ super(crawlCandidate);
+
+ this.webDriver = webDriver;
+ }
+
+ /**
+ * Returns the WebDriver to control the browser.
+ *
+ * @return the WebDriver to control the browser
+ */
+ public WebDriver getWebDriver() {
+ return webDriver;
+ }
+}
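
For contrast with the deleted `HtmlResponse` above, the following method body (placed in a `BaseCrawler` subclass, shown in isolation for brevity) sketches how the event hands the Selenium `WebDriver` to the crawler via `onPageLoad`.

```java
import com.github.peterbencze.serritor.api.event.PageLoadEvent;
import org.openqa.selenium.WebDriver;

// Method body for a BaseCrawler subclass
@Override
protected void onPageLoad(final PageLoadEvent event) {
    WebDriver driver = event.getWebDriver();

    // The driver is already on the freshly loaded page, so standard Selenium calls work here
    System.out.println(driver.getCurrentUrl() + " - " + driver.getTitle());
}
```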
diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java
new file mode 100644
index 0000000..e6c3e16
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.peterbencze.serritor.api.event;
+
+import com.github.peterbencze.serritor.api.CrawlCandidate;
+import com.github.peterbencze.serritor.internal.EventObject;
+import org.openqa.selenium.TimeoutException;
+
+/**
+ * Event which gets delivered when a page does not load in the browser within the timeout period.
+ *
+ * @author Peter Bencze
+ */
+public final class PageLoadTimeoutEvent extends EventObject {
+
+ private final TimeoutException exception;
+
+ /**
+ * Creates a {@link PageLoadTimeoutEvent} instance.
+ *
+ * @param crawlCandidate the current crawl candidate
+ * @param exception the thrown exception
+ */
+ public PageLoadTimeoutEvent(final CrawlCandidate crawlCandidate,
+ final TimeoutException exception) {
+ super(crawlCandidate);
+
+ this.exception = exception;
+ }
+
+ /**
+ * Returns the thrown exception.
+ *
+ * @return the thrown exception
+ */
+ public TimeoutException getException() {
+ return exception;
+ }
+}
diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java
new file mode 100644
index 0000000..bbce9b3
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2017 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.peterbencze.serritor.api.event;
+
+import com.github.peterbencze.serritor.api.CrawlCandidate;
+import com.github.peterbencze.serritor.internal.EventObject;
+import java.io.IOException;
+
+/**
+ * Event which gets delivered when a request error occurs.
+ *
+ * @author Peter Bencze
+ */
+public final class RequestErrorEvent extends EventObject {
+
+ private final IOException exception;
+
+ /**
+ * Creates a {@link RequestErrorEvent} instance.
+ *
+ * @param crawlCandidate the current crawl candidate
+ * @param exception the thrown exception
+ */
+ public RequestErrorEvent(final CrawlCandidate crawlCandidate, final IOException exception) {
+ super(crawlCandidate);
+
+ this.exception = exception;
+ }
+
+ /**
+ * Returns the thrown exception.
+ *
+ * @return the thrown exception
+ */
+ public IOException getException() {
+ return exception;
+ }
+}
diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java
new file mode 100644
index 0000000..188ba3e
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.peterbencze.serritor.api.event;
+
+import com.github.peterbencze.serritor.api.CrawlCandidate;
+import com.github.peterbencze.serritor.api.CrawlRequest;
+import com.github.peterbencze.serritor.internal.EventObject;
+
+/**
+ * Event which gets delivered when a request is redirected.
+ *
+ * @author Peter Bencze
+ */
+public final class RequestRedirectEvent extends EventObject {
+
+ private final CrawlRequest redirectedCrawlRequest;
+
+ /**
+ * Creates a {@link RequestRedirectEvent} instance.
+ *
+ * @param crawlCandidate the current crawl candidate
+ * @param redirectedCrawlRequest the crawl request for the redirected URL
+ */
+ public RequestRedirectEvent(final CrawlCandidate crawlCandidate,
+ final CrawlRequest redirectedCrawlRequest) {
+ super(crawlCandidate);
+
+ this.redirectedCrawlRequest = redirectedCrawlRequest;
+ }
+
+ /**
+ * Returns the crawl request for the redirected URL.
+ *
+ * @return the crawl request for the redirected URL
+ */
+ public CrawlRequest getRedirectedCrawlRequest() {
+ return redirectedCrawlRequest;
+ }
+}
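
The three event classes above (request error, page load timeout, request redirect) follow the same shape. A combined handling sketch, assuming the crawler exposes matching `onRequestError`, `onPageLoadTimeout`, and `onRequestRedirect` callbacks (the callback names are not part of this diff):

```java
import com.github.peterbencze.serritor.api.event.PageLoadTimeoutEvent;
import com.github.peterbencze.serritor.api.event.RequestErrorEvent;
import com.github.peterbencze.serritor.api.event.RequestRedirectEvent;

// Method bodies for a BaseCrawler subclass; callback names are assumptions
@Override
protected void onRequestError(final RequestErrorEvent event) {
    System.err.println("Request failed: " + event.getException().getMessage());
}

@Override
protected void onPageLoadTimeout(final PageLoadTimeoutEvent event) {
    System.err.println("Timed out: " + event.getCrawlCandidate().getRequestUrl());
}

@Override
protected void onRequestRedirect(final RequestRedirectEvent event) {
    System.out.println("Redirected to: " + event.getRedirectedCrawlRequest().getRequestUrl());
}
```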
diff --git a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java
index 24ca816..65ec266 100644
--- a/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java
+++ b/src/main/java/com/github/peterbencze/serritor/api/helper/UrlFinder.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2018 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,9 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package com.github.peterbencze.serritor.api.helper;
-import com.github.peterbencze.serritor.api.HtmlResponse;
+import com.github.peterbencze.serritor.api.event.PageLoadEvent;
import com.google.common.collect.Sets;
import com.google.common.net.InternetDomainName;
import java.net.URI;
@@ -24,7 +25,7 @@
import java.util.HashSet;
import java.util.List;
import java.util.Set;
-import java.util.function.Function;
+import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@@ -34,8 +35,7 @@
import org.openqa.selenium.WebElement;
/**
- * A helper class which can be used to find URLs in HTML sources using regular
- * expressions.
+ * Finds URLs in HTML page sources using regular expressions.
*
* @author Peter Bencze
*/
@@ -44,7 +44,7 @@ public final class UrlFinder {
private final Set urlPatterns;
private final Set locatingMechanisms;
private final Set attributes;
- private final Function validator;
+ private final Predicate validator;
private UrlFinder(final UrlFinderBuilder builder) {
urlPatterns = builder.urlPatterns;
@@ -54,17 +54,18 @@ private UrlFinder(final UrlFinderBuilder builder) {
}
/**
- * Returns a list of validated URLs found in the response's HTML source.
+ * Returns a list of validated URLs found in the page's HTML source.
+ *
+ * @param event the PageLoadEvent instance
*
- * @param response The HtmlResponse instance
- * @return The list of found URLs
+ * @return the list of found URLs
*/
- public List findUrlsInResponse(final HtmlResponse response) {
+ public List findUrlsInPage(final PageLoadEvent event) {
Set foundUrls = new HashSet<>();
// Find elements using the specified locating mechanisms
Set extractedElements = locatingMechanisms.stream()
- .map(response.getWebDriver()::findElements)
+ .map(event.getWebDriver()::findElements)
.flatMap(List::stream)
.collect(Collectors.toSet());
@@ -85,8 +86,9 @@ public List findUrlsInResponse(final HtmlResponse response) {
/**
* Returns a list of validated URLs found in the attribute's value.
*
- * @param attributeValue The value of the attribute
- * @return The list of found URLs
+ * @param attributeValue the value of the attribute
+ *
+ * @return the list of found URLs
*/
private List findUrlsInAttributeValue(final String attributeValue) {
List foundUrls = new ArrayList<>();
@@ -97,7 +99,7 @@ private List findUrlsInAttributeValue(final String attributeValue) {
while (urlPatternMatcher.find()) {
String foundUrl = urlPatternMatcher.group().trim();
- if (validator.apply(foundUrl)) {
+ if (validator.test(foundUrl)) {
foundUrls.add(foundUrl);
}
}
@@ -106,34 +108,34 @@ private List findUrlsInAttributeValue(final String attributeValue) {
return foundUrls;
}
+ /**
+ * Builds {@link UrlFinder} instances.
+ */
public static final class UrlFinderBuilder {
-
+
private static final Set DEFAULT_LOCATING_MECHANISMS = Sets.newHashSet(By.tagName("a"));
private static final Set DEFAULT_ATTRIBUTES = Sets.newHashSet("href");
- private static final Function DEFAULT_VALIDATOR = UrlFinderBuilder::isValidUrl;
+ private static final Predicate DEFAULT_VALIDATOR = UrlFinderBuilder::isValidUrl;
private final Set urlPatterns;
private Set locatingMechanisms;
private Set attributes;
- private Function validator;
+ private Predicate validator;
/**
- * Constructs a UrlFinderBuilder instance that can be used
- * to create UrlFinder instances.
+ * Creates a {@link UrlFinderBuilder} instance.
*
- * @param urlPattern The pattern which will be used to find URLs
+ * @param urlPattern the pattern to use to find URLs
*/
public UrlFinderBuilder(final Pattern urlPattern) {
this(Arrays.asList(urlPattern));
}
/**
- * Constructs a UrlFinderBuilder instance that can be used
- * to create UrlFinder instances. It
+ * Creates a {@link UrlFinderBuilder} instance.
*
- * @param urlPatterns The list of patterns which will be used to find
- * URLs
+ * @param urlPatterns the list of patterns to use to find URLs
*/
public UrlFinderBuilder(final List urlPatterns) {
Validate.noNullElements(urlPatterns, "URL patterns cannot be null.");
@@ -145,24 +147,24 @@ public UrlFinderBuilder(final List urlPatterns) {
}
/**
- * Sets the locating mechanism used by the finder. Only elements matched
- * by the locator will be considered when searching for URLs.
+ * Sets the locating mechanism used by the finder. Only elements matched by the locator will
+ * be considered when searching for URLs.
*
- * @param locatingMechanism The By locating mechanism
- * instance
- * @return The UrlFinderBuilder instance
+ * @param locatingMechanism the By locating mechanism instance
+ *
+ * @return the UrlFinderBuilder instance
*/
public UrlFinderBuilder setLocatingMechanism(final By locatingMechanism) {
return setLocatingMechanisms(Arrays.asList(locatingMechanism));
}
/**
- * Sets the locating mechanisms used by the finder. Only elements
- * matched by the locators will be considered when searching for URLs.
+ * Sets the locating mechanisms used by the finder. Only elements matched by the locators
+ * will be considered when searching for URLs.
+ *
+ * @param locatingMechanisms the list of By locating mechanism instances
*
- * @param locatingMechanisms The list of By locating
- * mechanism instances
- * @return The UrlFinderBuilder instance
+ * @return the UrlFinderBuilder instance
*/
public UrlFinderBuilder setLocatingMechanisms(final List locatingMechanisms) {
Validate.noNullElements(locatingMechanisms, "Locating mechanisms cannot be null.");
@@ -172,10 +174,11 @@ public UrlFinderBuilder setLocatingMechanisms(final List locatingMechanisms)
}
/**
- * Sets which attributes to search for URLs.
+ * Sets the list of attribute names to search for URLs.
+ *
+ * @param attributes the list of attribute names
*
- * @param attributes The list of attribute names
- * @return The UrlFinderBuilder instance
+ * @return the UrlFinderBuilder instance
*/
public UrlFinderBuilder setAttributes(final List attributes) {
Validate.noNullElements(attributes, "Attributes cannot be null.");
@@ -185,22 +188,24 @@ public UrlFinderBuilder setAttributes(final List attributes) {
}
/**
- * Sets which attribute to search for URLs.
+ * Sets the attribute name to search for URLs.
*
- * @param attribute The name of the attribute
- * @return The UrlFinderBuilder instance
+ * @param attribute the attribute name
+ *
+ * @return the UrlFinderBuilder instance
*/
public UrlFinderBuilder setAttribute(final String attribute) {
return setAttributes(Arrays.asList(attribute));
}
/**
- * Sets a function to be used for validating found URLs.
+ * Sets a predicate to be used for validating found URLs.
+ *
+ * @param validator the validator predicate
*
- * @param validator The validator function
- * @return The UrlFinderBuilder instance
+ * @return the UrlFinderBuilder instance
*/
- public UrlFinderBuilder setValidator(final Function validator) {
+ public UrlFinderBuilder setValidator(final Predicate validator) {
Validate.notNull(validator, "The validator function cannot be null.");
this.validator = validator;
@@ -208,9 +213,9 @@ public UrlFinderBuilder setValidator(final Function validator)
}
/**
- * Builds the configured URL finder.
+ * Builds the configured UrlFinder instance.
*
- * @return The configured UrlFinder instance
+ * @return the configured UrlFinder instance
*/
public UrlFinder build() {
return new UrlFinder(this);
@@ -219,14 +224,14 @@ public UrlFinder build() {
/**
* The default URL validator function.
*
- * @param url The URL to be validated
- * @return true if the URL is valid, false
- * otherwise
+ * @param url the URL to validate
+ *
+ * @return true if the URL is valid, false otherwise
*/
private static boolean isValidUrl(final String url) {
try {
return InternetDomainName.isValid(URI.create(url).getHost());
- } catch (IllegalArgumentException e) {
+ } catch (IllegalArgumentException | NullPointerException exc) {
return false;
}
}
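
Since the validator switched from `Function` to `Predicate` in this file, here is a small builder sketch showing the new predicate-based validator alongside the other setters visible in the diff; the URL pattern and the lambda are made up for illustration.

```java
import com.github.peterbencze.serritor.api.helper.UrlFinder;
import com.github.peterbencze.serritor.api.helper.UrlFinder.UrlFinderBuilder;
import java.util.regex.Pattern;
import org.openqa.selenium.By;

public final class ProductUrlFinderFactory {

    private ProductUrlFinderFactory() {
    }

    public static UrlFinder create() {
        return new UrlFinderBuilder(Pattern.compile(".+/product/.+")) // hypothetical pattern
                .setLocatingMechanism(By.tagName("a"))
                .setAttribute("href")
                .setValidator(url -> url.startsWith("https://")) // Predicate-based validator
                .build();
    }
}
```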
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java b/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java
deleted file mode 100644
index cb6ae0b..0000000
--- a/src/main/java/com/github/peterbencze/serritor/internal/CallbackParameter.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright 2017 Peter Bencze.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.github.peterbencze.serritor.internal;
-
-import com.github.peterbencze.serritor.api.CrawlRequest;
-import java.net.URI;
-import java.util.Optional;
-
-/**
- * The base class from which all callback parameters inherit from.
- *
- * @author Peter Bencze
- */
-public abstract class CallbackParameter {
-
- private final int crawlDepth;
- private final URI refererUrl;
- private final CrawlRequest crawlRequest;
-
- protected CallbackParameter(final CallbackParameterBuilder builder) {
- crawlDepth = builder.crawlDepth;
- refererUrl = builder.refererUrl;
- crawlRequest = builder.crawlRequest;
- }
-
- /**
- * Returns the referer URL.
- *
- * @return The referer URL
- */
- public final Optional getRefererUrl() {
- return Optional.ofNullable(refererUrl);
- }
-
- /**
- * Returns the current crawl depth.
- *
- * @return The current crawl depth
- */
- public final int getCrawlDepth() {
- return crawlDepth;
- }
-
- /**
- * Returns the crawl request that was processed by the crawler.
- *
- * @return The processed CrawlRequest instance
- */
- public final CrawlRequest getCrawlRequest() {
- return crawlRequest;
- }
-
- public static abstract class CallbackParameterBuilder {
-
- private final URI refererUrl;
- private final int crawlDepth;
- private final CrawlRequest crawlRequest;
-
- public CallbackParameterBuilder(final URI refererUrl, final int crawlDepth, final CrawlRequest crawlRequest) {
- this.refererUrl = refererUrl;
- this.crawlDepth = crawlDepth;
- this.crawlRequest = crawlRequest;
- }
-
- public abstract CallbackParameter build();
- }
-}
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java
index 89bba42..3fec9fa 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlDomain.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2018 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package com.github.peterbencze.serritor.internal;
import com.google.common.collect.ImmutableList;
@@ -29,19 +30,20 @@ public final class CrawlDomain implements Serializable {
private final ImmutableList parts;
/**
- * Constructs a new CrawlDomain instance.
+ * Creates a CrawlDomain instance.
*
- * @param domain An immutable well-formed internet domain name
+ * @param domain an immutable well-formed internet domain name
*/
public CrawlDomain(final InternetDomainName domain) {
parts = domain.parts();
}
/**
- * Indicates if two CrawlDomain instances are equal or not.
- * Crawl domains with the same domain name are considered equal.
+ * Indicates if two CrawlDomain instances are equal. Crawl domains with the same
+ * domain name are considered equal.
+ *
+ * @param obj a CrawlDomain instance
*
- * @param obj A CrawlDomain instance
* @return true if equal, false otherwise
*/
@Override
@@ -59,10 +61,9 @@ public boolean equals(final Object obj) {
}
/**
- * Calculates the hash code from the individual components of the domain
- * name.
+ * Calculates the hash code from the individual components of the domain name.
*
- * @return The hash code for the crawl domain
+ * @return the hash code for the crawl domain
*/
@Override
public int hashCode() {
@@ -72,7 +73,8 @@ public int hashCode() {
/**
* Indicates if this crawl domain contains the specific internet domain.
*
- * @param domain An immutable well-formed internet domain name
+ * @param domain an immutable well-formed internet domain name
+ *
* @return true if belongs, false otherwise
*/
public boolean contains(final InternetDomainName domain) {
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
index bdcf569..d3fb6e0 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/CrawlFrontier.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2017 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,11 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package com.github.peterbencze.serritor.internal;
-import com.github.peterbencze.serritor.api.CrawlerConfiguration;
+import com.github.peterbencze.serritor.api.CrawlCandidate;
+import com.github.peterbencze.serritor.api.CrawlCandidate.CrawlCandidateBuilder;
import com.github.peterbencze.serritor.api.CrawlRequest;
-import com.github.peterbencze.serritor.internal.CrawlCandidate.CrawlCandidateBuilder;
+import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import java.io.Serializable;
import java.net.URI;
import java.util.Arrays;
@@ -31,33 +33,28 @@
import org.apache.commons.codec.digest.DigestUtils;
/**
- * Provides an interface for the crawler to manage crawl requests while
- * crawling.
+ * Manages crawl requests and provides crawl candidates to the crawler.
*
* @author Peter Bencze
*/
public final class CrawlFrontier implements Serializable {
private final CrawlerConfiguration config;
-
- private final Set allowedCrawlDomains;
private final Set urlFingerprints;
-
private final Queue candidates;
private CrawlCandidate currentCandidate;
+ /**
+ * Creates a {@link CrawlFrontier} instance.
+ *
+ * @param config the crawler configuration
+ */
public CrawlFrontier(final CrawlerConfiguration config) {
this.config = config;
-
- allowedCrawlDomains = config.getAllowedCrawlDomains();
-
urlFingerprints = new HashSet<>();
-
- // Construct a priority queue according to the crawl strategy specified in the configuration
candidates = createPriorityQueue();
- // Feed initial crawl requests (seeds)
config.getCrawlSeeds()
.forEach((CrawlRequest request) -> {
feedRequest(request, true);
@@ -67,34 +64,28 @@ public CrawlFrontier(final CrawlerConfiguration config) {
/**
* Feeds a crawl request to the frontier.
*
- * @param request The CrawlRequest instance to be fed
- * @param isCrawlSeed true if the request is a crawl seed,
- * false otherwise
+ * @param request the crawl request
+ * @param isCrawlSeed indicates if the request is a crawl seed
*/
public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) {
if (config.isOffsiteRequestFilteringEnabled()) {
- // Check if the request's domain is in the allowed crawl domains
-
boolean inCrawlDomain = false;
-
- for (CrawlDomain allowedCrawlDomain : allowedCrawlDomains) {
+
+ for (CrawlDomain allowedCrawlDomain : config.getAllowedCrawlDomains()) {
if (allowedCrawlDomain.contains(request.getDomain())) {
inCrawlDomain = true;
break;
}
}
-
+
if (!inCrawlDomain) {
return;
}
}
if (config.isDuplicateRequestFilteringEnabled()) {
- // Check if the URL has already been crawled
-
String urlFingerprint = createFingerprintForUrl(request.getRequestUrl());
-
if (urlFingerprints.contains(urlFingerprint)) {
return;
}
@@ -108,35 +99,33 @@ public void feedRequest(final CrawlRequest request, final boolean isCrawlSeed) {
int crawlDepthLimit = config.getMaximumCrawlDepth();
int nextCrawlDepth = currentCandidate.getCrawlDepth() + 1;
- // If a crawl depth limit is set, check if the candidate's crawl depth is less than or equal to the limit
if (crawlDepthLimit != 0 && nextCrawlDepth > crawlDepthLimit) {
return;
}
- builder = new CrawlCandidateBuilder(request).setRefererUrl(currentCandidate.getCandidateUrl())
+ builder = new CrawlCandidateBuilder(request)
+ .setRefererUrl(currentCandidate.getRequestUrl())
.setCrawlDepth(nextCrawlDepth);
} else {
builder = new CrawlCandidateBuilder(request);
}
- // Finally, add constructed candidate to the queue
candidates.add(builder.build());
}
/**
* Indicates if there are any candidates left in the queue.
*
- * @return true if there are candidates in the queue,
- * false otherwise
+ * @return true if there are candidates in the queue, false otherwise
*/
public boolean hasNextCandidate() {
return !candidates.isEmpty();
}
/**
- * Gets the next candidate from the queue.
+ * Returns the next crawl candidate from the queue.
*
- * @return The next CrawlCandidate instance
+ * @return the next crawl candidate from the queue
*/
public CrawlCandidate getNextCandidate() {
currentCandidate = candidates.poll();
@@ -144,23 +133,21 @@ public CrawlCandidate getNextCandidate() {
}
/**
- * Creates the fingerprint of the given URL.
+ * Creates the fingerprint of the given URL. If the URL contains query parameters, it sorts
+ * them. This way URLs with different order of query parameters get the same fingerprint.
*
- * @param url The URL that the fingerprint will be created for
- * @return The fingerprint of the URL
+ * @param url the URL for which the fingerprint is created
+ *
+ * @return the fingerprint of the URL
*/
private static String createFingerprintForUrl(final URI url) {
- // First, we start off with the host only
StringBuilder truncatedUrl = new StringBuilder(url.getHost());
- // If there is a path in the URL, we append it after the host
String path = url.getPath();
- if (path != null && !"/".equals(path)) {
+ if (path != null) {
truncatedUrl.append(path);
}
- // If there are any query params, we sort and append them to what we got so far
- // This is required in order to detect already crawled URLs with different order of query params
String query = url.getQuery();
if (query != null) {
truncatedUrl.append("?");
@@ -173,24 +160,33 @@ private static String createFingerprintForUrl(final URI url) {
.forEachOrdered(truncatedUrl::append);
}
- // Finally, create the SHA-256 hash
return DigestUtils.sha256Hex(truncatedUrl.toString());
}
/**
- * Creates a new priority queue using the specified strategy.
+ * Creates a priority queue using the strategy specified in the configuration.
*
- * @return The PriorityQueue instance for crawl requests using
- * the given comparator
+ * @return the priority queue using the strategy specified in the configuration
*/
+ @SuppressWarnings("checkstyle:MissingSwitchDefault")
private PriorityQueue createPriorityQueue() {
+ Function crawlDepthGetter
+ = (Function & Serializable) CrawlCandidate::getCrawlDepth;
+ Function priorityGetter
+ = (Function & Serializable) CrawlCandidate::getPriority;
+
switch (config.getCrawlStrategy()) {
case BREADTH_FIRST:
- return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth)
- .thenComparing((Function & Serializable) CrawlCandidate::getPriority, Comparator.reverseOrder()));
+ Comparator breadthFirstComparator = Comparator.comparing(crawlDepthGetter)
+ .thenComparing(priorityGetter, Comparator.reverseOrder());
+
+ return new PriorityQueue<>(breadthFirstComparator);
case DEPTH_FIRST:
- return new PriorityQueue<>(Comparator.comparing((Function & Serializable) CrawlCandidate::getCrawlDepth, Comparator.reverseOrder())
- .thenComparing((Function & Serializable) CrawlCandidate::getPriority, Comparator.reverseOrder()));
+ Comparator depthFirstComparator
+ = Comparator.comparing(crawlDepthGetter, Comparator.reverseOrder())
+ .thenComparing(priorityGetter, Comparator.reverseOrder());
+
+ return new PriorityQueue<>(depthFirstComparator);
}
throw new IllegalArgumentException("Unsupported crawl strategy.");
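
The reworked `createFingerprintForUrl` Javadoc above explains that query parameters are sorted before hashing. A simplified, standalone re-implementation of that idea (for illustration only, not the library's internal code verbatim) looks roughly like this:

```java
import java.net.URI;
import java.util.Arrays;
import java.util.stream.Collectors;
import org.apache.commons.codec.digest.DigestUtils;

public final class UrlFingerprintSketch {

    private UrlFingerprintSketch() {
    }

    public static String fingerprint(final URI url) {
        StringBuilder normalized = new StringBuilder(url.getHost());

        if (url.getPath() != null) {
            normalized.append(url.getPath());
        }

        if (url.getQuery() != null) {
            // Sort the query parameters so "?b=2&a=1" and "?a=1&b=2" hash to the same value
            normalized.append("?").append(Arrays.stream(url.getQuery().split("&"))
                    .sorted()
                    .collect(Collectors.joining("&")));
        }

        return DigestUtils.sha256Hex(normalized.toString());
    }
}
```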
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java b/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java
new file mode 100644
index 0000000..05e5898
--- /dev/null
+++ b/src/main/java/com/github/peterbencze/serritor/internal/EventObject.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2017 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.peterbencze.serritor.internal;
+
+import com.github.peterbencze.serritor.api.CrawlCandidate;
+
+/**
+ * Base class from which all event objects shall be derived.
+ *
+ * @author Peter Bencze
+ */
+public abstract class EventObject {
+
+ private final CrawlCandidate crawlCandidate;
+
+ /**
+ * Base constructor of all event objects.
+ *
+ * @param crawlCandidate the current crawl candidate
+ */
+ protected EventObject(final CrawlCandidate crawlCandidate) {
+ this.crawlCandidate = crawlCandidate;
+ }
+
+ /**
+ * Returns the current crawl candidate.
+ *
+ * @return the current crawl candidate
+ */
+ public final CrawlCandidate getCrawlCandidate() {
+ return crawlCandidate;
+ }
+}
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java
similarity index 54%
rename from src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java
rename to src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java
index dfedfdb..13e3484 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanism.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanism.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2018 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,33 +13,39 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package com.github.peterbencze.serritor.internal;
+
+package com.github.peterbencze.serritor.internal.crawldelaymechanism;
import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import org.openqa.selenium.JavascriptExecutor;
/**
- * A crawl delay mechanism, in which case the delay corresponds to the page
- * loading time, if it is between the specified range, otherwise the minimum or
- * maximum duration is used.
+ * A crawl delay mechanism, in which case the delay corresponds to the page loading time, if it is
+ * between the specified range, otherwise the minimum or maximum duration is used.
*
* @author Peter Bencze
*/
public final class AdaptiveCrawlDelayMechanism implements CrawlDelayMechanism {
+ private static final String BROWSER_COMPATIBILITY_JS = "return ('performance' in window) && "
+ + "('timing' in window.performance)";
+ private static final String DELAY_CALCULATION_JS = "return performance.timing.loadEventEnd - "
+ + "performance.timing.navigationStart;";
+
private final long minDelayInMillis;
private final long maxDelayInMillis;
private final JavascriptExecutor jsExecutor;
/**
- * Constructs a new AdaptiveCrawlDelayMechanism instance.
+ * Creates an {@link AdaptiveCrawlDelayMechanism} instance.
*
- * @param config The CrawlerConfiguration instance which
- * specifies the minimum and maximum delay.
- * @param jsExecutor The WebDriver instance which is capable of
- * executing JavaScript.
+ * @param config the crawler configuration which specifies the minimum and maximum delay
+ * @param jsExecutor the {@link org.openqa.selenium.WebDriver} instance which is capable of
+ * executing JavaScript
*/
- public AdaptiveCrawlDelayMechanism(final CrawlerConfiguration config, final JavascriptExecutor jsExecutor) {
+ public AdaptiveCrawlDelayMechanism(
+ final CrawlerConfiguration config,
+ final JavascriptExecutor jsExecutor) {
minDelayInMillis = config.getMinimumCrawlDelayDurationInMillis();
maxDelayInMillis = config.getMaximumCrawlDelayDurationInMillis();
this.jsExecutor = jsExecutor;
@@ -48,24 +54,22 @@ public AdaptiveCrawlDelayMechanism(final CrawlerConfiguration config, final Java
/**
* Checks if the browser supports the Navigation Timing API.
*
- * @return true if the browser is compatible,
- * false otherwise
+ * @return true if the browser is compatible, false otherwise
*/
public boolean isBrowserCompatible() {
- return (boolean) jsExecutor.executeScript("return ('performance' in window) && ('timing' in window.performance)");
+ return (boolean) jsExecutor.executeScript(BROWSER_COMPATIBILITY_JS);
}
/**
- * Calculates the page loading time and returns the delay accordingly,
- * between the specified min-max range. If the calculated delay is smaller
- * than the minimum, it returns the minimum delay. If the calculated delay
- * is higher than the maximum, it returns the maximum delay.
+ * Calculates the page loading time and returns the delay accordingly, between the specified
+ * min-max range. If the calculated delay is smaller than the minimum, it returns the minimum
+ * delay. If the calculated delay is higher than the maximum, it returns the maximum delay.
*
- * @return The delay in milliseconds
+ * @return the delay in milliseconds
*/
@Override
public long getDelay() {
- long delayInMillis = (long) jsExecutor.executeScript("return performance.timing.loadEventEnd - performance.timing.navigationStart;");
+ long delayInMillis = (long) jsExecutor.executeScript(DELAY_CALCULATION_JS);
if (delayInMillis < minDelayInMillis) {
return minDelayInMillis;
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java
similarity index 71%
rename from src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java
rename to src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java
index 34317b1..4f1d34d 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/CrawlDelayMechanism.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/CrawlDelayMechanism.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2018 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,19 +13,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package com.github.peterbencze.serritor.internal;
+
+package com.github.peterbencze.serritor.internal.crawldelaymechanism;
/**
- * An interface that every crawl delay mechanism should implement.
- *
+ * An interface which should be implemented by every crawl delay mechanism.
+ *
* @author Peter Bencze
*/
public interface CrawlDelayMechanism {
-
+
/**
- * Returns the delay that should pass between each request.
- *
- * @return The duration of delay in milliseconds
+ * Returns the delay which should pass between each request.
+ *
+ * @return the duration of delay in milliseconds
*/
long getDelay();
}
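
Because `CrawlDelayMechanism` is now a public interface in its own package, a toy implementation only needs to supply `getDelay()`; the built-in mechanisms in this package follow the same contract. A minimal sketch:

```java
import com.github.peterbencze.serritor.internal.crawldelaymechanism.CrawlDelayMechanism;

// Toy implementation for illustration only
public final class ConstantDelayMechanism implements CrawlDelayMechanism {

    @Override
    public long getDelay() {
        return 1_000; // always wait one second between requests
    }
}
```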
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java
similarity index 77%
rename from src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java
rename to src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java
index a3f84c8..9713f8b 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanism.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanism.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2018 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,13 +13,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package com.github.peterbencze.serritor.internal;
+
+package com.github.peterbencze.serritor.internal.crawldelaymechanism;
import com.github.peterbencze.serritor.api.CrawlerConfiguration;
/**
- * A crawl delay mechanism, in which case the delay is constant and equals to
- * the duration specified in the configuration.
+ * A crawl delay mechanism, in which case the delay is constant and equal to the duration specified
+ * in the configuration.
*
* @author Peter Bencze
*/
@@ -28,10 +29,9 @@ public final class FixedCrawlDelayMechanism implements CrawlDelayMechanism {
private final long delayInMillis;
/**
- * Constructs a new FixedCrawlDelayMechanism instance.
+ * Creates a {@link FixedCrawlDelayMechanism} instance.
*
- * @param config The CrawlerConfiguration instance which
- * specifies the fixed delay duration.
+ * @param config the crawler configuration which specifies the fixed delay duration
*/
public FixedCrawlDelayMechanism(final CrawlerConfiguration config) {
this.delayInMillis = config.getFixedCrawlDelayDurationInMillis();
@@ -40,7 +40,7 @@ public FixedCrawlDelayMechanism(final CrawlerConfiguration config) {
/**
* Returns the fixed delay specified in the configuration.
*
- * @return The delay in milliseconds
+ * @return the delay in milliseconds
*/
@Override
public long getDelay() {
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java
similarity index 78%
rename from src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java
rename to src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java
index f8a7446..a457da3 100644
--- a/src/main/java/com/github/peterbencze/serritor/internal/RandomCrawlDelayMechanism.java
+++ b/src/main/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/RandomCrawlDelayMechanism.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2018 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,14 +13,15 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package com.github.peterbencze.serritor.internal;
+
+package com.github.peterbencze.serritor.internal.crawldelaymechanism;
import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import java.util.concurrent.ThreadLocalRandom;
/**
- * A crawl delay mechanism in which case the duration is randomized between the
- * specified minimum and maximum range.
+ * A crawl delay mechanism in which case the duration is randomized between the specified minimum
+ * and maximum range.
*
* @author Peter Bencze
*/
@@ -30,10 +31,9 @@ public final class RandomCrawlDelayMechanism implements CrawlDelayMechanism {
private final long upperLimit;
/**
- * Constructs a new RandomCrawlDelayMechanism instance.
+ * Creates a {@link RandomCrawlDelayMechanism} instance.
*
- * @param config The CrawlerConfiguration instance which
- * specifies the minimum and maximum delay.
+ * @param config the crawler configuration which specifies the minimum and maximum delay.
*/
public RandomCrawlDelayMechanism(final CrawlerConfiguration config) {
lowerLimit = config.getMinimumCrawlDelayDurationInMillis();
@@ -41,10 +41,9 @@ public RandomCrawlDelayMechanism(final CrawlerConfiguration config) {
}
/**
- * Returns a random delay between the minimum and maximum range specified in
- * the configuration.
+ * Returns a random delay between the minimum and maximum range specified in the configuration.
*
- * @return The delay in milliseconds
+ * @return the delay in milliseconds
*/
@Override
public long getDelay() {
diff --git a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java
index 86e5fa6..d33da36 100644
--- a/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java
+++ b/src/test/java/com/github/peterbencze/serritor/api/helper/UrlFinderTest.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2018 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,9 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package com.github.peterbencze.serritor.api.helper;
-import com.github.peterbencze.serritor.api.HtmlResponse;
+import com.github.peterbencze.serritor.api.event.PageLoadEvent;
import com.github.peterbencze.serritor.api.helper.UrlFinder.UrlFinderBuilder;
import java.util.Arrays;
import java.util.List;
@@ -29,55 +30,57 @@
import org.openqa.selenium.WebElement;
/**
- * Test cases for UrlFinder.
+ * Test cases for {@link UrlFinder}.
*
* @author Peter Bencze
*/
public final class UrlFinderTest {
-
+
private static final Pattern URL_PATTERN = Pattern.compile(".+valid-url.+");
private static final String ATTRIBUTE = "href";
private static final String TAG_NAME = "a";
private static final String VALID_URL = "http://valid-url.com";
private static final String INVALID_URL = "invalid-url";
private static final String URL_WITH_INVALID_DOMAIN = "http://invalid.domain";
-
+
private WebDriver mockedDriver;
- private HtmlResponse mockedResponse;
+ private PageLoadEvent mockedEvent;
private WebElement mockedElementWithValidUrl;
private WebElement mockedElementWithInvalidUrlFormat;
- private WebElement mockedElementWithInvalidDomain;
+ private WebElement mockedElementWithInvalidDomain;
private UrlFinder urlFinder;
@Before
public void initialize() {
- mockedResponse = Mockito.mock(HtmlResponse.class);
-
+ mockedEvent = Mockito.mock(PageLoadEvent.class);
+
mockedDriver = Mockito.mock(WebDriver.class);
- Mockito.when(mockedResponse.getWebDriver())
- .thenReturn(mockedDriver);
-
+ Mockito.when(mockedEvent.getWebDriver())
+ .thenReturn(mockedDriver);
+
mockedElementWithValidUrl = Mockito.mock(WebElement.class);
Mockito.when(mockedElementWithValidUrl.getAttribute(Mockito.eq(ATTRIBUTE)))
- .thenReturn(VALID_URL);
-
+ .thenReturn(VALID_URL);
+
mockedElementWithInvalidUrlFormat = Mockito.mock(WebElement.class);
Mockito.when(mockedElementWithInvalidUrlFormat.getAttribute(Mockito.eq(ATTRIBUTE)))
- .thenReturn(INVALID_URL);
-
+ .thenReturn(INVALID_URL);
+
mockedElementWithInvalidDomain = Mockito.mock(WebElement.class);
Mockito.when(mockedElementWithInvalidDomain.getAttribute(Mockito.eq(ATTRIBUTE)))
.thenReturn(URL_WITH_INVALID_DOMAIN);
- List elementList = Arrays.asList(mockedElementWithValidUrl, mockedElementWithInvalidUrlFormat, mockedElementWithInvalidDomain);
+ List elementList
+ = Arrays.asList(mockedElementWithValidUrl, mockedElementWithInvalidUrlFormat,
+ mockedElementWithInvalidDomain);
Mockito.when(mockedDriver.findElements(By.tagName(TAG_NAME)))
.thenReturn(elementList);
-
+
urlFinder = new UrlFinderBuilder(URL_PATTERN).build();
}
@Test
- public void testFindUrlsInResponse() {
- Assert.assertEquals(Arrays.asList(VALID_URL), urlFinder.findUrlsInResponse(mockedResponse));
+ public void testFindUrlsInPage() {
+ Assert.assertEquals(Arrays.asList(VALID_URL), urlFinder.findUrlsInPage(mockedEvent));
}
}
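
The updated test above exercises the renamed `findUrlsInPage` method and shows how a `UrlFinder` is assembled through its builder. A minimal usage sketch based only on what the test relies on (the pattern-only builder, and, judging by the test constants, the `href` attribute of anchor elements as the default source) might be:

```java
import java.util.List;
import java.util.regex.Pattern;

import com.github.peterbencze.serritor.api.helper.UrlFinder;
import com.github.peterbencze.serritor.api.helper.UrlFinder.UrlFinderBuilder;

// Keep only URLs matching the given pattern; judging by the test constants, the
// finder looks at the href attribute of anchor ("a") elements by default.
UrlFinder urlFinder = new UrlFinderBuilder(Pattern.compile(".+/product/.+")).build();

// Inside an onPageLoad callback, where "event" is the received PageLoadEvent:
List<String> matchingUrls = urlFinder.findUrlsInPage(event);
```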
diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java
index 8226d10..4bdb829 100644
--- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java
+++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlDomainTest.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2018 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package com.github.peterbencze.serritor.internal;
import com.google.common.net.InternetDomainName;
@@ -20,47 +21,37 @@
import org.junit.Test;
/**
- * Test cases for CrawlDomain.
- *
+ * Test cases for {@link CrawlDomain}.
+ *
* @author Peter Bencze
*/
public final class CrawlDomainTest {
-
+
private static final InternetDomainName DOMAIN = InternetDomainName.from("test.com");
private static final InternetDomainName SUBDOMAIN = InternetDomainName.from("sub.test.com");
-
+
private static final int DOMAIN_PARTS_HASHCODE = DOMAIN.parts().hashCode();
-
+
private static final CrawlDomain CRAWL_DOMAIN_0 = new CrawlDomain(DOMAIN);
private static final CrawlDomain CRAWL_DOMAIN_1 = new CrawlDomain(DOMAIN);
private static final CrawlDomain CRAWL_DOMAIN_2 = new CrawlDomain(SUBDOMAIN);
-
+
@Test
public void testEquals() {
- // A crawl domain should be equal with itself
Assert.assertEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_0);
-
- // Crawl domains with the same domain should be equal
Assert.assertEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_1);
-
- // Crawl domains with different domains should not be equal
Assert.assertNotEquals(CRAWL_DOMAIN_0, CRAWL_DOMAIN_2);
}
-
+
@Test
public void testHashCode() {
Assert.assertEquals(DOMAIN_PARTS_HASHCODE, CRAWL_DOMAIN_0.hashCode());
}
-
+
@Test
public void testContains() {
- // A crawl domain should contain its own domain
Assert.assertTrue(CRAWL_DOMAIN_0.contains(DOMAIN));
-
- // A crawl domain should contain its own domain's subdomain
Assert.assertTrue(CRAWL_DOMAIN_0.contains(SUBDOMAIN));
-
- // A crawl domain should not contain a domain different from its own domain
Assert.assertFalse(CRAWL_DOMAIN_2.contains(DOMAIN));
}
}
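
`CrawlDomain` is internal API, but the assertions above pin down its behavior: a crawl domain contains its own domain and that domain's subdomains, and nothing else. A plain-Java restatement of those expectations (shown only to illustrate the tested behavior, not as a usage recommendation):

```java
import com.github.peterbencze.serritor.internal.CrawlDomain;
import com.google.common.net.InternetDomainName;

// Restating the behavior the tests above assert:
CrawlDomain crawlDomain = new CrawlDomain(InternetDomainName.from("test.com"));

crawlDomain.contains(InternetDomainName.from("test.com"));     // true: its own domain
crawlDomain.contains(InternetDomainName.from("sub.test.com")); // true: a subdomain of it
crawlDomain.contains(InternetDomainName.from("other.com"));    // false: an unrelated domain
```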
diff --git a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
index 79c5131..6ddf172 100644
--- a/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
+++ b/src/test/java/com/github/peterbencze/serritor/internal/CrawlFrontierTest.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2017 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,12 +13,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package com.github.peterbencze.serritor.internal;
-import com.github.peterbencze.serritor.api.CrawlerConfiguration;
+import com.github.peterbencze.serritor.api.CrawlCandidate;
import com.github.peterbencze.serritor.api.CrawlRequest;
import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
import com.github.peterbencze.serritor.api.CrawlStrategy;
+import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder;
import java.net.URI;
import java.util.Arrays;
@@ -30,7 +32,7 @@
import org.mockito.Mockito;
/**
- * Test cases for CrawlFrontier.
+ * Test cases for {@link CrawlFrontier}.
*
* @author Peter Bencze
*/
@@ -39,11 +41,14 @@ public final class CrawlFrontierTest {
// Allowed crawl domains
private static final String ALLOWED_CRAWL_DOMAIN_0 = "root-url-0.com";
private static final String ALLOWED_CRAWL_DOMAIN_1 = "root-url-1.com";
- private static final List ALLOWED_CRAWL_DOMAINS = Arrays.asList(ALLOWED_CRAWL_DOMAIN_0, ALLOWED_CRAWL_DOMAIN_1);
+ private static final List ALLOWED_CRAWL_DOMAINS
+ = Arrays.asList(ALLOWED_CRAWL_DOMAIN_0, ALLOWED_CRAWL_DOMAIN_1);
// Root URLs
- private static final URI ROOT_URL_0 = URI.create("http://root-url-0.com?param1=foo&param2=bar#fragment");
- private static final URI DUPLICATE_ROOT_URL_0 = URI.create("https://root-url-0.com?param2=bar&param1=foo");
+ private static final URI ROOT_URL_0
+ = URI.create("http://root-url-0.com?param1=foo¶m2=bar#fragment");
+ private static final URI DUPLICATE_ROOT_URL_0
+ = URI.create("https://root-url-0.com?param2=bar¶m1=foo");
private static final URI ROOT_URL_1 = URI.create("http://root-url-1.com");
// Root URL crawl depth
@@ -54,18 +59,25 @@ public final class CrawlFrontierTest {
private static final int ROOT_URL_1_PRIORITY = 1;
// Root URL crawl requests
- private static final CrawlRequest ROOT_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_0).setPriority(ROOT_URL_0_PRIORITY).build();
- private static final CrawlRequest DUPLICATE_ROOT_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(DUPLICATE_ROOT_URL_0).build();
- private static final CrawlRequest ROOT_URL_1_CRAWL_REQUEST = new CrawlRequestBuilder(ROOT_URL_1).setPriority(ROOT_URL_1_PRIORITY).build();
- private static final List CRAWL_SEEDS = Arrays.asList(ROOT_URL_0_CRAWL_REQUEST, ROOT_URL_1_CRAWL_REQUEST);
+ private static final CrawlRequest ROOT_URL_0_CRAWL_REQUEST
+ = new CrawlRequestBuilder(ROOT_URL_0).setPriority(ROOT_URL_0_PRIORITY).build();
+ private static final CrawlRequest DUPLICATE_ROOT_URL_0_CRAWL_REQUEST
+ = new CrawlRequestBuilder(DUPLICATE_ROOT_URL_0).build();
+ private static final CrawlRequest ROOT_URL_1_CRAWL_REQUEST
+ = new CrawlRequestBuilder(ROOT_URL_1).setPriority(ROOT_URL_1_PRIORITY).build();
+ private static final List CRAWL_SEEDS
+ = Arrays.asList(ROOT_URL_0_CRAWL_REQUEST, ROOT_URL_1_CRAWL_REQUEST);
// Child URL path
private static final String CHILD_URL_PATH = "/child";
// Child URLs
- private static final URI CHILD_URL_0 = URI.create(String.format("http://root-url-0.com%s-0", CHILD_URL_PATH));
- private static final URI CHILD_URL_1 = URI.create(String.format("http://root-url-0.com%s-1", CHILD_URL_PATH));
- private static final URI CHILD_URL_2 = URI.create(String.format("http://root-url-1.com%s-0", CHILD_URL_PATH));
+ private static final URI CHILD_URL_0
+ = URI.create(String.format("http://root-url-0.com%s-0", CHILD_URL_PATH));
+ private static final URI CHILD_URL_1
+ = URI.create(String.format("http://root-url-0.com%s-1", CHILD_URL_PATH));
+ private static final URI CHILD_URL_2
+ = URI.create(String.format("http://root-url-1.com%s-0", CHILD_URL_PATH));
// Child URL crawl depth
private static final int CHILD_URL_CRAWL_DEPTH = 1;
@@ -75,10 +87,13 @@ public final class CrawlFrontierTest {
private static final int CHILD_URL_1_PRIORITY = CHILD_URL_0_PRIORITY;
private static final int CHILD_URL_2_PRIORITY = 1;
- // Child URL crawl requests
- private static final CrawlRequest CHILD_URL_0_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_0).setPriority(CHILD_URL_0_PRIORITY).build();
- private static final CrawlRequest CHILD_URL_1_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_1).setPriority(CHILD_URL_1_PRIORITY).build();
- private static final CrawlRequest CHILD_URL_2_CRAWL_REQUEST = new CrawlRequestBuilder(CHILD_URL_2).setPriority(CHILD_URL_2_PRIORITY).build();
+ // Child URL crawl requests
+ private static final CrawlRequest CHILD_URL_0_CRAWL_REQUEST
+ = new CrawlRequestBuilder(CHILD_URL_0).setPriority(CHILD_URL_0_PRIORITY).build();
+ private static final CrawlRequest CHILD_URL_1_CRAWL_REQUEST
+ = new CrawlRequestBuilder(CHILD_URL_1).setPriority(CHILD_URL_1_PRIORITY).build();
+ private static final CrawlRequest CHILD_URL_2_CRAWL_REQUEST
+ = new CrawlRequestBuilder(CHILD_URL_2).setPriority(CHILD_URL_2_PRIORITY).build();
// Offsite URL
private static final URI OFFSITE_URL = URI.create("http://offsite-url.com");
@@ -87,7 +102,8 @@ public final class CrawlFrontierTest {
private static final int OFFSITE_URL_PRIORITY = 0;
// Offsite URL crawl request
- private static final CrawlRequest OFFSITE_URL_CRAWL_REQUEST = new CrawlRequestBuilder(OFFSITE_URL).setPriority(OFFSITE_URL_PRIORITY).build();
+ private static final CrawlRequest OFFSITE_URL_CRAWL_REQUEST
+ = new CrawlRequestBuilder(OFFSITE_URL).setPriority(OFFSITE_URL_PRIORITY).build();
// Max crawl depth
private static final int MAX_CRAWL_DEPTH = 1;
@@ -107,38 +123,27 @@ public void initialize() {
@Test
public void testHasNextCandidateWithCandidatesInQueue() {
- // Check if there are any candidates in the queue, the method should return true
Assert.assertTrue(frontier.hasNextCandidate());
- // Get the next candidate from the queue
frontier.getNextCandidate();
- // Check if there are any candidates in the queue, the method should return true again
Assert.assertTrue(frontier.hasNextCandidate());
- // Get the next candidate from the queue
frontier.getNextCandidate();
- // Check if there are any candidates in the queue, the method should return false at this point
Assert.assertFalse(frontier.hasNextCandidate());
- // Feed child crawl requests
frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false);
frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false);
- // Check if there are any candidates in the queue, the method should return true
Assert.assertTrue(frontier.hasNextCandidate());
- // Get the next candidate from the queue
frontier.getNextCandidate();
- // Check if there are any candidates in the queue, the method should return true once again
Assert.assertTrue(frontier.hasNextCandidate());
- // Get the next candidate from the queue
frontier.getNextCandidate();
- // Finally, check if there are any candidates in the queue, the method should return false at this point
Assert.assertFalse(frontier.hasNextCandidate());
}
@@ -147,10 +152,9 @@ public void testHasNextCandidateWithEmptyQueue() {
Mockito.when(config.getCrawlSeeds())
.thenReturn(Collections.EMPTY_SET);
- // Create frontier without any crawl seeds
+ // Create crawl frontier without crawl seeds
frontier = new CrawlFrontier(config);
- // Check if there are any candidates in the queue, the method should return false
Assert.assertFalse(frontier.hasNextCandidate());
}
@@ -158,10 +162,8 @@ public void testHasNextCandidateWithEmptyQueue() {
public void testEnabledDuplicateRequestFiltering() {
clearCrawlCandidateQueue();
- // Feed a duplicate crawl request
frontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, false);
- // Check if the candidate was added to the queue, the method should return false
Assert.assertFalse(frontier.hasNextCandidate());
}
@@ -171,27 +173,20 @@ public void testDisabledDuplicateRequestFiltering() {
Mockito.when(config.isDuplicateRequestFilteringEnabled())
.thenReturn(false);
- // Clear the crawl candidate queue of the frontier
clearCrawlCandidateQueue();
- // Feed a duplicate crawl request
frontier.feedRequest(DUPLICATE_ROOT_URL_0_CRAWL_REQUEST, true);
- // Check if the candidates was added to the queue, the method should return true
Assert.assertTrue(frontier.hasNextCandidate());
-
- // Check if the URLs match
- Assert.assertEquals(DUPLICATE_ROOT_URL_0, frontier.getNextCandidate().getCandidateUrl());
+ Assert.assertEquals(DUPLICATE_ROOT_URL_0, frontier.getNextCandidate().getRequestUrl());
}
@Test
public void testEnabledOffsiteRequestFiltering() {
clearCrawlCandidateQueue();
- // Feed an offsite request
frontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST, false);
- // Check if the candidate was added to the queue, the method should return false
Assert.assertFalse(frontier.hasNextCandidate());
}
@@ -201,90 +196,53 @@ public void testDisabledOffsiteRequestFiltering() {
Mockito.when(config.isOffsiteRequestFilteringEnabled())
.thenReturn(false);
- // Clear the crawl candidate queue of the frontier
clearCrawlCandidateQueue();
- // Feed an offsite request
frontier.feedRequest(OFFSITE_URL_CRAWL_REQUEST, false);
- // Check if the candidates was added to the queue, the method should return true
Assert.assertTrue(frontier.hasNextCandidate());
-
- // Check if the URLs match
- Assert.assertEquals(OFFSITE_URL.toString(), frontier.getNextCandidate().getCandidateUrl().toString());
+ Assert.assertEquals(OFFSITE_URL.toString(),
+ frontier.getNextCandidate().getRequestUrl().toString());
}
@Test
public void testGetNextCandidateUsingBreadthFirstCrawlStrategy() {
- // Get the crawl candidate of root URL 1.
CrawlCandidate nextCandidate = frontier.getNextCandidate();
- // Check the URL of this candidate, it should be root URL 1.
- Assert.assertEquals(ROOT_URL_1, nextCandidate.getCandidateUrl());
-
- // Check the crawl depth of this candidate, it should be 0 because it is a root URL.
+ Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl());
Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
-
- // Check the priority of this candidate, it should be 1.
Assert.assertEquals(ROOT_URL_1_PRIORITY, nextCandidate.getPriority());
- // Feed a child request that come from root URL 1.
frontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false);
- // Get the crawl candidate of root URL 0.
nextCandidate = frontier.getNextCandidate();
- // Check the URL of this candidate, it should be root URL 0.
- Assert.assertEquals(ROOT_URL_0, nextCandidate.getCandidateUrl());
-
- // Check the crawl depth of this candidate, it should be 0 again because it is also a root URL.
+ Assert.assertEquals(ROOT_URL_0, nextCandidate.getRequestUrl());
Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
-
- // Check the priority of this candidate, it should be 0.
Assert.assertEquals(ROOT_URL_0_PRIORITY, nextCandidate.getPriority());
- // Feed 2 child requests that come from root URL 0.
frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false);
frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false);
- // Get the crawl candidate of child URL 2.
nextCandidate = frontier.getNextCandidate();
- // Check the URL of this candidate, it should be child URL 2.
- Assert.assertEquals(CHILD_URL_2.toString(), nextCandidate.getCandidateUrl().toString());
-
- // Check the crawl depth of this candidate, it should be 1 because it is a child URL that comes from root URL 1.
+ Assert.assertEquals(CHILD_URL_2.toString(), nextCandidate.getRequestUrl().toString());
Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
-
- // Check the priority of this candidate, it should be 1.
Assert.assertEquals(CHILD_URL_2_PRIORITY, nextCandidate.getPriority());
- // Get the crawl candidate of a child URL.
- // Note: a priority queue does not ensure FIFO order when elements have the same depth and priority
+ // A priority queue doesn't ensure FIFO order when elements have the same depth and priority
nextCandidate = frontier.getNextCandidate();
- // Check the URL of this request, it should be a child URL.
- Assert.assertTrue(nextCandidate.getCandidateUrl().toString().contains(CHILD_URL_PATH));
-
- // Check the crawl depth of this candidate, it should be 1 again because it is a child URL that comes from root URL 0.
+ Assert.assertTrue(nextCandidate.getRequestUrl().toString().contains(CHILD_URL_PATH));
Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
- // Get the priority of this candidate
int previousChildCandidatePriority = nextCandidate.getPriority();
- // Get the crawl candidate of the next child URL.
nextCandidate = frontier.getNextCandidate();
- // Check the URL of this candidate, it should be a child URL.
- Assert.assertTrue(nextCandidate.getCandidateUrl().toString().contains(CHILD_URL_PATH));
-
- // Check the crawl depth of this candidate, it should be 1 again becaise it is another child URL that also comes from root URL 0.
+ Assert.assertTrue(nextCandidate.getRequestUrl().toString().contains(CHILD_URL_PATH));
Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
-
- // Compare the priority of this candidate to the previous candidate's priority.
Assert.assertEquals(previousChildCandidatePriority, nextCandidate.getPriority());
-
- // There should be no more candidates left at this point.
Assert.assertFalse(frontier.hasNextCandidate());
}
@@ -296,75 +254,41 @@ public void testGetNextCandidateUsingDepthFirstCrawlStrategy() {
// Create frontier with depth-first crawl strategy
frontier = new CrawlFrontier(config);
- // Get the crawl candidate of root URL 1
CrawlCandidate nextCandidate = frontier.getNextCandidate();
- // Check the URL of this candidate, it should be root URL 1
- Assert.assertEquals(ROOT_URL_1, nextCandidate.getCandidateUrl());
-
- // Check the crawl depth of this candidate, it should be 0 because it is a root URL
+ Assert.assertEquals(ROOT_URL_1, nextCandidate.getRequestUrl());
Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
-
- // Check the priority of this candidate, it should be 1
Assert.assertEquals(ROOT_URL_1_PRIORITY, nextCandidate.getPriority());
- // Feed a child request that comes from root URL 1
frontier.feedRequest(CHILD_URL_2_CRAWL_REQUEST, false);
- // Get the crawl candidate of a child URL
- // Note: a priority queue does not ensure FIFO order when elements have the same depth and priority
+ // A priority queue doesn't ensure FIFO order when elements have the same depth and priority
nextCandidate = frontier.getNextCandidate();
- // Check the URL of this candidate, it should be a child URL
- Assert.assertTrue(nextCandidate.getCandidateUrl().toString().contains(CHILD_URL_PATH));
-
- // Check the crawl depth of this candidate, it should be 1 because it is a child URL that comes from root URL 1
+ Assert.assertTrue(nextCandidate.getRequestUrl().toString().contains(CHILD_URL_PATH));
Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
-
- // Check the priority of this candidate, it should be 1
Assert.assertEquals(CHILD_URL_2_PRIORITY, nextCandidate.getPriority());
- // Get the crawl candidate of root URL 0.
nextCandidate = frontier.getNextCandidate();
- // Check the URL of this candidate, it should be root URL 0
- Assert.assertEquals(ROOT_URL_0, nextCandidate.getCandidateUrl());
-
- // Check the crawl depth of this candidate, it should be 0 again because it is also a root URL
+ Assert.assertEquals(ROOT_URL_0, nextCandidate.getRequestUrl());
Assert.assertEquals(ROOT_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
-
- // Check the priority of this candidate, it should be 0
Assert.assertEquals(ROOT_URL_0_PRIORITY, nextCandidate.getPriority());
- // Feed 2 child requests that come from root URL 0
frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false);
frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false);
- // Get the crawl candidate of child URL 0
nextCandidate = frontier.getNextCandidate();
- // Check the URL of this candidate, it should be child URL 0
- Assert.assertEquals(CHILD_URL_0.toString(), nextCandidate.getCandidateUrl().toString());
-
- // Check the crawl depth of this candidate, it should be 1 again because it is a child URL that comes from root URL 0
+ Assert.assertEquals(CHILD_URL_0.toString(), nextCandidate.getRequestUrl().toString());
Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
-
- // Check the priority of this candidate, it should be 0
Assert.assertEquals(CHILD_URL_0_PRIORITY, nextCandidate.getPriority());
- // Get the crawl candidate of child URL 1
nextCandidate = frontier.getNextCandidate();
- // Check the URL of this candidate, it should be child URL 1
- Assert.assertEquals(CHILD_URL_1.toString(), nextCandidate.getCandidateUrl().toString());
-
- // Check the crawl depth of this candidate, it should be 1 again becaise it is a child URL that also comes from root URL 0
+ Assert.assertEquals(CHILD_URL_1.toString(), nextCandidate.getRequestUrl().toString());
Assert.assertEquals(CHILD_URL_CRAWL_DEPTH, nextCandidate.getCrawlDepth());
-
- // Check the priority of this candidate, it should be 0
Assert.assertEquals(CHILD_URL_1_PRIORITY, nextCandidate.getPriority());
-
- // There should be no more candidates left at this point
Assert.assertFalse(frontier.hasNextCandidate());
}
@@ -373,27 +297,20 @@ public void testCrawlDepthLimitation() {
Mockito.when(config.getMaximumCrawlDepth())
.thenReturn(MAX_CRAWL_DEPTH);
- // Clear the crawl candidate queue of the frontier
clearCrawlCandidateQueue();
- // Feed a child request, its crawl depth will be 1
frontier.feedRequest(CHILD_URL_0_CRAWL_REQUEST, false);
- // Get the crawl candidate of the previously added child URL
CrawlCandidate nextCandidate = frontier.getNextCandidate();
- // Check its crawl depth, it should be less than or equal to the limit
Assert.assertTrue(nextCandidate.getCrawlDepth() <= MAX_CRAWL_DEPTH);
- // Feed another child request, its crawl depth will be 2 which is above the limit
frontier.feedRequest(CHILD_URL_1_CRAWL_REQUEST, false);
- // There should be no more candidates at this point
Assert.assertFalse(frontier.hasNextCandidate());
}
private void clearCrawlCandidateQueue() {
- // Loop until there are no remaining candidates in the queue
while (frontier.hasNextCandidate()) {
frontier.getNextCandidate();
}
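
Taken together, these frontier tests describe the polling loop the crawler runs internally: seeds and discovered requests are fed in, duplicate, offsite, or too-deep requests may be filtered out, and candidates come back in an order set by the crawl strategy. A rough sketch of that loop, using only the calls the tests themselves make (the meaning of `feedRequest`'s boolean flag is assumed from how the tests use it; `CrawlFrontier` is internal API):

```java
import java.net.URI;

import com.github.peterbencze.serritor.api.CrawlCandidate;
import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
import com.github.peterbencze.serritor.internal.CrawlFrontier;

// Internal API, shown only to illustrate what the tests above exercise.
CrawlFrontier frontier = new CrawlFrontier(config); // config: a CrawlerConfiguration with crawl seeds

// Feed a discovered request; the boolean flag appears to distinguish seed requests
frontier.feedRequest(new CrawlRequestBuilder(URI.create("http://example.com/child")).build(), false);

while (frontier.hasNextCandidate()) {
    CrawlCandidate candidate = frontier.getNextCandidate();
    System.out.println(candidate.getRequestUrl()
            + " depth=" + candidate.getCrawlDepth()
            + " priority=" + candidate.getPriority());
}
```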
diff --git a/src/test/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanismTest.java b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java
similarity index 78%
rename from src/test/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanismTest.java
rename to src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java
index 60d5b3e..166df00 100644
--- a/src/test/java/com/github/peterbencze/serritor/internal/AdaptiveCrawlDelayMechanismTest.java
+++ b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/AdaptiveCrawlDelayMechanismTest.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2018 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package com.github.peterbencze.serritor.internal;
+package com.github.peterbencze.serritor.internal.crawldelaymechanism;
import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import java.time.Duration;
@@ -25,62 +25,58 @@
import org.openqa.selenium.JavascriptExecutor;
/**
- * Test cases for AdaptiveCrawlDelayMechanism.
- *
+ * Test cases for {@link AdaptiveCrawlDelayMechanism}.
+ *
* @author Peter Bencze
*/
public final class AdaptiveCrawlDelayMechanismTest {
-
+
private static final long LOWER_DELAY_DURATION_IN_MILLIS = Duration.ZERO.toMillis();
- private static final long MINIMUM_DELAY_DURATION_IN_MILLIS = Duration.ofSeconds(1).toMillis();
+ private static final long MINIMUM_DELAY_DURATION_IN_MILLIS = Duration.ofSeconds(1).toMillis();
private static final long IN_RANGE_DELAY_DURATION_IN_MILLIS = Duration.ofSeconds(2).toMillis();
private static final long MAXIMUM_DELAY_DURATION_IN_MILLIS = Duration.ofSeconds(3).toMillis();
private static final long HIGHER_DELAY_DURATION_IN_MILLIS = Duration.ofSeconds(4).toMillis();
-
+
private CrawlerConfiguration mockedConfig;
- private JavascriptExecutor mockedJsExecutor;
+ private JavascriptExecutor mockedJsExecutor;
private AdaptiveCrawlDelayMechanism crawlDelayMechanism;
-
+
@Before
public void initialize() {
mockedConfig = Mockito.mock(CrawlerConfiguration.class);
Mockito.when(mockedConfig.getMinimumCrawlDelayDurationInMillis())
- .thenReturn(MINIMUM_DELAY_DURATION_IN_MILLIS);
+ .thenReturn(MINIMUM_DELAY_DURATION_IN_MILLIS);
Mockito.when(mockedConfig.getMaximumCrawlDelayDurationInMillis())
.thenReturn(MAXIMUM_DELAY_DURATION_IN_MILLIS);
-
+
mockedJsExecutor = Mockito.mock(JavascriptExecutor.class);
-
+
crawlDelayMechanism = new AdaptiveCrawlDelayMechanism(mockedConfig, mockedJsExecutor);
}
-
+
@Test
public void testDelayLowerThanMinimum() {
- // Return a delay which is lower than the predefined minimum
Mockito.when(mockedJsExecutor.executeScript(Mockito.anyString()))
.thenReturn(LOWER_DELAY_DURATION_IN_MILLIS);
-
- // The minimum delay should be returned
- Assert.assertEquals(mockedConfig.getMinimumCrawlDelayDurationInMillis(), crawlDelayMechanism.getDelay());
+
+ Assert.assertEquals(mockedConfig.getMinimumCrawlDelayDurationInMillis(),
+ crawlDelayMechanism.getDelay());
}
-
+
@Test
public void testDelayHigherThanMaximum() {
- // Return a delay which is higher than the predefined maximum
Mockito.when(mockedJsExecutor.executeScript(Mockito.anyString()))
.thenReturn(HIGHER_DELAY_DURATION_IN_MILLIS);
-
- // The maximum delay should be returned
- Assert.assertEquals(mockedConfig.getMaximumCrawlDelayDurationInMillis(), crawlDelayMechanism.getDelay());
+
+ Assert.assertEquals(mockedConfig.getMaximumCrawlDelayDurationInMillis(),
+ crawlDelayMechanism.getDelay());
}
-
+
@Test
public void testDelayBetweenRange() {
- // Return an in range delay
Mockito.when(mockedJsExecutor.executeScript(Mockito.anyString()))
.thenReturn(IN_RANGE_DELAY_DURATION_IN_MILLIS);
-
- // The in range delay should be returned
+
Assert.assertEquals(IN_RANGE_DELAY_DURATION_IN_MILLIS, crawlDelayMechanism.getDelay());
}
}
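
The three cases above pin down a simple clamping rule for the adaptive mechanism: the browser-reported delay (obtained through the `JavascriptExecutor`) is raised to the configured minimum and capped at the configured maximum. A plain-Java restatement of that rule (a sketch, not the library's code):

```java
// Sketch of the clamping the tests assert: below-minimum values return the minimum,
// above-maximum values return the maximum, in-range values pass through unchanged.
static long clampDelay(final long reportedDelayInMillis, final long minInMillis, final long maxInMillis) {
    return Math.min(maxInMillis, Math.max(minInMillis, reportedDelayInMillis));
}
```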
diff --git a/src/test/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanismTest.java b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java
similarity index 84%
rename from src/test/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanismTest.java
rename to src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java
index b2955bc..535f5f4 100644
--- a/src/test/java/com/github/peterbencze/serritor/internal/FixedCrawlDelayMechanismTest.java
+++ b/src/test/java/com/github/peterbencze/serritor/internal/crawldelaymechanism/FixedCrawlDelayMechanismTest.java
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2018 Peter Bencze.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,7 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package com.github.peterbencze.serritor.internal;
+
+package com.github.peterbencze.serritor.internal.crawldelaymechanism;
import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder;
@@ -23,24 +24,24 @@
import org.mockito.Mockito;
/**
- * Test cases for FixedCrawlDelayMechanism.
- *
+ * Test cases for {@link FixedCrawlDelayMechanism}.
+ *
* @author Peter Bencze
*/
public class FixedCrawlDelayMechanismTest {
-
+
private CrawlerConfiguration config;
private FixedCrawlDelayMechanism crawlDelayMechanism;
-
+
@Before
public void initialize() {
config = Mockito.spy(new CrawlerConfigurationBuilder().build());
crawlDelayMechanism = new FixedCrawlDelayMechanism(config);
}
-
+
@Test
public void testGetDelay() {
- // The delay should be the same as in the configuration
- Assert.assertEquals(config.getFixedCrawlDelayDurationInMillis(), crawlDelayMechanism.getDelay());
+ Assert.assertEquals(config.getFixedCrawlDelayDurationInMillis(),
+ crawlDelayMechanism.getDelay());
}
}
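
The fixed mechanism's single test states its whole contract: `getDelay` simply returns the fixed duration held by the configuration. A minimal sketch mirroring the test's setup (the class is internal API, shown only to restate what the test verifies):

```java
import com.github.peterbencze.serritor.api.CrawlerConfiguration;
import com.github.peterbencze.serritor.api.CrawlerConfiguration.CrawlerConfigurationBuilder;
import com.github.peterbencze.serritor.internal.crawldelaymechanism.FixedCrawlDelayMechanism;

// Mirrors the test above: the delay equals the configured fixed crawl delay duration.
CrawlerConfiguration config = new CrawlerConfigurationBuilder().build();
FixedCrawlDelayMechanism delayMechanism = new FixedCrawlDelayMechanism(config);

long delayInMillis = delayMechanism.getDelay(); // equals config.getFixedCrawlDelayDurationInMillis()
```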