-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #35 from peterbencze/development
Merging development into master
- Loading branch information
Showing
17 changed files
with
1,801 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
/target/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,63 @@ | ||
# Serritor | ||
Serritor | ||
======== | ||
|
||
Serritor is an open source web crawler framework built upon [Selenium](http://www.seleniumhq.org/) and written in Java. Crawling dynamic web pages is no longer a problem! | ||
|
||
## Installation | ||
### Using Maven | ||
|
||
Add the following dependency to your pom.xml: | ||
```xml | ||
<dependency> | ||
<groupId>com.github.peterbencze</groupId> | ||
<artifactId>serritor</artifactId> | ||
<version>1.0</version> | ||
</dependency> | ||
``` | ||
|
||
### Without Maven | ||
|
||
The standalone JAR files are available on the [releases](https://github.com/peterbencze/serritor/releases) page. | ||
|
||
## Documentation | ||
See the [Wiki](https://github.com/peterbencze/serritor/wiki) page. | ||
|
||
## Quickstart | ||
BaseCrawler provides a skeletal implementation of a crawler to minimize the effort to create your own. First, create a class that extends BaseCrawler. In this class, you can customize the behavior of your crawler. There are callbacks available for every stage of crawling. Below you can find a sample implementation: | ||
```java | ||
public class MyCrawler extends BaseCrawler { | ||
|
||
public MyCrawler() { | ||
config.addSeedAsString("http://yourspecificwebsite.com"); | ||
config.setFilterOffsiteRequests(true); | ||
} | ||
|
||
@Override | ||
protected void onResponseComplete(HtmlResponse response) { | ||
List<WebElement> links = response.getWebDriver().findElements(By.tagName("a")); | ||
links.stream().forEach((WebElement link) -> crawlUrlAsString(link.getAttribute("href"))); | ||
} | ||
|
||
@Override | ||
protected void onNonHtmlResponse(NonHtmlResponse response) { | ||
System.out.println("Received a non-HTML response from: " + response.getUrl()); | ||
} | ||
|
||
@Override | ||
protected void onUnsuccessfulRequest(UnsuccessfulRequest request) { | ||
System.out.println("Could not get response from: " + request.getUrl()); | ||
} | ||
} | ||
``` | ||
That's it! In just a few lines you can make a crawler that extracts and crawls every URL it finds, while filtering duplicate and offsite requests. You also get access to the WebDriver, so you can use all the features that are provided by Selenium. | ||
|
||
By default, the crawler uses [HtmlUnitDriver](https://github.com/SeleniumHQ/selenium/wiki/HtmlUnitDriver), but you can also set your preferred WebDriver: | ||
```java | ||
config.setWebDriver(new ChromeDriver()); | ||
``` | ||
|
||
## Support | ||
The developers would like to thank [Precognox](http://precognox.com/) for their support. | ||
|
||
## License | ||
The source code of Serritor is made available under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,133 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<modelVersion>4.0.0</modelVersion> | ||
<groupId>com.serritor</groupId> | ||
<groupId>com.github.peterbencze</groupId> | ||
<artifactId>serritor</artifactId> | ||
<version>1.0-SNAPSHOT</version> | ||
<version>1.0</version> | ||
<packaging>jar</packaging> | ||
|
||
<name>Serritor</name> | ||
<description>An open source web crawler framework built upon Selenium and written in Java</description> | ||
<url>https://github.com/peterbencze/serritor</url> | ||
|
||
<licenses> | ||
<license> | ||
<name>Apache License, Version 2.0</name> | ||
<url>https://www.apache.org/licenses/LICENSE-2.0</url> | ||
</license> | ||
</licenses> | ||
|
||
<developers> | ||
<developer> | ||
<name>Peter Bencze</name> | ||
<email>[email protected]</email> | ||
<roles> | ||
<role>Owner</role> | ||
</roles> | ||
</developer> | ||
<developer> | ||
<name>Krisztian Mozsi</name> | ||
<email>[email protected]</email> | ||
<roles> | ||
<role>Committer</role> | ||
</roles> | ||
</developer> | ||
</developers> | ||
|
||
<scm> | ||
<connection>scm:git:git://github.com/peterbencze/serritor.git</connection> | ||
<developerConnection>scm:git:https://github.com/peterbencze/serritor.git</developerConnection> | ||
<url>https://github.com/peterbencze/serritor/tree/master</url> | ||
</scm> | ||
|
||
<distributionManagement> | ||
<snapshotRepository> | ||
<id>ossrh</id> | ||
<url>https://oss.sonatype.org/content/repositories/snapshots</url> | ||
</snapshotRepository> | ||
<repository> | ||
<id>ossrh</id> | ||
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url> | ||
</repository> | ||
</distributionManagement> | ||
|
||
<properties> | ||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | ||
<maven.compiler.source>1.7</maven.compiler.source> | ||
<maven.compiler.target>1.7</maven.compiler.target> | ||
<maven.compiler.source>1.8</maven.compiler.source> | ||
<maven.compiler.target>1.8</maven.compiler.target> | ||
</properties> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>org.seleniumhq.selenium</groupId> | ||
<artifactId>selenium-java</artifactId> | ||
<version>3.0.0-beta2</version> | ||
<version>3.0.0-beta3</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.seleniumhq.selenium</groupId> | ||
<artifactId>htmlunit-driver</artifactId> | ||
<version>2.23</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.eclipse.jetty.websocket</groupId> | ||
<artifactId>websocket-client</artifactId> | ||
<version>9.3.11.v20160721</version> | ||
</dependency> | ||
</dependencies> | ||
|
||
<build> | ||
<plugins> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-source-plugin</artifactId> | ||
<version>2.4</version> | ||
<executions> | ||
<execution> | ||
<id>attach-source</id> | ||
<goals> | ||
<goal>jar-no-fork</goal> | ||
</goals> | ||
</execution> | ||
</executions> | ||
</plugin> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-javadoc-plugin</artifactId> | ||
<version>2.10.4</version> | ||
<executions> | ||
<execution> | ||
<id>attach-javadoc</id> | ||
<goals> | ||
<goal>jar</goal> | ||
</goals> | ||
</execution> | ||
</executions> | ||
</plugin> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-gpg-plugin</artifactId> | ||
<version>1.6</version> | ||
<executions> | ||
<execution> | ||
<id>sign-artifacts</id> | ||
<phase>verify</phase> | ||
<goals> | ||
<goal>sign</goal> | ||
</goals> | ||
</execution> | ||
</executions> | ||
</plugin> | ||
<plugin> | ||
<groupId>org.sonatype.plugins</groupId> | ||
<artifactId>nexus-staging-maven-plugin</artifactId> | ||
<version>1.6.7</version> | ||
<extensions>true</extensions> | ||
<configuration> | ||
<serverId>ossrh</serverId> | ||
<nexusUrl>https://oss.sonatype.org/</nexusUrl> | ||
<autoReleaseAfterClose>true</autoReleaseAfterClose> | ||
</configuration> | ||
</plugin> | ||
</plugins> | ||
</build> | ||
</project> |
Oops, something went wrong.