basicType = new HashSet<>();
+
+ static {
+ basicType.addAll(Arrays.asList(String.class,
+ Integer.class, int.class, Long.class, long.class,
+ Double.class, double.class, Float.class, float.class,
+ Character.class, char.class, Byte.class, byte.class,
+ Boolean.class, boolean.class, Short.class, short.class));
+ }
+
+
+ public static boolean isBasicType(Class clz) {
+ return basicType.contains(clz);
+ }
+}
diff --git a/berkely/src/main/java/com/quick/hui/crawler/berkely/BerkelyHelper.java b/berkely/src/main/java/com/quick/hui/crawler/berkely/BerkelyHelper.java
new file mode 100644
index 0000000..31a8b09
--- /dev/null
+++ b/berkely/src/main/java/com/quick/hui/crawler/berkely/BerkelyHelper.java
@@ -0,0 +1,191 @@
+package com.quick.hui.crawler.berkely;
+
+import com.sleepycat.bind.EntryBinding;
+import com.sleepycat.bind.serial.SerialBinding;
+import com.sleepycat.bind.serial.StoredClassCatalog;
+import com.sleepycat.bind.tuple.TupleBinding;
+import com.sleepycat.je.*;
+
+import java.io.File;
+import java.io.UnsupportedEncodingException;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Berkely DB 辅助类
+ *
+ * Created by yihui on 2017/6/21.
+ */
+public class BerkelyHelper {
+
+ /**
+ * 用于管理DB
+ */
+ private Environment environment;
+
+
+ /**
+ * 类型转换日志DB
+ */
+ private static final String CLASS_CATALOG = "java_class_catalog";
+ private StoredClassCatalog classCatalog;
+ private Database catalogDb;
+
+
+ /**
+ * 本地缓存的bind映射表
+ */
+    private Map<Class, EntryBinding> bindingMap = new ConcurrentHashMap<>();
+
+
+ private Database db;
+
+
+ public static BerkelyHelper newInstance(String path, String dbName) {
+ return new BerkelyHelper(path, dbName);
+ }
+
+
+ private BerkelyHelper(String path, String dbName) {
+ initEnv(path);
+ initDb(dbName);
+ }
+
+ private void initEnv(String path) {
+ File file = new File(path);
+ if ((!file.exists() || file.isFile()) && !file.mkdir()) { // 不存在,or为文件时, 创建一个对应的目录
+ throw new IllegalStateException("create dir:" + path + " error!");
+ }
+
+
+ EnvironmentConfig environmentConfig = new EnvironmentConfig();
+ environmentConfig.setAllowCreate(true);
+ environmentConfig.setTransactional(true);
+
+ environment = new Environment(file, environmentConfig);
+ }
+
+
+ private void initDb(String dbName) {
+ DatabaseConfig dbConfig = new DatabaseConfig();
+ dbConfig.setTransactional(true);
+ dbConfig.setAllowCreate(true);
+
+ catalogDb = environment.openDatabase(null, CLASS_CATALOG, dbConfig);
+ classCatalog = new StoredClassCatalog(catalogDb);
+
+
+ db = environment.openDatabase(null, dbName, dbConfig);
+ }
+
+
+ @SuppressWarnings("unchecked")
+ private EntryBinding getBinding(Class clz) {
+ if (bindingMap.containsKey(clz)) {
+ return bindingMap.get(clz);
+ }
+
+
+ EntryBinding binding;
+ if (BasicTypeHelper.isBasicType(clz)) {
+ binding = TupleBinding.getPrimitiveBinding(clz);
+ } else {
+ binding = new SerialBinding(classCatalog, clz);
+ }
+
+ bindingMap.put(clz, binding);
+ return binding;
+ }
+
+ private DatabaseEntry getKey(String key) {
+ DatabaseEntry kEntrty;
+ try {
+ kEntrty = new DatabaseEntry(key.getBytes("utf-8"));
+ } catch (Exception e) {
+ kEntrty = new DatabaseEntry(key.getBytes());
+ }
+ return kEntrty;
+ }
+
+
+ /**
+ * 往 DB 中塞数据
+ *
+ * @param key 塞入的标识key
+ * @param value 塞入的value
+     * @return 写入操作的执行状态
+     *
+ */
+ public OperationStatus put(String key, Object value) {
+ DatabaseEntry kEntrty = getKey(key);
+ DatabaseEntry vEntry = new DatabaseEntry();
+
+
+ EntryBinding binding = getBinding(value.getClass());
+ binding.objectToEntry(value, vEntry);
+ return db.put(null, kEntrty, vEntry);
+ }
+
+
+ /**
+ * 从 DB 中获取数据
+ *
+ * @param key 唯一标识
+ * @param valueClz 返回的数据类型
+     * @return 读取到的数据; 当 key 不存在时返回 null
+     *
+ */
+    @SuppressWarnings("unchecked")
+    public <T> T get(String key, Class<T> valueClz) {
+        DatabaseEntry kEntrty = getKey(key);
+        DatabaseEntry vEntry = new DatabaseEntry();
+
+
+        OperationStatus status;
+        status = db.get(null, kEntrty, vEntry, LockMode.DEFAULT);
+
+        if (status != OperationStatus.SUCCESS) {
+            return null;
+        }
+
+        EntryBinding binding = getBinding(valueClz);
+        return (T) binding.entryToObject(vEntry);
+    }
+
+
+ /**
+ * 删除数据
+ *
+ * @param key
+ */
+ public void delete(String key) {
+ DatabaseEntry kEntry = getKey(key);
+ db.delete(null, kEntry);
+ }
+
+
+ // 遍历db中所有数据的方法,因为不知道value的类型, 所以不太好处理,为了解决这个方法,可以在key里面冗余一个value类型的信息
+ public void scan() {
+
+ Cursor cursor = db.openCursor(null, CursorConfig.DEFAULT);
+ DatabaseEntry kEntry = new DatabaseEntry();
+ DatabaseEntry vEntry = new DatabaseEntry();
+
+ while (cursor.getNext(kEntry, vEntry, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
+ // key = xxx
+ String key = new String(kEntry.getData());
+ // value = xxx
+ }
+ }
+
+
+ /**
+ * 结束之前,得调用一下这个方法
+ */
+ public void close() {
+ db.close();
+ classCatalog.close();
+ environment.close();
+
+ System.out.println("释放资源");
+ }
+}
diff --git a/berkely/src/test/java/com/quick/hui/crawler/berkely/test/BerkelyHelperTest.java b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/BerkelyHelperTest.java
new file mode 100644
index 0000000..a7ccc3b
--- /dev/null
+++ b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/BerkelyHelperTest.java
@@ -0,0 +1,67 @@
+package com.quick.hui.crawler.berkely.test;
+
+import com.quick.hui.crawler.berkely.BerkelyHelper;
+import com.sleepycat.je.OperationStatus;
+import org.junit.Test;
+
+/**
+ * Created by yihui on 2017/6/21.
+ */
+public class BerkelyHelperTest {
+
+ @Test
+ public void testBerkelyHelper() {
+ String key = "test" + System.currentTimeMillis();
+ String value = "第一个测试";
+
+ BerkelyHelper berkelyHelper = BerkelyHelper.newInstance("logs", "myDB");
+ OperationStatus status = berkelyHelper.put(key, value);
+ System.out.println("首次塞入返回: " + status);
+
+
+ // 读取数据
+ String result = berkelyHelper.get(key, String.class);
+ System.out.println("the result: " + result);
+ System.out.println("-------------------------------------");
+
+
+ // 再次塞入
+ value = "更新后的测试";
+ status = berkelyHelper.put(key, value);
+ System.out.println("再次塞入返回: " + status);
+
+
+ result = berkelyHelper.get(key, String.class);
+ System.out.println("the result: " + result);
+ System.out.println("-------------------------------------");
+
+ // 更新塞入的数据类型
+ int value2 = 100;
+ status = berkelyHelper.put(key, value2);
+ System.out.println("更新塞入数据类型返回: " + status);
+
+
+ int result2 = berkelyHelper.get(key, int.class);
+ System.out.println("the result: " + result2);
+ System.out.println("-------------------------------------");
+
+
+ // 获取一个不存在的数据
+ result = berkelyHelper.get("no exists!", String.class);
+ System.out.println("the result: " + result);
+ System.out.println("-------------------------------------");
+
+
+ try {
+ // 输入错误的返回类型
+ result = berkelyHelper.get(key, String.class);
+ System.out.println("the result: " + result);
+ System.out.println("-------------------------------------");
+ } catch (Exception e) {
+ System.out.println("error! e" + e);
+ }
+
+ berkelyHelper.close();
+ }
+
+}
diff --git a/berkely/src/test/java/com/quick/hui/crawler/berkely/test/ObjDataTest.java b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/ObjDataTest.java
new file mode 100644
index 0000000..b5dd7bc
--- /dev/null
+++ b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/ObjDataTest.java
@@ -0,0 +1,103 @@
+package com.quick.hui.crawler.berkely.test;
+
+import com.quick.hui.crawler.berkely.test.entity.ToSaveDO;
+import com.sleepycat.bind.EntryBinding;
+import com.sleepycat.bind.serial.SerialBinding;
+import com.sleepycat.bind.serial.StoredClassCatalog;
+import com.sleepycat.bind.tuple.TupleBinding;
+import com.sleepycat.je.*;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Created by yihui on 2017/6/21.
+ */
+public class ObjDataTest {
+
+ private ToSaveDO build() {
+ ToSaveDO toSaveDO = new ToSaveDO();
+
+        List<String> list = Arrays.asList("hello", "world", "123");
+        Map<String, Object> map = new HashMap<>();
+ map.put("key", 123);
+ map.put("content", "hello world");
+
+
+ toSaveDO.setId(100);
+ toSaveDO.setUnique(false);
+ toSaveDO.setNames(list);
+ toSaveDO.setMap(map);
+ toSaveDO.setTitle("title");
+ return toSaveDO;
+ }
+
+
+ @Test
+ public void test() throws UnsupportedEncodingException {
+ EnvironmentConfig config = new EnvironmentConfig();
+ config.setAllowCreate(true);
+
+ Environment env = new Environment(new File("log"), config);
+ DatabaseConfig dbConfig = new DatabaseConfig();
+ dbConfig.setAllowCreate(true);
+ Database db = env.openDatabase(null, "myDB", dbConfig);
+
+
+ String key = "akey";
+ Long data = 1234556633L;
+ DatabaseEntry keyEntry = new DatabaseEntry(key.getBytes("UTF-8"));
+ DatabaseEntry dataEntry = new DatabaseEntry();
+
+
+ try {
+ // 先写入数据
+ EntryBinding myBinding = TupleBinding.getPrimitiveBinding(Long.class);
+ myBinding.objectToEntry(data, dataEntry);
+
+
+ // 读取数据
+ EntryBinding binding = TupleBinding.getPrimitiveBinding(Long.class);
+ db.get(null, keyEntry, dataEntry, LockMode.DEFAULT);
+ Long l = (Long) binding.entryToObject(dataEntry);
+ System.out.println(l);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+
+
+ Database classDB = null;
+ try {
+ classDB = env.openDatabase(null, "classDB", dbConfig);
+ StoredClassCatalog classCatalog = new StoredClassCatalog(classDB);
+
+ ToSaveDO toSaveDO = build();
+ EntryBinding dataBinding = new SerialBinding(classCatalog, ToSaveDO.class);
+
+
+ // 写入对象
+ DatabaseEntry sKey = new DatabaseEntry(toSaveDO.getTitle().getBytes("UTF-8"));
+ DatabaseEntry sVal = new DatabaseEntry();
+ dataBinding.objectToEntry(toSaveDO, sVal);
+ db.put(null, sKey, sVal);
+
+
+ db.get(null, sKey, sVal, LockMode.DEFAULT);
+ ToSaveDO p = (ToSaveDO) dataBinding.entryToObject(sVal);
+ System.out.println(p);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+
+
+ classDB.close();
+ db.close();
+ env.close();
+ }
+}
+
diff --git a/berkely/src/test/java/com/quick/hui/crawler/berkely/test/SimpleDataTest.java b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/SimpleDataTest.java
new file mode 100644
index 0000000..a89a0ea
--- /dev/null
+++ b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/SimpleDataTest.java
@@ -0,0 +1,57 @@
+package com.quick.hui.crawler.berkely.test;
+
+import com.sleepycat.bind.EntryBinding;
+import com.sleepycat.bind.tuple.TupleBinding;
+import com.sleepycat.je.*;
+import org.junit.Test;
+
+import java.io.File;
+
+/**
+ * Created by yihui on 2017/6/21.
+ */
+public class SimpleDataTest {
+ private Environment env;
+ private Database db;
+ private String key = "akey";
+ private Long data = 1234556633L;
+
+ public void setUp() throws Exception {
+ EnvironmentConfig envConfig = new EnvironmentConfig();
+ envConfig.setAllowCreate(true);
+ env = new Environment(new File("log"), envConfig);
+ DatabaseConfig dbConfig = new DatabaseConfig();
+ dbConfig.setAllowCreate(true);
+ db = env.openDatabase(null, "myDB", dbConfig);
+ DatabaseEntry keyEntry = new DatabaseEntry(key.getBytes("UTF-8"));
+ DatabaseEntry dataEntry = new DatabaseEntry();
+
+ EntryBinding myBinding = TupleBinding.getPrimitiveBinding(Long.class);
+ myBinding.objectToEntry(data, dataEntry);
+ db.put(null, keyEntry, dataEntry);
+ }
+
+
+ public void testGet() throws Exception {
+ DatabaseEntry keyEntry = new DatabaseEntry(key.getBytes("UTF-8"));
+ DatabaseEntry dataEntry = new DatabaseEntry();
+ EntryBinding binding = TupleBinding.getPrimitiveBinding(Long.class);
+ db.get(null, keyEntry, dataEntry, LockMode.DEFAULT);
+ Long l = (Long) binding.entryToObject(dataEntry);
+ }
+
+
+ public void tearDown() throws Exception {
+ db.close();
+ env.truncateDatabase(null, "myDB", false);
+ env.close();
+ }
+
+
+ @Test
+ public void main() throws Exception {
+ SimpleDataTest simpleDataTest = new SimpleDataTest();
+ simpleDataTest.setUp();
+ simpleDataTest.testGet();
+ }
+}
diff --git a/berkely/src/test/java/com/quick/hui/crawler/berkely/test/entity/ToSaveDO.java b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/entity/ToSaveDO.java
new file mode 100644
index 0000000..3b6f741
--- /dev/null
+++ b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/entity/ToSaveDO.java
@@ -0,0 +1,30 @@
+package com.quick.hui.crawler.berkely.test.entity;
+
+
+import lombok.Getter;
+import lombok.Setter;
+import lombok.ToString;
+
+import java.io.Serializable;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Created by yihui on 2017/6/21.
+ */
+@Getter
+@Setter
+@ToString
+public class ToSaveDO implements Serializable {
+ private static final long serialVersionUID = 968345657199606335L;
+
+    private List<String> names;
+
+    private int id;
+
+    private String title;
+
+    private Boolean unique;
+
+    private Map<String, Object> map;
+}
diff --git a/common/pom.xml b/common/pom.xml
new file mode 100644
index 0000000..c49f27d
--- /dev/null
+++ b/common/pom.xml
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>quick-crawler</artifactId>
+        <groupId>com.quick.hui.crawler</groupId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.quick.hui.crawler</groupId>
+    <artifactId>common</artifactId>
+</project>
\ No newline at end of file
diff --git a/core/pom.xml b/core/pom.xml
new file mode 100644
index 0000000..31fb87a
--- /dev/null
+++ b/core/pom.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>quick-crawler</artifactId>
+        <groupId>com.quick.hui.crawler</groupId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.quick.hui.crawler</groupId>
+    <artifactId>core</artifactId>
+
+    <dependencies>
+        <!-- html 解析 -->
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.9.2</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.projectlombok</groupId>
+            <artifactId>lombok</artifactId>
+            <version>1.16.8</version>
+        </dependency>
+
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.12</version>
+        </dependency>
+    </dependencies>
+</project>
\ No newline at end of file
diff --git a/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java
new file mode 100644
index 0000000..1e2d2ba
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java
@@ -0,0 +1,37 @@
+package com.quick.hui.crawler.core.entity;
+
+
+import lombok.Getter;
+import lombok.Setter;
+import lombok.ToString;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+@ToString
+public class CrawlMeta {
+
+ /**
+     * 待爬取的网址
+ */
+ @Getter
+ @Setter
+ private String url;
+
+
+ /**
+ * 获取指定内容的规则, 因为一个网页中,你可能获取多个不同的内容, 所以放在集合中
+ */
+ @Setter
+    private Set<String> selectorRules;
+
+
+
+    public Set<String> getSelectorRules() {
+        return selectorRules != null ? selectorRules : new HashSet<>();
+    }
+
+}
diff --git a/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlResult.java b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlResult.java
new file mode 100644
index 0000000..696dfbe
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlResult.java
@@ -0,0 +1,36 @@
+package com.quick.hui.crawler.core.entity;
+
+import lombok.Getter;
+import lombok.Setter;
+import lombok.ToString;
+import org.jsoup.nodes.Document;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+@Getter
+@Setter
+@ToString
+public class CrawlResult {
+
+ /**
+ * 爬取的网址
+ */
+ private String url;
+
+
+ /**
+ * 爬取的网址对应的 DOC 结构
+ */
+ private Document htmlDoc;
+
+
+ /**
+ * 选择的结果,key为选择规则,value为根据规则匹配的结果
+ */
+    private Map<String, List<String>> result;
+
+}
diff --git a/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java b/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java
new file mode 100644
index 0000000..14d68df
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java
@@ -0,0 +1,7 @@
+package com.quick.hui.crawler.core.fetcher;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+public class Fetcher {
+}
diff --git a/core/src/main/java/com/quick/hui/crawler/core/job/AbstractJob.java b/core/src/main/java/com/quick/hui/crawler/core/job/AbstractJob.java
new file mode 100644
index 0000000..6b7d83b
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/job/AbstractJob.java
@@ -0,0 +1,37 @@
+package com.quick.hui.crawler.core.job;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+public abstract class AbstractJob implements IJob {
+
+ public void beforeRun() {
+ }
+
+ public void afterRun() {
+ }
+
+
+ @Override
+ public void run() {
+ this.beforeRun();
+
+
+ try {
+ this.doFetchPage();
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+
+
+ this.afterRun();
+ }
+
+
+ /**
+     * 具体的抓取网页的方法, 需要子类来补全实现逻辑
+ *
+ * @throws Exception
+ */
+ public abstract void doFetchPage() throws Exception;
+}
diff --git a/core/src/main/java/com/quick/hui/crawler/core/job/IJob.java b/core/src/main/java/com/quick/hui/crawler/core/job/IJob.java
new file mode 100644
index 0000000..eecd09c
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/job/IJob.java
@@ -0,0 +1,18 @@
+package com.quick.hui.crawler.core.job;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+public interface IJob extends Runnable {
+
+ /**
+ * 在job执行之前回调的方法
+ */
+ void beforeRun();
+
+
+ /**
+ * 在job执行完毕之后回调的方法
+ */
+ void afterRun();
+}
diff --git a/core/src/main/java/com/quick/hui/crawler/core/job/SimpleCrawlJob.java b/core/src/main/java/com/quick/hui/crawler/core/job/SimpleCrawlJob.java
new file mode 100644
index 0000000..5e8268e
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/job/SimpleCrawlJob.java
@@ -0,0 +1,109 @@
+package com.quick.hui.crawler.core.job;
+
+import com.quick.hui.crawler.core.entity.CrawlMeta;
+import com.quick.hui.crawler.core.entity.CrawlResult;
+import lombok.Getter;
+import lombok.Setter;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * 最简单的一个爬虫任务
+ *
+ * Created by yihui on 2017/6/27.
+ */
+@Getter
+@Setter
+public class SimpleCrawlJob extends AbstractJob {
+
+ /**
+ * 配置项信息
+ */
+ private CrawlMeta crawlMeta;
+
+
+ /**
+ * 存储爬取的结果
+ */
+ private CrawlResult crawlResult;
+
+
+ /**
+ * 执行抓取网页
+ */
+ public void doFetchPage() throws Exception {
+
+ URL url = new URL(crawlMeta.getUrl());
+ HttpURLConnection connection = (HttpURLConnection) url.openConnection();
+ BufferedReader in = null;
+
+ StringBuilder result = new StringBuilder();
+
+ try {
+ // 设置通用的请求属性
+ connection.setRequestProperty("accept", "*/*");
+ connection.setRequestProperty("connection", "Keep-Alive");
+ connection.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
+ // 建立实际的连接
+ connection.connect();
+
+
+            Map<String, List<String>> map = connection.getHeaderFields();
+ //遍历所有的响应头字段
+ for (String key : map.keySet()) {
+ System.out.println(key + "--->" + map.get(key));
+ }
+
+ // 定义 BufferedReader输入流来读取URL的响应
+ in = new BufferedReader(new InputStreamReader(
+ connection.getInputStream()));
+ String line;
+ while ((line = in.readLine()) != null) {
+ result.append(line);
+ }
+ } finally { // 使用finally块来关闭输入流
+ try {
+ if (in != null) {
+ in.close();
+ }
+ } catch (Exception e2) {
+ e2.printStackTrace();
+ }
+ }
+
+
+ doParse(result.toString());
+ }
+
+
+
+ private void doParse(String html) {
+ Document doc = Jsoup.parse(html);
+
+        Map<String, List<String>> map = new HashMap<>(crawlMeta.getSelectorRules().size());
+        for (String rule: crawlMeta.getSelectorRules()) {
+            List<String> list = new ArrayList<>();
+ for (Element element: doc.select(rule)) {
+ list.add(element.text());
+ }
+
+ map.put(rule, list);
+ }
+
+
+ this.crawlResult = new CrawlResult();
+ this.crawlResult.setHtmlDoc(doc);
+ this.crawlResult.setUrl(crawlMeta.getUrl());
+ this.crawlResult.setResult(map);
+ }
+}
diff --git a/core/src/test/java/com/quick/hui/crawler/core/test/BaseCrawlerTest.java b/core/src/test/java/com/quick/hui/crawler/core/test/BaseCrawlerTest.java
new file mode 100644
index 0000000..23dfa82
--- /dev/null
+++ b/core/src/test/java/com/quick/hui/crawler/core/test/BaseCrawlerTest.java
@@ -0,0 +1,46 @@
+package com.quick.hui.crawler.core.test;
+
+import com.quick.hui.crawler.core.entity.CrawlMeta;
+import com.quick.hui.crawler.core.entity.CrawlResult;
+import com.quick.hui.crawler.core.job.SimpleCrawlJob;
+import org.junit.Test;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+public class BaseCrawlerTest {
+
+
+ /**
+ * 测试我们写的最简单的一个爬虫,
+ *
+ * 目标是爬取一篇博客
+ */
+ @Test
+ public void testFetch() throws InterruptedException {
+ String url = "https://my.oschina.net/u/566591/blog/1031575";
+ Set selectRule = new HashSet<>();
+ selectRule.add("div[class=title]"); // 博客标题
+ selectRule.add("div[class=blog-body]"); // 博客正文
+
+ CrawlMeta crawlMeta = new CrawlMeta();
+ crawlMeta.setUrl(url);
+ crawlMeta.setSelectorRules(selectRule);
+
+
+ SimpleCrawlJob job = new SimpleCrawlJob();
+ job.setCrawlMeta(crawlMeta);
+ Thread thread = new Thread(job, "crawler-test");
+ thread.start();
+
+ thread.join();
+
+
+ CrawlResult result = job.getCrawlResult();
+ System.out.println(result);
+ }
+
+}
diff --git a/demo/pom.xml b/demo/pom.xml
new file mode 100644
index 0000000..55e46e0
--- /dev/null
+++ b/demo/pom.xml
@@ -0,0 +1,80 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>quick-crawler</artifactId>
+        <groupId>com.quick.hui.crawler</groupId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.quick.hui.crawler</groupId>
+    <artifactId>demo</artifactId>
+
+    <dependencies>
+        <dependency>
+            <groupId>cn.edu.hfut.dmic.webcollector</groupId>
+            <artifactId>WebCollector</artifactId>
+            <version>2.52</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
+        <dependency>
+            <groupId>edu.uci.ics</groupId>
+            <artifactId>crawler4j</artifactId>
+            <version>4.1</version>
+        </dependency>
+
+        <dependency>
+            <groupId>log4j</groupId>
+            <artifactId>log4j</artifactId>
+            <version>1.2.17</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-log4j12</artifactId>
+            <version>1.7.21</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.belerweb</groupId>
+            <artifactId>pinyin4j</artifactId>
+            <version>2.5.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>3.5</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>1.7.21</version>
+        </dependency>
+
+        <dependency>
+            <groupId>ch.qos.logback</groupId>
+            <artifactId>logback-classic</artifactId>
+            <version>1.1.3</version>
+        </dependency>
+    </dependencies>
+</project>
\ No newline at end of file
diff --git a/demo/src/main/java/com/quick/hui/crawler/demo/book/crawler4j/Book4jCrawler.java b/demo/src/main/java/com/quick/hui/crawler/demo/book/crawler4j/Book4jCrawler.java
new file mode 100644
index 0000000..3d738be
--- /dev/null
+++ b/demo/src/main/java/com/quick/hui/crawler/demo/book/crawler4j/Book4jCrawler.java
@@ -0,0 +1,91 @@
+package com.quick.hui.crawler.demo.book.crawler4j;
+
+import edu.uci.ics.crawler4j.crawler.CrawlConfig;
+import edu.uci.ics.crawler4j.crawler.CrawlController;
+import edu.uci.ics.crawler4j.crawler.Page;
+import edu.uci.ics.crawler4j.crawler.WebCrawler;
+import edu.uci.ics.crawler4j.fetcher.PageFetcher;
+import edu.uci.ics.crawler4j.parser.HtmlParseData;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
+import edu.uci.ics.crawler4j.url.WebURL;
+
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+public class Book4jCrawler extends WebCrawler {
+ /**
+ * 正则匹配指定的后缀文件
+ */
+ private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
+ + "|png|mp3|mp3|zip|gz))$");
+
+ /**
+ * 这个方法主要是决定哪些url我们需要抓取,返回true表示是我们需要的,返回false表示不是我们需要的Url
+ * 第一个参数referringPage封装了当前爬取的页面信息
+ * 第二个参数url封装了当前爬取的页面url信息
+ */
+ @Override
+ public boolean shouldVisit(Page referringPage, WebURL url) {
+ String href = url.getURL().toLowerCase(); // 得到小写的url
+ return !FILTERS.matcher(href).matches() // 正则匹配,过滤掉我们不需要的后缀文件
+ && href.startsWith("http://www.java1234.com/"); // url必须是http://www.java1234.com/开头,规定站点
+ }
+
+ /**
+ * 当我们爬到我们需要的页面,这个方法会被调用,我们可以尽情的处理这个页面
+ * page参数封装了所有页面信息
+ */
+ @Override
+ public void visit(Page page) {
+ String url = page.getWebURL().getURL(); // 获取url
+ System.out.println("URL: " + url);
+
+ if (page.getParseData() instanceof HtmlParseData) { // 判断是否是html数据
+ HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); // 强制类型转换,获取html数据对象
+ String text = htmlParseData.getText(); // 获取页面纯文本(无html标签)
+ String html = htmlParseData.getHtml(); // 获取页面Html
+            Set<WebURL> links = htmlParseData.getOutgoingUrls(); // 获取页面输出链接
+
+ System.out.println("纯文本长度: " + text.length());
+ System.out.println("html长度: " + html.length());
+ System.out.println("输出链接个数: " + links.size());
+ }
+ }
+
+
+ public static void main(String[] args) throws Exception {
+ String crawlStorageFolder = "/tmp/crawl"; // 定义爬虫数据存储位置
+ int numberOfCrawlers = 7; // 定义7个爬虫,也就是7个线程
+
+ CrawlConfig config = new CrawlConfig(); // 定义爬虫配置
+ config.setCrawlStorageFolder(crawlStorageFolder); // 设置爬虫文件存储位置
+
+ /*
+ * 实例化爬虫控制器
+ */
+ PageFetcher pageFetcher = new PageFetcher(config); // 实例化页面获取器
+ RobotstxtConfig robotstxtConfig = new RobotstxtConfig(); // 实例化爬虫机器人配置 比如可以设置 user-agent
+
+ // 实例化爬虫机器人对目标服务器的配置,每个网站都有一个robots.txt文件 规定了该网站哪些页面可以爬,哪些页面禁止爬,该类是对robots.txt规范的实现
+ RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
+ // 实例化爬虫控制器
+ CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
+
+ /**
+ * 配置爬虫种子页面,就是规定的从哪里开始爬,可以配置多个种子页面
+ */
+ controller.addSeed("http://www.xxbiquge.com/8_8947/");
+ controller.addSeed("http://www.xxbiquge.com/8_8947/1547593.html");
+// controller.addSeed("http://www.java1234.com/a/kaiyuan/");
+// controller.addSeed("http://www.java1234.com/a/bysj/");
+
+ /**
+ * 启动爬虫,爬虫从此刻开始执行爬虫任务,根据以上配置
+ */
+ controller.start(Book4jCrawler.class, numberOfCrawlers);
+ }
+}
diff --git a/demo/src/main/java/com/quick/hui/crawler/demo/book/webcollect/BookSearch.java b/demo/src/main/java/com/quick/hui/crawler/demo/book/webcollect/BookSearch.java
new file mode 100644
index 0000000..20d2d18
--- /dev/null
+++ b/demo/src/main/java/com/quick/hui/crawler/demo/book/webcollect/BookSearch.java
@@ -0,0 +1,58 @@
+package com.quick.hui.crawler.demo.book.webcollect;
+
+import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
+import cn.edu.hfut.dmic.webcollector.model.Page;
+import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * 爬取小说的爬虫
+ *
+ * Created by yihui on 2017/6/21.
+ */
+public class BookSearch extends BreadthCrawler {
+
+ /**
+ * 构造一个基于伯克利DB的爬虫
+ * 伯克利DB文件夹为crawlPath,crawlPath中维护了历史URL等信息
+ * 不同任务不要使用相同的crawlPath
+ * 两个使用相同crawlPath的爬虫并行爬取会产生错误
+ *
+ * @param crawlPath 伯克利DB使用的文件夹
+ * @param autoParse 是否根据设置的正则自动探测新URL
+ */
+ public BookSearch(String crawlPath, boolean autoParse) {
+ super(crawlPath, autoParse);
+ }
+
+
+
+ public void visit(Page page, CrawlDatums next) {
+ if (!page.matchUrl(regix)) {
+ return;
+ }
+
+ page.charset("utf-8");
+
+ String title = page.select("div[class=bookname]>h1").text();
+ String content = page.select("div[id=content]").html();
+ content = StringUtils.replace(content, " ", "\n\t");
+
+ System.out.println("title: " + title + " content: " + content);
+ }
+
+
+ static String regix = "http://www.xxbiquge.com/8_8947/[0-9]+.html";
+ public static void main(String[] args) throws Exception {
+ String seed = "http://www.xxbiquge.com/8_8947/";
+
+ BookSearch bookSearch = new BookSearch("book", true);
+ bookSearch.addSeed(seed);
+ bookSearch.addRegex(regix);
+
+
+ bookSearch.setThreads(1);
+ bookSearch.setTopN(5000);
+ bookSearch.start(2);
+ }
+}
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..71e4f72
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.quick.hui.crawler</groupId>
+    <artifactId>quick-crawler</artifactId>
+    <packaging>pom</packaging>
+    <version>1.0-SNAPSHOT</version>
+    <modules>
+        <module>core</module>
+        <module>common</module>
+        <module>demo</module>
+        <module>berkely</module>
+    </modules>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <configuration>
+                    <source>1.8</source>
+                    <target>1.8</target>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+</project>
\ No newline at end of file