diff --git a/berkely/pom.xml b/berkely/pom.xml
new file mode 100644
index 0000000..052af15
--- /dev/null
+++ b/berkely/pom.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>quick-crawler</artifactId>
+        <groupId>com.quick.hui.crawler</groupId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.quick.hui.crawler</groupId>
+    <artifactId>berkely</artifactId>
+
+    <dependencies>
+        <dependency>
+            <groupId>com.sleepycat</groupId>
+            <artifactId>je</artifactId>
+            <version>6.1.5</version>
+        </dependency>
+        <dependency>
+            <groupId>org.projectlombok</groupId>
+            <artifactId>lombok</artifactId>
+            <version>1.16.6</version>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.4</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+</project>
\ No newline at end of file
diff --git a/berkely/src/main/java/com/quick/hui/crawler/berkely/BasicTypeHelper.java b/berkely/src/main/java/com/quick/hui/crawler/berkely/BasicTypeHelper.java
new file mode 100644
index 0000000..d562049
--- /dev/null
+++ b/berkely/src/main/java/com/quick/hui/crawler/berkely/BasicTypeHelper.java
@@ -0,0 +1,26 @@
+package com.quick.hui.crawler.berkely;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Created by yihui on 2017/6/21.
+ */
+public class BasicTypeHelper {
+
+    public static Set<Class> basicType = new HashSet<>();
+
+    static {
+        basicType.addAll(Arrays.asList(String.class,
+                Integer.class, int.class, Long.class, long.class,
+                Double.class, double.class, Float.class, float.class,
+                Character.class, char.class, Byte.class, byte.class,
+                Boolean.class, boolean.class, Short.class, short.class));
+    }
+
+    public static boolean isBasicType(Class clz) {
+        return basicType.contains(clz);
+    }
+}
diff --git a/berkely/src/main/java/com/quick/hui/crawler/berkely/BerkelyHelper.java b/berkely/src/main/java/com/quick/hui/crawler/berkely/BerkelyHelper.java
new file mode 100644
index 0000000..31a8b09
--- /dev/null
+++ b/berkely/src/main/java/com/quick/hui/crawler/berkely/BerkelyHelper.java
@@ -0,0 +1,191 @@
+package com.quick.hui.crawler.berkely;
+
+import com.sleepycat.bind.EntryBinding;
+import com.sleepycat.bind.serial.SerialBinding;
+import com.sleepycat.bind.serial.StoredClassCatalog;
+import com.sleepycat.bind.tuple.TupleBinding;
+import com.sleepycat.je.*;
+
+import java.io.File;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Berkeley DB helper class.
+ * <p/>
+ * Created by yihui on 2017/6/21.
+ */
+public class BerkelyHelper {
+
+    /**
+     * Manages the databases.
+     */
+    private Environment environment;
+
+    /**
+     * Catalog database holding class metadata for (de)serialization.
+     */
+    private static final String CLASS_CATALOG = "java_class_catalog";
+    private StoredClassCatalog classCatalog;
+    private Database catalogDb;
+
+    /**
+     * Locally cached class-to-binding map.
+     */
+    private Map<Class, EntryBinding> bindingMap = new ConcurrentHashMap<>();
+
+    private Database db;
+
+    public static BerkelyHelper newInstance(String path, String dbName) {
+        return new BerkelyHelper(path, dbName);
+    }
+
+    private BerkelyHelper(String path, String dbName) {
+        initEnv(path);
+        initDb(dbName);
+    }
+
+    private void initEnv(String path) {
+        File file = new File(path);
+        // when the path does not exist, or points at a regular file, create the directory
+        if ((!file.exists() || file.isFile()) && !file.mkdir()) {
+            throw new IllegalStateException("create dir: " + path + " error!");
+        }
+
+        EnvironmentConfig environmentConfig = new EnvironmentConfig();
+        environmentConfig.setAllowCreate(true);
+        environmentConfig.setTransactional(true);
+
+        environment = new Environment(file, environmentConfig);
+    }
+
+    private void initDb(String dbName) {
+        DatabaseConfig dbConfig = new DatabaseConfig();
+        dbConfig.setTransactional(true);
+        dbConfig.setAllowCreate(true);
+
+        catalogDb = environment.openDatabase(null, CLASS_CATALOG, dbConfig);
+        classCatalog = new StoredClassCatalog(catalogDb);
+
+        db = environment.openDatabase(null, dbName, dbConfig);
+    }
+
+    @SuppressWarnings("unchecked")
+    private EntryBinding getBinding(Class clz) {
+        if (bindingMap.containsKey(clz)) {
+            return bindingMap.get(clz);
+        }
+
+        EntryBinding binding;
+        if (BasicTypeHelper.isBasicType(clz)) {
+            binding = TupleBinding.getPrimitiveBinding(clz);
+        } else {
+            binding = new SerialBinding(classCatalog, clz);
+        }
+
+        bindingMap.put(clz, binding);
+        return binding;
+    }
+
+    private DatabaseEntry getKey(String key) {
+        DatabaseEntry kEntry;
+        try {
+            kEntry = new DatabaseEntry(key.getBytes("utf-8"));
+        } catch (Exception e) {
+            kEntry = new DatabaseEntry(key.getBytes());
+        }
+        return kEntry;
+    }
+
+    /**
+     * Puts data into the DB.
+     *
+     * @param key   the identifying key
+     * @param value the value to store
+     * @return the operation status
+     */
+    public OperationStatus put(String key, Object value) {
+        DatabaseEntry kEntry = getKey(key);
+        DatabaseEntry vEntry = new DatabaseEntry();
+
+        EntryBinding binding = getBinding(value.getClass());
+        binding.objectToEntry(value, vEntry);
+        return db.put(null, kEntry, vEntry);
+    }
+
+    /**
+     * Gets data from the DB.
+     *
+     * @param key      the unique key
+     * @param valueClz the expected value type
+     * @return the stored value, or null when the key is absent
+     */
+    @SuppressWarnings("unchecked")
+    public <T> T get(String key, Class<T> valueClz) {
+        DatabaseEntry kEntry = getKey(key);
+        DatabaseEntry vEntry = new DatabaseEntry();
+
+        OperationStatus status = db.get(null, kEntry, vEntry, LockMode.DEFAULT);
+        if (status != OperationStatus.SUCCESS) {
+            return null;
+        }
+
+        EntryBinding binding = getBinding(valueClz);
+        return (T) binding.entryToObject(vEntry);
+    }
+
+    /**
+     * Deletes the record for the given key.
+     */
+    public void delete(String key) {
+        DatabaseEntry kEntry = getKey(key);
+        db.delete(null, kEntry);
+    }
+
+    // Iterates over every record in the DB. Since the value type is unknown here,
+    // the values are hard to decode; one workaround is to redundantly encode the
+    // value type into the key.
+    public void scan() {
+        Cursor cursor = db.openCursor(null, CursorConfig.DEFAULT);
+        DatabaseEntry kEntry = new DatabaseEntry();
+        DatabaseEntry vEntry = new DatabaseEntry();
+
+        try {
+            while (cursor.getNext(kEntry, vEntry, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
+                // key = xxx
+                String key = new String(kEntry.getData());
+                // value = xxx
+            }
+        } finally {
+            cursor.close();
+        }
+    }
+
+    /**
+     * Must be called before shutting down, to release resources.
+     */
+    public void close() {
+        db.close();
+        classCatalog.close();
+        environment.close();
+
+        System.out.println("resources released");
+    }
+}
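The scan() comment above leaves value decoding open, since the value type is unknown during iteration. Acting on its own suggestion of redundantly encoding the value class into the key might look like the following sketch (hypothetical typedPut/typedScan methods added to BerkelyHelper, not part of this commit):

    // Hypothetical: prefix each key with the value class name so a scan can decode records.
    public OperationStatus typedPut(String key, Object value) {
        return put(value.getClass().getName() + "|" + key, value);
    }

    public void typedScan() throws Exception {
        Cursor cursor = db.openCursor(null, CursorConfig.DEFAULT);
        DatabaseEntry kEntry = new DatabaseEntry();
        DatabaseEntry vEntry = new DatabaseEntry();
        try {
            while (cursor.getNext(kEntry, vEntry, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
                String rawKey = new String(kEntry.getData(), "utf-8");
                int idx = rawKey.indexOf('|');
                if (idx < 0) {
                    continue; // skip records not written via typedPut
                }
                // recover the type stored by typedPut, then pick the matching binding
                Class clz = Class.forName(rawKey.substring(0, idx));
                Object value = getBinding(clz).entryToObject(vEntry);
                System.out.println(rawKey + " -> " + value);
            }
        } finally {
            cursor.close();
        }
    }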
diff --git a/berkely/src/test/java/com/quick/hui/crawler/berkely/test/BerkelyHelperTest.java b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/BerkelyHelperTest.java
new file mode 100644
index 0000000..a7ccc3b
--- /dev/null
+++ b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/BerkelyHelperTest.java
@@ -0,0 +1,67 @@
+package com.quick.hui.crawler.berkely.test;
+
+import com.quick.hui.crawler.berkely.BerkelyHelper;
+import com.sleepycat.je.OperationStatus;
+import org.junit.Test;
+
+/**
+ * Created by yihui on 2017/6/21.
+ */
+public class BerkelyHelperTest {
+
+    @Test
+    public void testBerkelyHelper() {
+        String key = "test" + System.currentTimeMillis();
+        String value = "the first test";
+
+        BerkelyHelper berkelyHelper = BerkelyHelper.newInstance("logs", "myDB");
+        OperationStatus status = berkelyHelper.put(key, value);
+        System.out.println("first put returned: " + status);
+
+        // read the data back
+        String result = berkelyHelper.get(key, String.class);
+        System.out.println("the result: " + result);
+        System.out.println("-------------------------------------");
+
+        // put again under the same key
+        value = "the updated test";
+        status = berkelyHelper.put(key, value);
+        System.out.println("second put returned: " + status);
+
+        result = berkelyHelper.get(key, String.class);
+        System.out.println("the result: " + result);
+        System.out.println("-------------------------------------");
+
+        // overwrite the key with a value of a different type
+        int value2 = 100;
+        status = berkelyHelper.put(key, value2);
+        System.out.println("put with new value type returned: " + status);
+
+        int result2 = berkelyHelper.get(key, int.class);
+        System.out.println("the result: " + result2);
+        System.out.println("-------------------------------------");
+
+        // fetch a key that does not exist
+        result = berkelyHelper.get("no exists!", String.class);
+        System.out.println("the result: " + result);
+        System.out.println("-------------------------------------");
+
+        try {
+            // ask for the wrong return type (the key now holds an int)
+            result = berkelyHelper.get(key, String.class);
+            System.out.println("the result: " + result);
+            System.out.println("-------------------------------------");
+        } catch (Exception e) {
+            System.out.println("error! e: " + e);
+        }
+
+        berkelyHelper.close();
+    }
+}
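Both the environment and the databases in BerkelyHelper are opened with setTransactional(true), yet every operation passes null for the transaction, so each put/get/delete commits on its own. Grouped writes could be layered on top; a minimal sketch, assuming a hypothetical transactionalPut added to BerkelyHelper (standard JE Transaction API):

    // Hypothetical: apply a batch of writes atomically, or not at all.
    public void transactionalPut(Map<String, Object> batch) {
        Transaction txn = environment.beginTransaction(null, null);
        try {
            for (Map.Entry<String, Object> entry : batch.entrySet()) {
                DatabaseEntry vEntry = new DatabaseEntry();
                getBinding(entry.getValue().getClass()).objectToEntry(entry.getValue(), vEntry);
                db.put(txn, getKey(entry.getKey()), vEntry);
            }
            txn.commit(); // all writes become visible together
        } catch (RuntimeException e) {
            txn.abort();  // roll back: none of the writes survive
            throw e;
        }
    }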
diff --git a/berkely/src/test/java/com/quick/hui/crawler/berkely/test/ObjDataTest.java b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/ObjDataTest.java
new file mode 100644
index 0000000..b5dd7bc
--- /dev/null
+++ b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/ObjDataTest.java
@@ -0,0 +1,103 @@
+package com.quick.hui.crawler.berkely.test;
+
+import com.quick.hui.crawler.berkely.test.entity.ToSaveDO;
+import com.sleepycat.bind.EntryBinding;
+import com.sleepycat.bind.serial.SerialBinding;
+import com.sleepycat.bind.serial.StoredClassCatalog;
+import com.sleepycat.bind.tuple.TupleBinding;
+import com.sleepycat.je.*;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Created by yihui on 2017/6/21.
+ */
+public class ObjDataTest {
+
+    private ToSaveDO build() {
+        ToSaveDO toSaveDO = new ToSaveDO();
+
+        List<String> list = Arrays.asList("hello", "world", "123");
+        Map<String, Object> map = new HashMap<>();
+        map.put("key", 123);
+        map.put("content", "hello world");
+
+        toSaveDO.setId(100);
+        toSaveDO.setUnique(false);
+        toSaveDO.setNames(list);
+        toSaveDO.setMap(map);
+        toSaveDO.setTitle("title");
+        return toSaveDO;
+    }
+
+    @Test
+    public void test() throws UnsupportedEncodingException {
+        EnvironmentConfig config = new EnvironmentConfig();
+        config.setAllowCreate(true);
+
+        Environment env = new Environment(new File("log"), config);
+        DatabaseConfig dbConfig = new DatabaseConfig();
+        dbConfig.setAllowCreate(true);
+        Database db = env.openDatabase(null, "myDB", dbConfig);
+
+        String key = "akey";
+        Long data = 1234556633L;
+        DatabaseEntry keyEntry = new DatabaseEntry(key.getBytes("UTF-8"));
+        DatabaseEntry dataEntry = new DatabaseEntry();
+
+        try {
+            // write the data first
+            EntryBinding myBinding = TupleBinding.getPrimitiveBinding(Long.class);
+            myBinding.objectToEntry(data, dataEntry);
+            db.put(null, keyEntry, dataEntry);
+
+            // read the data back
+            EntryBinding binding = TupleBinding.getPrimitiveBinding(Long.class);
+            db.get(null, keyEntry, dataEntry, LockMode.DEFAULT);
+            Long l = (Long) binding.entryToObject(dataEntry);
+            System.out.println(l);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+
+        Database classDB = null;
+        try {
+            classDB = env.openDatabase(null, "classDB", dbConfig);
+            StoredClassCatalog classCatalog = new StoredClassCatalog(classDB);
+
+            ToSaveDO toSaveDO = build();
+            EntryBinding dataBinding = new SerialBinding(classCatalog, ToSaveDO.class);
+
+            // write the object
+            DatabaseEntry sKey = new DatabaseEntry(toSaveDO.getTitle().getBytes("UTF-8"));
+            DatabaseEntry sVal = new DatabaseEntry();
+            dataBinding.objectToEntry(toSaveDO, sVal);
+            db.put(null, sKey, sVal);
+
+            // read the object back
+            db.get(null, sKey, sVal, LockMode.DEFAULT);
+            ToSaveDO p = (ToSaveDO) dataBinding.entryToObject(sVal);
+            System.out.println(p);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+
+        if (classDB != null) {
+            classDB.close();
+        }
+        db.close();
+        env.close();
+    }
+}
diff --git a/berkely/src/test/java/com/quick/hui/crawler/berkely/test/SimpleDataTest.java b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/SimpleDataTest.java
new file mode 100644
index 0000000..a89a0ea
--- /dev/null
+++ b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/SimpleDataTest.java
@@ -0,0 +1,57 @@
+package com.quick.hui.crawler.berkely.test;
+
+import com.sleepycat.bind.EntryBinding;
+import com.sleepycat.bind.tuple.TupleBinding;
+import com.sleepycat.je.*;
+import org.junit.Test;
+
+import java.io.File;
+
+/**
+ * Created by yihui on 2017/6/21.
+ */
+public class SimpleDataTest {
+    private Environment env;
+    private Database db;
+    private String key = "akey";
+    private Long data = 1234556633L;
+
+    public void setUp() throws Exception {
+        EnvironmentConfig envConfig = new EnvironmentConfig();
+        envConfig.setAllowCreate(true);
+        env = new Environment(new File("log"), envConfig);
+        DatabaseConfig dbConfig = new DatabaseConfig();
+        dbConfig.setAllowCreate(true);
+        db = env.openDatabase(null, "myDB", dbConfig);
+        DatabaseEntry keyEntry = new DatabaseEntry(key.getBytes("UTF-8"));
+        DatabaseEntry dataEntry = new DatabaseEntry();
+
+        EntryBinding myBinding = TupleBinding.getPrimitiveBinding(Long.class);
+        myBinding.objectToEntry(data, dataEntry);
+        db.put(null, keyEntry, dataEntry);
+    }
+
+    public void testGet() throws Exception {
+        DatabaseEntry keyEntry = new DatabaseEntry(key.getBytes("UTF-8"));
+        DatabaseEntry dataEntry = new DatabaseEntry();
+        EntryBinding binding = TupleBinding.getPrimitiveBinding(Long.class);
+        db.get(null, keyEntry, dataEntry, LockMode.DEFAULT);
+        Long l = (Long) binding.entryToObject(dataEntry);
+        System.out.println(l);
+    }
+
+    public void tearDown() throws Exception {
+        db.close();
+        env.truncateDatabase(null, "myDB", false);
+        env.close();
+    }
+
+    @Test
+    public void main() throws Exception {
+        SimpleDataTest simpleDataTest = new SimpleDataTest();
+        simpleDataTest.setUp();
+        simpleDataTest.testGet();
+        simpleDataTest.tearDown();
+    }
+}
diff --git a/berkely/src/test/java/com/quick/hui/crawler/berkely/test/entity/ToSaveDO.java b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/entity/ToSaveDO.java
new file mode 100644
index 0000000..3b6f741
--- /dev/null
+++ b/berkely/src/test/java/com/quick/hui/crawler/berkely/test/entity/ToSaveDO.java
@@ -0,0 +1,30 @@
+package com.quick.hui.crawler.berkely.test.entity;
+
+import lombok.Getter;
+import lombok.Setter;
+import lombok.ToString;
+
+import java.io.Serializable;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Created by yihui on 2017/6/21.
+ */
+@Getter
+@Setter
+@ToString
+public class ToSaveDO implements Serializable {
+    private static final long serialVersionUID = 968345657199606335L;
+
+    private List<String> names;
+
+    private int id;
+
+    private String title;
+
+    private Boolean unique;
+
+    private Map<String, Object> map;
+}
diff --git a/common/pom.xml b/common/pom.xml
new file mode 100644
index 0000000..c49f27d
--- /dev/null
+++ b/common/pom.xml
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>quick-crawler</artifactId>
+        <groupId>com.quick.hui.crawler</groupId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.quick.hui.crawler</groupId>
+    <artifactId>common</artifactId>
+</project>
\ No newline at end of file
diff --git a/core/pom.xml b/core/pom.xml
new file mode 100644
index 0000000..31fb87a
--- /dev/null
+++ b/core/pom.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>quick-crawler</artifactId>
+        <groupId>com.quick.hui.crawler</groupId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.quick.hui.crawler</groupId>
+    <artifactId>core</artifactId>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.9.2</version>
+        </dependency>
+        <dependency>
+            <groupId>org.projectlombok</groupId>
+            <artifactId>lombok</artifactId>
+            <version>1.16.8</version>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.12</version>
+        </dependency>
+    </dependencies>
+</project>
\ No newline at end of file
diff --git a/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java
new file mode 100644
index 0000000..1e2d2ba
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java
@@ -0,0 +1,37 @@
+package com.quick.hui.crawler.core.entity;
+
+import lombok.Getter;
+import lombok.Setter;
+import lombok.ToString;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+@ToString
+public class CrawlMeta {
+
+    /**
+     * The URL to crawl.
+     */
+    @Getter
+    @Setter
+    private String url;
+
+    /**
+     * Selector rules for extracting content; a single page may yield several
+     * different pieces of content, so the rules are kept in a set.
+     */
+    @Setter
+    private Set<String> selectorRules;
+
+    public Set<String> getSelectorRules() {
+        return selectorRules != null ? selectorRules : new HashSet<>();
+    }
+}
diff --git a/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlResult.java b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlResult.java
new file mode 100644
index 0000000..696dfbe
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlResult.java
@@ -0,0 +1,36 @@
+package com.quick.hui.crawler.core.entity;
+
+import lombok.Getter;
+import lombok.Setter;
+import lombok.ToString;
+import org.jsoup.nodes.Document;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+@Getter
+@Setter
+@ToString
+public class CrawlResult {
+
+    /**
+     * The crawled URL.
+     */
+    private String url;
+
+    /**
+     * The DOM document corresponding to the crawled URL.
+     */
+    private Document htmlDoc;
+
+    /**
+     * The extraction results: the key is the selector rule,
+     * the value is the list of texts matched by that rule.
+     */
+    private Map<String, List<String>> result;
+}
diff --git a/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java b/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java
new file mode 100644
index 0000000..14d68df
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java
@@ -0,0 +1,7 @@
+package com.quick.hui.crawler.core.fetcher;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+public class Fetcher {
+}
diff --git a/core/src/main/java/com/quick/hui/crawler/core/job/AbstractJob.java b/core/src/main/java/com/quick/hui/crawler/core/job/AbstractJob.java
new file mode 100644
index 0000000..6b7d83b
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/job/AbstractJob.java
@@ -0,0 +1,37 @@
+package com.quick.hui.crawler.core.job;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+public abstract class AbstractJob implements IJob {
+
+    public void beforeRun() {
+    }
+
+    public void afterRun() {
+    }
+
+    @Override
+    public void run() {
+        this.beforeRun();
+
+        try {
+            this.doFetchPage();
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+
+        this.afterRun();
+    }
+
+    /**
+     * The concrete page-fetching logic; subclasses must supply the implementation.
+     *
+     * @throws Exception on any fetch failure
+     */
+    public abstract void doFetchPage() throws Exception;
+}
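run() here is a textbook template method: it brackets doFetchPage() with the beforeRun()/afterRun() hooks declared by IJob just below. A sketch of a subclass using those hooks, say a hypothetical TimedCrawlJob that reports how long a fetch took (illustrative, not part of this commit):

    // Hypothetical: measure fetch time via the IJob callbacks.
    public class TimedCrawlJob extends AbstractJob {
        private long start;

        @Override
        public void beforeRun() {
            start = System.currentTimeMillis();
        }

        @Override
        public void afterRun() {
            System.out.println("fetch cost: " + (System.currentTimeMillis() - start) + "ms");
        }

        @Override
        public void doFetchPage() throws Exception {
            // the actual fetch logic would live here
        }
    }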
diff --git a/core/src/main/java/com/quick/hui/crawler/core/job/IJob.java b/core/src/main/java/com/quick/hui/crawler/core/job/IJob.java
new file mode 100644
index 0000000..eecd09c
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/job/IJob.java
@@ -0,0 +1,18 @@
+package com.quick.hui.crawler.core.job;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+public interface IJob extends Runnable {
+
+    /**
+     * Callback invoked before the job runs.
+     */
+    void beforeRun();
+
+    /**
+     * Callback invoked after the job has finished.
+     */
+    void afterRun();
+}
diff --git a/core/src/main/java/com/quick/hui/crawler/core/job/SimpleCrawlJob.java b/core/src/main/java/com/quick/hui/crawler/core/job/SimpleCrawlJob.java
new file mode 100644
index 0000000..5e8268e
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/job/SimpleCrawlJob.java
@@ -0,0 +1,109 @@
+package com.quick.hui.crawler.core.job;
+
+import com.quick.hui.crawler.core.entity.CrawlMeta;
+import com.quick.hui.crawler.core.entity.CrawlResult;
+import lombok.Getter;
+import lombok.Setter;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * The simplest possible crawl job.
+ * <p/>
+ * Created by yihui on 2017/6/27.
+ */
+@Getter
+@Setter
+public class SimpleCrawlJob extends AbstractJob {
+
+    /**
+     * Crawl configuration.
+     */
+    private CrawlMeta crawlMeta;
+
+    /**
+     * Holds the crawl result.
+     */
+    private CrawlResult crawlResult;
+
+    /**
+     * Fetches the page.
+     */
+    public void doFetchPage() throws Exception {
+        URL url = new URL(crawlMeta.getUrl());
+        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
+        BufferedReader in = null;
+
+        StringBuilder result = new StringBuilder();
+
+        try {
+            // set common request headers
+            connection.setRequestProperty("accept", "*/*");
+            connection.setRequestProperty("connection", "Keep-Alive");
+            connection.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
+            // open the actual connection
+            connection.connect();
+
+            // dump all response header fields
+            Map<String, List<String>> map = connection.getHeaderFields();
+            for (String key : map.keySet()) {
+                System.out.println(key + "--->" + map.get(key));
+            }
+
+            // read the response body through a BufferedReader
+            in = new BufferedReader(new InputStreamReader(
+                    connection.getInputStream()));
+            String line;
+            while ((line = in.readLine()) != null) {
+                result.append(line);
+            }
+        } finally { // close the input stream in a finally block
+            try {
+                if (in != null) {
+                    in.close();
+                }
+            } catch (Exception e2) {
+                e2.printStackTrace();
+            }
+        }
+
+        doParse(result.toString());
+    }
+
+    private void doParse(String html) {
+        Document doc = Jsoup.parse(html);
+
+        Map<String, List<String>> map = new HashMap<>(crawlMeta.getSelectorRules().size());
+        for (String rule : crawlMeta.getSelectorRules()) {
+            List<String> list = new ArrayList<>();
+            for (Element element : doc.select(rule)) {
+                list.add(element.text());
+            }
+
+            map.put(rule, list);
+        }
+
+        this.crawlResult = new CrawlResult();
+        this.crawlResult.setHtmlDoc(doc);
+        this.crawlResult.setUrl(crawlMeta.getUrl());
+        this.crawlResult.setResult(map);
+    }
+}
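doFetchPage() drives the HTTP exchange by hand with HttpURLConnection and only hands the assembled HTML to Jsoup for parsing. For comparison, jsoup (already a core dependency) can fetch and parse in a single step; a minimal sketch that trades away the manual header and stream handling above:

    // Sketch: let jsoup perform the request and the parsing in one call.
    Document doc = Jsoup.connect(crawlMeta.getUrl())
            .userAgent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)")
            .timeout(10000) // request timeout in milliseconds
            .get();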
diff --git a/core/src/test/java/com/quick/hui/crawler/core/test/BaseCrawlerTest.java b/core/src/test/java/com/quick/hui/crawler/core/test/BaseCrawlerTest.java
new file mode 100644
index 0000000..23dfa82
--- /dev/null
+++ b/core/src/test/java/com/quick/hui/crawler/core/test/BaseCrawlerTest.java
@@ -0,0 +1,46 @@
+package com.quick.hui.crawler.core.test;
+
+import com.quick.hui.crawler.core.entity.CrawlMeta;
+import com.quick.hui.crawler.core.entity.CrawlResult;
+import com.quick.hui.crawler.core.job.SimpleCrawlJob;
+import org.junit.Test;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+public class BaseCrawlerTest {
+
+    /**
+     * Tests the simplest crawler we have written;
+     * the goal is to fetch a single blog post.
+     */
+    @Test
+    public void testFetch() throws InterruptedException {
+        String url = "https://my.oschina.net/u/566591/blog/1031575";
+        Set<String> selectRule = new HashSet<>();
+        selectRule.add("div[class=title]");     // blog title
+        selectRule.add("div[class=blog-body]"); // blog body
+
+        CrawlMeta crawlMeta = new CrawlMeta();
+        crawlMeta.setUrl(url);
+        crawlMeta.setSelectorRules(selectRule);
+
+        SimpleCrawlJob job = new SimpleCrawlJob();
+        job.setCrawlMeta(crawlMeta);
+        Thread thread = new Thread(job, "crawler-test");
+        thread.start();
+        thread.join();
+
+        CrawlResult result = job.getCrawlResult();
+        System.out.println(result);
+    }
+}
diff --git a/demo/pom.xml b/demo/pom.xml
new file mode 100644
index 0000000..55e46e0
--- /dev/null
+++ b/demo/pom.xml
@@ -0,0 +1,80 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>quick-crawler</artifactId>
+        <groupId>com.quick.hui.crawler</groupId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.quick.hui.crawler</groupId>
+    <artifactId>demo</artifactId>
+
+    <dependencies>
+        <dependency>
+            <groupId>cn.edu.hfut.dmic.webcollector</groupId>
+            <artifactId>WebCollector</artifactId>
+            <version>2.52</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
+        <dependency>
+            <groupId>edu.uci.ics</groupId>
+            <artifactId>crawler4j</artifactId>
+            <version>4.1</version>
+        </dependency>
+
+        <dependency>
+            <groupId>log4j</groupId>
+            <artifactId>log4j</artifactId>
+            <version>1.2.17</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-log4j12</artifactId>
+            <version>1.7.21</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.belerweb</groupId>
+            <artifactId>pinyin4j</artifactId>
+            <version>2.5.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>3.5</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>1.7.21</version>
+        </dependency>
+        <dependency>
+            <groupId>ch.qos.logback</groupId>
+            <artifactId>logback-classic</artifactId>
+            <version>1.1.3</version>
+        </dependency>
+    </dependencies>
+</project>
\ No newline at end of file
diff --git a/demo/src/main/java/com/quick/hui/crawler/demo/book/crawler4j/Book4jCrawler.java b/demo/src/main/java/com/quick/hui/crawler/demo/book/crawler4j/Book4jCrawler.java
new file mode 100644
index 0000000..3d738be
--- /dev/null
+++ b/demo/src/main/java/com/quick/hui/crawler/demo/book/crawler4j/Book4jCrawler.java
@@ -0,0 +1,91 @@
+package com.quick.hui.crawler.demo.book.crawler4j;
+
+import edu.uci.ics.crawler4j.crawler.CrawlConfig;
+import edu.uci.ics.crawler4j.crawler.CrawlController;
+import edu.uci.ics.crawler4j.crawler.Page;
+import edu.uci.ics.crawler4j.crawler.WebCrawler;
+import edu.uci.ics.crawler4j.fetcher.PageFetcher;
+import edu.uci.ics.crawler4j.parser.HtmlParseData;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
+import edu.uci.ics.crawler4j.url.WebURL;
+
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Created by yihui on 2017/6/27.
+ */
+public class Book4jCrawler extends WebCrawler {
+    /**
+     * Regex matching file suffixes we do not want to fetch.
+     */
+    private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
+            + "|png|mp3|mp4|zip|gz))$");
+
+    /**
+     * Decides which URLs should be crawled: return true for URLs we want,
+     * false for those we do not.
+     * The first parameter wraps the page currently being crawled;
+     * the second wraps the candidate URL found on it.
+     */
+    @Override
+    public boolean shouldVisit(Page referringPage, WebURL url) {
+        String href = url.getURL().toLowerCase();  // lower-case the url
+        return !FILTERS.matcher(href).matches()    // drop unwanted file suffixes
+                && href.startsWith("http://www.xxbiquge.com/"); // stay on the seed site
+    }
+
+    /**
+     * Called whenever a page we asked for has been fetched;
+     * the page parameter wraps everything known about that page.
+     */
+    @Override
+    public void visit(Page page) {
+        String url = page.getWebURL().getURL(); // the page url
+        System.out.println("URL: " + url);
+
+        if (page.getParseData() instanceof HtmlParseData) { // is it html data?
+            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
+            String text = htmlParseData.getText();               // plain text, tags stripped
+            String html = htmlParseData.getHtml();               // raw html
+            Set<WebURL> links = htmlParseData.getOutgoingUrls(); // outgoing links
+
+            System.out.println("text length: " + text.length());
+            System.out.println("html length: " + html.length());
+            System.out.println("number of outgoing links: " + links.size());
+        }
+    }
+
+    public static void main(String[] args) throws Exception {
+        String crawlStorageFolder = "/tmp/crawl"; // where the crawler keeps its data
+        int numberOfCrawlers = 7;                 // seven crawler threads
+
+        CrawlConfig config = new CrawlConfig();
+        config.setCrawlStorageFolder(crawlStorageFolder);
+
+        /*
+         * Set up the crawl controller.
+         */
+        PageFetcher pageFetcher = new PageFetcher(config);       // the page fetcher
+        RobotstxtConfig robotstxtConfig = new RobotstxtConfig(); // robots handling config, e.g. the user-agent
+
+        // Handles each site's robots.txt, which declares which pages may and may not be crawled.
+        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
+        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
+
+        /*
+         * Seed pages: where crawling starts; several seeds may be configured.
+         */
+        controller.addSeed("http://www.xxbiquge.com/8_8947/");
+        controller.addSeed("http://www.xxbiquge.com/8_8947/1547593.html");
+//        controller.addSeed("http://www.java1234.com/a/kaiyuan/");
+//        controller.addSeed("http://www.java1234.com/a/bysj/");
+
+        /*
+         * Start crawling with the configuration above; this call blocks until the crawl ends.
+         */
+        controller.start(Book4jCrawler.class, numberOfCrawlers);
+    }
+}
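main() only sets the storage folder on CrawlConfig, but that object carries most of the crawler's politeness and scope settings. A sketch of commonly tuned options (illustrative values, crawler4j 4.1 API):

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder("/tmp/crawl");
    config.setPolitenessDelay(1000);    // wait 1s between requests to the same host
    config.setMaxDepthOfCrawling(3);    // follow links at most 3 hops from a seed
    config.setMaxPagesToFetch(1000);    // stop after 1000 pages overall
    config.setResumableCrawling(false); // do not resume an interrupted crawl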
diff --git a/demo/src/main/java/com/quick/hui/crawler/demo/book/webcollect/BookSearch.java b/demo/src/main/java/com/quick/hui/crawler/demo/book/webcollect/BookSearch.java
new file mode 100644
index 0000000..20d2d18
--- /dev/null
+++ b/demo/src/main/java/com/quick/hui/crawler/demo/book/webcollect/BookSearch.java
@@ -0,0 +1,58 @@
+package com.quick.hui.crawler.demo.book.webcollect;
+
+import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
+import cn.edu.hfut.dmic.webcollector.model.Page;
+import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * A crawler that fetches a novel.
+ *
+ * Created by yihui on 2017/6/21.
+ */
+public class BookSearch extends BreadthCrawler {
+
+    /**
+     * Constructs a crawler backed by Berkeley DB.
+     * The Berkeley DB folder is crawlPath, which keeps the visited-URL history and other state.
+     * Different tasks must not share the same crawlPath:
+     * two crawlers running in parallel on the same crawlPath will produce errors.
+     *
+     * @param crawlPath the folder used by Berkeley DB
+     * @param autoParse whether to auto-detect new URLs using the configured regex
+     */
+    public BookSearch(String crawlPath, boolean autoParse) {
+        super(crawlPath, autoParse);
+    }
+
+    public void visit(Page page, CrawlDatums next) {
+        if (!page.matchUrl(regex)) {
+            return;
+        }
+
+        page.charset("utf-8");
+
+        String title = page.select("div[class=bookname]>h1").text();
+        String content = page.select("div[id=content]").html();
+        content = StringUtils.replace(content, "    ", "\n\t");
+
+        System.out.println("title: " + title + " content: " + content);
+    }
+
+    static String regex = "http://www.xxbiquge.com/8_8947/[0-9]+.html";
+
+    public static void main(String[] args) throws Exception {
+        String seed = "http://www.xxbiquge.com/8_8947/";
+
+        BookSearch bookSearch = new BookSearch("book", true);
+        bookSearch.addSeed(seed);
+        bookSearch.addRegex(regex);
+
+        bookSearch.setThreads(1);  // a single crawl thread
+        bookSearch.setTopN(5000);
+        bookSearch.start(2);       // crawl two levels deep
+    }
+}
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..71e4f72
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.quick.hui.crawler</groupId>
+    <artifactId>quick-crawler</artifactId>
+    <packaging>pom</packaging>
+    <version>1.0-SNAPSHOT</version>
+
+    <modules>
+        <module>core</module>
+        <module>common</module>
+        <module>demo</module>
+        <module>berkely</module>
+    </modules>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <configuration>
+                    <source>1.8</source>
+                    <target>1.8</target>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+</project>
\ No newline at end of file