From a30aaf0dfdb28b0bcfeb873696e93381e9bcdcf2 Mon Sep 17 00:00:00 2001
From: yihui
Date: Fri, 7 Jul 2017 12:28:33 +0800
Subject: [PATCH] v0.005 Add JobCount to drive the crawl-finished flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                      | 17 +++-
 .../hui/crawler/core/entity/CrawlMeta.java     | 24 ++++-
 .../hui/crawler/core/fetcher/FetchQueue.java   | 92 +++++++++++++++----
 .../hui/crawler/core/fetcher/Fetcher.java      | 12 +--
 .../hui/crawler/core/fetcher/JobCount.java     | 77 ++++++++++++++++
 .../hui/crawler/core/filter/ResultFilter.java  | 84 +++++++++++++++++
 .../core/job/DefaultAbstractCrawlJob.java      | 56 +----------
 .../crawler/core/test/QueueCrawlerTest.java    |  6 +-
 .../demo/book/webcollect/TestCrawl.java        | 44 +++++++++
 9 files changed, 327 insertions(+), 85 deletions(-)
 create mode 100644 core/src/main/java/com/quick/hui/crawler/core/fetcher/JobCount.java
 create mode 100644 core/src/main/java/com/quick/hui/crawler/core/filter/ResultFilter.java
 create mode 100644 demo/src/main/java/com/quick/hui/crawler/demo/book/webcollect/TestCrawl.java

diff --git a/README.md b/README.md
index 8ed5ad8..c23a570 100644
--- a/README.md
+++ b/README.md
@@ -31,4 +31,19 @@
 > Implement the crawl queue
 
 - Each Job only crawls its own page and pushes the links that qualify for deeper crawling into the queue
-- Added the Fetcher class to control crawl tasks
\ No newline at end of file
+- Added the Fetcher class to control crawl tasks
+
+
+### 5. [v0.005](https://github.com/liuyueyi/quick-crawler/releases/tag/v0.005)
+
+> Separate page fetching from result parsing inside a Job; add the flag that marks the whole task as finished
+
+- Added ResultFilter to analyse a fetched page and push the qualifying links into the crawl queue
+- Added JobCount to record how many sub-jobs of a task have finished, which is what sets the finished flag for the whole task
+
+
+## Related posts
+
+- [Writing a Java crawler by hand (1): a minimal crawler](http://zbang.online:8080/articles/2017/07/05/1499239054423.html)
+- [Writing a Java crawler by hand (2): depth crawling](http://zbang.online:8080/articles/2017/07/05/1499239349163.html)
+- [Writing a Java crawler by hand (3): the crawl queue](http://zbang.online:8080/articles/2017/07/07/1499401540323.html)
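The release notes above compress the v0.005 mechanism into one sentence: every crawled page records how many follow-up links it produced, and the crawl is declared finished once those counts have all been consumed back up to the seed. A small stand-alone sketch of that bookkeeping idea, with purely illustrative names (this is not project code, just the concept the JobCount changes below implement):

```java
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

public class CompletionTreeSketch {

    static class Node {
        final int id;
        final int parentId;          // -1 marks the seed
        final int children;          // links spawned by this page
        final AtomicInteger done = new AtomicInteger(0);

        Node(int id, int parentId, int children) {
            this.id = id;
            this.parentId = parentId;
            this.children = children;
        }
    }

    private final Map<Integer, Node> nodes = new ConcurrentHashMap<>();

    void register(Node node) {
        nodes.put(node.id, node);
    }

    /** A leaf finished: walk towards the seed, bumping each parent's counter. */
    boolean finishLeaf(int id) {
        Node node = nodes.get(id);
        while (node.parentId != -1) {
            Node parent = nodes.get(node.parentId);
            if (parent.done.incrementAndGet() < parent.children) {
                return false;        // the parent still waits for siblings
            }
            node = parent;           // the parent is now complete too, keep bubbling up
        }
        return true;                 // reached the seed: the whole crawl is over
    }

    public static void main(String[] args) {
        CompletionTreeSketch tree = new CompletionTreeSketch();
        tree.register(new Node(1, -1, 2));      // seed with two child pages
        tree.register(new Node(2, 1, 0));       // leaf
        tree.register(new Node(3, 1, 0));       // leaf
        System.out.println(tree.finishLeaf(2)); // false - one sibling still outstanding
        System.out.println(tree.finishLeaf(3)); // true  - everything done
    }
}
```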
diff --git a/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java
index b5f5891..090449b 100644
--- a/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java
+++ b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java
@@ -1,6 +1,7 @@
 package com.quick.hui.crawler.core.entity;
 
 
+import com.quick.hui.crawler.core.fetcher.JobCount;
 import lombok.*;
 
 import java.util.HashSet;
@@ -11,9 +12,25 @@
  * Created by yihui on 2017/6/27.
  */
 @ToString
-@NoArgsConstructor
 @AllArgsConstructor
 public class CrawlMeta {
+
+    /**
+     * The {@link JobCount#id} that this task corresponds to
+     */
+    @Getter
+    @Setter
+    private int jobId;
+
+
+    /**
+     * The {@link JobCount#parentId} that this task corresponds to
+     */
+    @Getter
+    @Setter
+    private int parentJobId;
+
+
     /**
      * Depth of the current crawl
      */
@@ -69,4 +86,9 @@ public Set<Pattern> addNegativeRegex(String regex) {
         this.negativeRegex.add(Pattern.compile(regex));
         return this.negativeRegex;
     }
+
+
+    public CrawlMeta() {
+        this.jobId = JobCount.genId();
+    }
 }
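To make the relationship between the two new id fields concrete, here is a rough sketch of how a seed task and one of its child tasks end up wired together, reusing the CrawlMeta / JobCount API exactly as it appears in this patch (the URLs are only examples; the same wiring is performed by ResultFilter further down):

```java
import com.quick.hui.crawler.core.entity.CrawlMeta;
import com.quick.hui.crawler.core.fetcher.JobCount;

public class JobTreeWiringSketch {
    public static void main(String[] args) {
        CrawlMeta seed = new CrawlMeta();        // the no-args constructor assigns jobId = JobCount.genId()
        seed.setUrl("http://chengyu.t086.com/gushi/1.htm");

        // A link found on the seed page becomes a child task: it gets a fresh jobId of its
        // own and records the seed's jobId as parentJobId, one level deeper than the seed.
        CrawlMeta child = new CrawlMeta(
                JobCount.genId(),
                seed.getJobId(),
                seed.getCurrentDepth() + 1,
                "http://chengyu.t086.com/gushi/2.html",   // example child URL
                seed.getSelectorRules(),
                seed.getPositiveRegex(),
                seed.getNegativeRegex());

        System.out.println(child.getJobId() + " -> parent " + child.getParentJobId());
    }
}
```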
diff --git a/core/src/main/java/com/quick/hui/crawler/core/fetcher/FetchQueue.java b/core/src/main/java/com/quick/hui/crawler/core/fetcher/FetchQueue.java
index 30c215d..70a872c 100644
--- a/core/src/main/java/com/quick/hui/crawler/core/fetcher/FetchQueue.java
+++ b/core/src/main/java/com/quick/hui/crawler/core/fetcher/FetchQueue.java
@@ -2,6 +2,7 @@
 
 import com.quick.hui.crawler.core.entity.CrawlMeta;
 
+import java.util.Map;
 import java.util.Queue;
 import java.util.Set;
 import java.util.concurrent.ArrayBlockingQueue;
@@ -25,13 +26,19 @@ public class FetchQueue {
     /**
      * Queue of pages waiting to be crawled
      */
-    private Queue<CrawlMeta> toFetchQueue = new ArrayBlockingQueue<>(200);
+    private Queue<CrawlMeta> toFetchQueue = new ArrayBlockingQueue<>(2000);
 
-//
-//    /**
-//     * Result queue, used to analyse in-page links and feed the next crawlable links into {@link FetchQueue#toFetchQueue}
-//     */
-//    private Queue<CrawlResult> fetchResultQueue = new ArrayBlockingQueue<>(200);
+
+    /**
+     * JobCount map; the key is {@link JobCount#id}, the value is the corresponding JobCount
+     */
+    public Map<Integer, JobCount> jobCountMap = new ConcurrentHashMap<>();
+
+
+    /**
+     * Flag marking whether the whole crawl has finished
+     */
+    public volatile boolean isOver = false;
 
 
     /**
@@ -54,20 +61,22 @@ public static FetchQueue newInstance(String tag) {
      * Only enqueue a URL that has not been crawled yet, mainly to avoid crawling the same page twice
      *
      * @param crawlMeta
+     * @return true if the seed was enqueued; false if it has already been crawled
      */
-    public void addSeed(CrawlMeta crawlMeta) {
+    public boolean addSeed(CrawlMeta crawlMeta) {
         if (urls.contains(crawlMeta.getUrl())) {
-            return;
+            return false;
         }
 
         synchronized (this) {
             if (urls.contains(crawlMeta.getUrl())) {
-                return;
+                return false;
             }
 
             urls.add(crawlMeta.getUrl());
             toFetchQueue.add(crawlMeta);
+            return true;
         }
     }
 
 
@@ -76,13 +85,60 @@ public CrawlMeta pollSeed() {
         return toFetchQueue.poll();
     }
 
-//
-//    public void addResult(CrawlResult crawlResult) {
-//        this.fetchResultQueue.add(crawlResult);
-//    }
-//
-//
-//    public CrawlResult pollResult() {
-//        return fetchResultQueue.poll();
-//    }
+
+    public void finishJob(CrawlMeta crawlMeta, int count, int maxDepth) {
+        if (finishOneJob(crawlMeta, count, maxDepth)) {
+            isOver = true;
+            System.out.println("============ finish crawl! ======");
+        }
+    }
+
+
+    /**
+     * Finish one crawl job
+     *
+     * @param crawlMeta the job that has just been crawled
+     * @param count     number of links on the crawled page that still qualify for crawling
+     * @return true if everything has been crawled
+     */
+    private boolean finishOneJob(CrawlMeta crawlMeta, int count, int maxDepth) {
+        JobCount jobCount = new JobCount(crawlMeta.getJobId(),
+                crawlMeta.getParentJobId(),
+                crawlMeta.getCurrentDepth(),
+                count, 0);
+        jobCountMap.put(crawlMeta.getJobId(), jobCount);
+
+
+        if (crawlMeta.getCurrentDepth() == 0) { // special-case the seed page
+            return count == 0; // no child links to crawl, so the crawl ends right away
+        }
+
+
+        if (count == 0 || crawlMeta.getCurrentDepth() == maxDepth) {
+            // this job sits on the last level, so bump the finished count of its parent
+            return finishOneJob(jobCountMap.get(crawlMeta.getParentJobId()));
+        }
+
+
+        return false;
+    }
+
+
+    /**
+     * Recursively propagate the "one more job finished" count upwards
+     *
+     * @param jobCount
+     * @return true when every job has finished crawling
+     */
+    private boolean finishOneJob(JobCount jobCount) {
+        if (jobCount.finishJob()) {
+            if (jobCount.getCurrentDepth() == 0) {
+                return true; // done
+            }
+
+            return finishOneJob(jobCountMap.get(jobCount.getParentId()));
+        }
+
+        return false;
+    }
 }
diff --git a/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java b/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java
index e3401ab..471423b 100644
--- a/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java
+++ b/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java
@@ -57,22 +57,16 @@ private void initExecutor() {
     }
 
 
-
     public void start(Class<? extends DefaultAbstractCrawlJob> clz) throws Exception {
         CrawlMeta crawlMeta;
-        int i = 0;
-        while (true) {
+
+        while (!fetchQueue.isOver) {
             crawlMeta = fetchQueue.pollSeed();
             if (crawlMeta == null) {
                 Thread.sleep(200);
-                if (++i > 300) { // exit after a full minute with no data
-                    break;
-                }
-
                 continue;
             }
 
-            i = 0;
             DefaultAbstractCrawlJob job = clz.newInstance();
             job.setDepth(this.maxDepth);
 
@@ -84,7 +78,6 @@ public void start(Class<? extends DefaultAbstractCrawlJob> clz) throws Excep
     }
 
 
-
     private static class CustomThreadFactory implements ThreadFactory {
 
         private String name;
@@ -102,7 +95,6 @@ public Thread newThread(Runnable r) {
         }
     }
 
-
     @Getter
     @Setter
     @ToString
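The FetchQueue/Fetcher changes above combine two small patterns: a check-lock-recheck de-duplicating enqueue, and a consumer loop that now spins on the volatile isOver flag instead of counting empty polls. A stand-alone sketch of both on plain JDK types (illustrative code, not part of the project):

```java
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;

public class DedupQueueSketch {

    private final Set<String> seen = ConcurrentHashMap.newKeySet();
    private final BlockingQueue<String> queue = new ArrayBlockingQueue<>(2000);

    /** volatile so the consumer thread sees the update made from the producer side. */
    public volatile boolean isOver = false;

    /** @return true if the url was enqueued, false if it was already known. */
    public boolean addSeed(String url) {
        if (seen.contains(url)) {
            return false;                      // cheap unsynchronized fast path
        }
        synchronized (this) {
            if (seen.contains(url)) {          // re-check under the lock
                return false;
            }
            seen.add(url);
            queue.add(url);
            return true;
        }
    }

    public void consume() throws InterruptedException {
        while (!isOver) {
            String url = queue.poll();
            if (url == null) {
                Thread.sleep(200);             // nothing to do yet, back off briefly
                continue;
            }
            System.out.println("crawl " + url); // hand off to a worker thread in the real code
        }
    }

    public static void main(String[] args) {
        DedupQueueSketch q = new DedupQueueSketch();
        System.out.println(q.addSeed("http://chengyu.t086.com/gushi/1.htm")); // true  - first time
        System.out.println(q.addSeed("http://chengyu.t086.com/gushi/1.htm")); // false - already queued
    }
}
```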
+ */ +@Getter +public class JobCount { + + /** + * 种子对应的id + */ + public static int SEED_ID = 1; + + public static AtomicInteger idGen = new AtomicInteger(0); + + + public static int genId() { + return idGen.addAndGet(1); + } + + + /** + * 该Job对应的唯一ID + */ + private int id; + + + /** + * 该job对应父job的id + */ + private int parentId; + + + /** + * 当前的层数 + */ + private int currentDepth; + + + /** + * 该job对应的网页中,子Job的数量 + */ + private AtomicInteger jobCount = new AtomicInteger(0); + + + /** + * 该Job对应的网页中, 子Job完成的数量 + */ + private AtomicInteger finishCount = new AtomicInteger(0); + + + public boolean fetchOver() { + return jobCount.get() == finishCount.get(); + } + + + /** + * 爬取完成一个子任务 + */ + public synchronized boolean finishJob() { + finishCount.addAndGet(1); + return fetchOver(); + } + + + public JobCount(int id, int parentId, int currentDepth, int jobCount, int finishCount) { + this.id = id; + this.parentId = parentId; + this.currentDepth = currentDepth; + this.jobCount.set(jobCount); + this.finishCount.set(finishCount); + } +} diff --git a/core/src/main/java/com/quick/hui/crawler/core/filter/ResultFilter.java b/core/src/main/java/com/quick/hui/crawler/core/filter/ResultFilter.java new file mode 100644 index 0000000..5f602c8 --- /dev/null +++ b/core/src/main/java/com/quick/hui/crawler/core/filter/ResultFilter.java @@ -0,0 +1,84 @@ +package com.quick.hui.crawler.core.filter; + +import com.quick.hui.crawler.core.entity.CrawlMeta; +import com.quick.hui.crawler.core.entity.CrawlResult; +import com.quick.hui.crawler.core.fetcher.FetchQueue; +import com.quick.hui.crawler.core.fetcher.JobCount; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Created by yihui on 2017/7/6. 
+ */ +public class ResultFilter { + + + public static void filter(CrawlMeta crawlMeta, + CrawlResult crawlResult, + FetchQueue fetchQueue, + int maxDepth) { + int count = 0; + try { + // 解析返回的网页中的链接,将满足条件的扔到爬取队列中 + int currentDepth = crawlMeta.getCurrentDepth(); + if (currentDepth >= maxDepth) { + return; + } + + + // 当前的网址中可以继续爬的链接数 + + Elements elements = crawlResult.getHtmlDoc().select("a[href]"); + String src; + for (Element element : elements) { + // 确保将相对地址转为绝对地址 + src = element.attr("abs:href"); + if (!matchRegex(crawlMeta, src)) { + continue; + } + + CrawlMeta meta = new CrawlMeta( + JobCount.genId(), + crawlMeta.getJobId(), + currentDepth + 1, + src, + crawlMeta.getSelectorRules(), + crawlMeta.getPositiveRegex(), + crawlMeta.getNegativeRegex()); + if (fetchQueue.addSeed(meta)) { + count++; + } + } + + } finally { // 上一层爬完计数+1 + fetchQueue.finishJob(crawlMeta, count, maxDepth); + } + + } + + + private static boolean matchRegex(CrawlMeta crawlMeta, String url) { + Matcher matcher; + for (Pattern pattern : crawlMeta.getPositiveRegex()) { + matcher = pattern.matcher(url); + if (matcher.find()) { + return true; + } + } + + + for (Pattern pattern : crawlMeta.getNegativeRegex()) { + matcher = pattern.matcher(url); + if (matcher.find()) { + return false; + } + } + + + return crawlMeta.getPositiveRegex().size() == 0; + } + +} diff --git a/core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java b/core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java index 5608b7e..cf8cb9e 100644 --- a/core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java +++ b/core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java @@ -4,6 +4,7 @@ import com.quick.hui.crawler.core.entity.CrawlMeta; import com.quick.hui.crawler.core.entity.CrawlResult; import com.quick.hui.crawler.core.fetcher.FetchQueue; +import com.quick.hui.crawler.core.filter.ResultFilter; import com.quick.hui.crawler.core.utils.HttpUtils; import lombok.Getter; import lombok.NoArgsConstructor; @@ -14,14 +15,11 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; /** * Created by yihui on 2017/6/29. 
@@ -45,7 +43,7 @@ public abstract class DefaultAbstractCrawlJob extends AbstractJob { /** * 配置项信息 */ - private CrawlMeta crawlMeta; + protected CrawlMeta crawlMeta; /** @@ -86,31 +84,8 @@ void doFetchPage() throws Exception { // 回调用户的网页内容解析方法 this.visit(this.crawlResult); - - - // 解析返回的网页中的链接,将满足条件的扔到爬取队列中 - int currentDepth = this.crawlMeta.getCurrentDepth(); - if (currentDepth > depth) { - return; - } - - - Elements elements = crawlResult.getHtmlDoc().select("a[href]"); - String src; - for (Element element : elements) { - // 确保将相对地址转为绝对地址 - src = element.attr("abs:href"); - if (!matchRegex(src)) { - continue; - } - - CrawlMeta meta = new CrawlMeta(currentDepth + 1, - src, - this.crawlMeta.getSelectorRules(), - this.crawlMeta.getPositiveRegex(), - this.crawlMeta.getNegativeRegex()); - fetchQueue.addSeed(meta); - } + // 结果过滤 + ResultFilter.filter(crawlMeta, crawlResult, fetchQueue, depth); } @@ -137,29 +112,6 @@ private CrawlResult doParse(String html, CrawlMeta meta) { } - private boolean matchRegex(String url) { - Matcher matcher; - for (Pattern pattern : crawlMeta.getPositiveRegex()) { - matcher = pattern.matcher(url); - if (matcher.find()) { - return true; - } - } - - - for (Pattern pattern : crawlMeta.getNegativeRegex()) { - matcher = pattern.matcher(url); - if (matcher.find()) { - return false; - } - } - - - return crawlMeta.getPositiveRegex().size() == 0; - } - - - protected void setResponseCode(String code) { httpConf.setCode(code); } diff --git a/core/src/test/java/com/quick/hui/crawler/core/test/QueueCrawlerTest.java b/core/src/test/java/com/quick/hui/crawler/core/test/QueueCrawlerTest.java index e3e4131..4bdcf2f 100644 --- a/core/src/test/java/com/quick/hui/crawler/core/test/QueueCrawlerTest.java +++ b/core/src/test/java/com/quick/hui/crawler/core/test/QueueCrawlerTest.java @@ -22,19 +22,19 @@ public void beforeRun() { @Override protected void visit(CrawlResult crawlResult) { - System.out.println(Thread.currentThread().getName() + " ___ " + crawlResult.getUrl()); + System.out.println(Thread.currentThread().getName() + "___" + crawlMeta.getCurrentDepth() + "___" + crawlResult.getUrl()); } } @Test public void testCrawel() throws Exception { - Fetcher fetcher = new Fetcher(1); + Fetcher fetcher = new Fetcher(2); String url = "http://chengyu.t086.com/gushi/1.htm"; CrawlMeta crawlMeta = new CrawlMeta(); crawlMeta.setUrl(url); - crawlMeta.addPositiveRegex("http://chengyu.t086.com/gushi/[0-9]+\\.htm$"); + crawlMeta.addPositiveRegex("http://chengyu.t086.com/gushi/[0-9]+\\.html$"); fetcher.addFeed(crawlMeta); diff --git a/demo/src/main/java/com/quick/hui/crawler/demo/book/webcollect/TestCrawl.java b/demo/src/main/java/com/quick/hui/crawler/demo/book/webcollect/TestCrawl.java new file mode 100644 index 0000000..10aecfd --- /dev/null +++ b/demo/src/main/java/com/quick/hui/crawler/demo/book/webcollect/TestCrawl.java @@ -0,0 +1,44 @@ +package com.quick.hui.crawler.demo.book.webcollect; + +import cn.edu.hfut.dmic.webcollector.model.CrawlDatums; +import cn.edu.hfut.dmic.webcollector.model.Page; +import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler; + +/** + * Created by yihui on 2017/7/6. 
+ */ +public class TestCrawl extends BreadthCrawler { + /** + * 构造一个基于伯克利DB的爬虫 + * 伯克利DB文件夹为crawlPath,crawlPath中维护了历史URL等信息 + * 不同任务不要使用相同的crawlPath + * 两个使用相同crawlPath的爬虫并行爬取会产生错误 + * + * @param crawlPath 伯克利DB使用的文件夹 + * @param autoParse 是否根据设置的正则自动探测新URL + */ + public TestCrawl(String crawlPath, boolean autoParse) { + super(crawlPath, autoParse); + } + + @Override + public void visit(Page page, CrawlDatums next) { + System.out.println(page.getUrl()); + } + + + public static void main(String[] args) throws Exception { + String url = "http://chengyu.t086.com/gushi/1.htm"; + String regix = "http://chengyu.t086.com/gushi/[0-9]+\\.html$"; + + + BookSearch bookSearch = new BookSearch("book", true); + bookSearch.addSeed(url); + bookSearch.addRegex(regix); + + + bookSearch.setThreads(1); + bookSearch.setTopN(5000); + bookSearch.start(3); + } +}