From 07326300faccf016971e28fbb59487c99dc04f35 Mon Sep 17 00:00:00 2001
From: yihui
Date: Sat, 8 Jul 2017 11:02:08 +0800
Subject: [PATCH] v0.006 logging instrumentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                          |  8 +++++++-
 core/pom.xml                                       | 16 ++++++++++++++++
 .../hui/crawler/core/fetcher/FetchQueue.java       | 10 ++++++++--
 .../hui/crawler/core/fetcher/Fetcher.java          | 15 +++++++++++++++
 .../hui/crawler/core/filter/ResultFilter.java      | 19 +++++++++++++++++++
 .../core/job/DefaultAbstractCrawlJob.java          | 17 +++++++++++++++++
 .../crawler/core/test/QueueCrawlerTest.java        |  2 +-
 .../hui/crawler/core/test/base/RegexTest.java      |  5 ++++-
 core/src/test/resources/logback-test.xml           | 16 ++++++++++++++++
 9 files changed, 103 insertions(+), 5 deletions(-)
 create mode 100644 core/src/test/resources/logback-test.xml

diff --git a/README.md b/README.md
index c23a570..707f04d 100644
--- a/README.md
+++ b/README.md
@@ -42,8 +42,14 @@
 - Added JobCount to record the number of completed crawl tasks, which is used to mark the end of the whole crawl job
 
+### 6. [v0.006](https://github.com/liuyueyi/quick-crawler/releases/tag/v0.006)
+
+> Add logging instrumentation
+
 ## Related posts
 
 - [Write a crawler in Java: 1. The simplest crawler](http://zbang.online:8080/articles/2017/07/05/1499239054423.html)
-- [Write a crawler in Java: 2.  Deep crawling](http://zbang.online:8080/articles/2017/07/05/1499239349163.html)
+- [Write a crawler in Java: 2. Deep crawling](http://zbang.online:8080/articles/2017/07/05/1499239349163.html)
 - [Write a crawler in Java: 3. The crawl queue](http://zbang.online:8080/articles/2017/07/07/1499401540323.html)
+
+
diff --git a/core/pom.xml b/core/pom.xml
index 046bb2d..6023b05 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -49,6 +49,22 @@
             <artifactId>commons-collections</artifactId>
             <version>3.2.2</version>
         </dependency>
+
+
+
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>1.7.21</version>
+        </dependency>
+
+        <dependency>
+            <groupId>ch.qos.logback</groupId>
+            <artifactId>logback-classic</artifactId>
+            <version>1.1.7</version>
+        </dependency>
+
+
 
     </dependencies>
 
diff --git a/core/src/main/java/com/quick/hui/crawler/core/fetcher/FetchQueue.java b/core/src/main/java/com/quick/hui/crawler/core/fetcher/FetchQueue.java
index 70a872c..594966b 100644
--- a/core/src/main/java/com/quick/hui/crawler/core/fetcher/FetchQueue.java
+++ b/core/src/main/java/com/quick/hui/crawler/core/fetcher/FetchQueue.java
@@ -1,6 +1,7 @@
 package com.quick.hui.crawler.core.fetcher;
 
 import com.quick.hui.crawler.core.entity.CrawlMeta;
+import lombok.extern.slf4j.Slf4j;
 
 import java.util.Map;
 import java.util.Queue;
@@ -13,6 +14,7 @@
  *
  * <p>
  * Created by yihui on 2017/7/6.
  */
+@Slf4j
 public class FetchQueue {
 
     public static FetchQueue DEFAULT_INSTANCE = newInstance("default");
@@ -44,7 +46,7 @@ public class FetchQueue {
     /**
      * Set of all URLs that have been crawled, used for deduplication
      */
-    private Set<String> urls = ConcurrentHashMap.newKeySet();
+    public Set<String> urls = ConcurrentHashMap.newKeySet();
 
 
     private FetchQueue(String tag) {
@@ -57,6 +59,11 @@ public static FetchQueue newInstance(String tag) {
     }
 
 
+    public int size() {
+        return toFetchQueue.size();
+    }
+
+
     /**
      * Enqueue a URL only if it has not been crawled yet, mainly to avoid crawling the same page twice
      *
@@ -89,7 +96,6 @@ public CrawlMeta pollSeed() {
     public void finishJob(CrawlMeta crawlMeta, int count, int maxDepth) {
         if (finishOneJob(crawlMeta, count, maxDepth)) {
             isOver = true;
-            System.out.println("============ finish crawl! ======");
         }
     }
 
diff --git a/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java b/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java
index 471423b..8fcbaac 100644
--- a/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java
+++ b/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java
@@ -3,6 +3,7 @@
 import com.quick.hui.crawler.core.entity.CrawlMeta;
 import com.quick.hui.crawler.core.job.DefaultAbstractCrawlJob;
 import lombok.*;
+import lombok.extern.slf4j.Slf4j;
 
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -10,6 +11,7 @@
 /**
  * Created by yihui on 2017/6/27.
  */
+@Slf4j
 public class Fetcher {
 
     private int maxDepth;
@@ -58,8 +60,17 @@ private void initExecutor() {
 
 
     public void start(Class clz) throws Exception {
+        long start = System.currentTimeMillis();
         CrawlMeta crawlMeta;
 
+        if (fetchQueue.size() == 0) {
+            throw new IllegalArgumentException("please choose one seed to start crawling!");
+        }
+
+
+        log.info(">>>>>>>>>>>> start crawl <<<<<<<<<<<<");
+
+
         while (!fetchQueue.isOver) {
             crawlMeta = fetchQueue.pollSeed();
             if (crawlMeta == null) {
@@ -75,6 +86,10 @@ public void start(Class clz) throws Exception {
             executor.execute(job);
         }
+
+
+        long end = System.currentTimeMillis();
+        log.info(">>>>>>>>>>>> crawl over! total url num: {}, cost: {}ms <<<<<<<<<<<<", fetchQueue.urls.size(), end - start);
     }
 
 
diff --git a/core/src/main/java/com/quick/hui/crawler/core/filter/ResultFilter.java b/core/src/main/java/com/quick/hui/crawler/core/filter/ResultFilter.java
index 5f602c8..ec067cd 100644
--- a/core/src/main/java/com/quick/hui/crawler/core/filter/ResultFilter.java
+++ b/core/src/main/java/com/quick/hui/crawler/core/filter/ResultFilter.java
@@ -4,6 +4,7 @@
 import com.quick.hui.crawler.core.entity.CrawlResult;
 import com.quick.hui.crawler.core.fetcher.FetchQueue;
 import com.quick.hui.crawler.core.fetcher.JobCount;
+import lombok.extern.slf4j.Slf4j;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 
@@ -13,6 +14,7 @@
 /**
  * Created by yihui on 2017/7/6.
  */
+@Slf4j
 public class ResultFilter {
 
 
@@ -21,6 +23,7 @@ public static void filter(CrawlMeta crawlMeta,
                               FetchQueue fetchQueue,
                               int maxDepth) {
         int count = 0;
+        long start = System.currentTimeMillis();
         try {
             // Parse the links in the returned page, and push the ones matching the filter into the crawl queue
             int currentDepth = crawlMeta.getCurrentDepth();
@@ -49,12 +52,28 @@ public static void filter(CrawlMeta crawlMeta,
                         crawlMeta.getPositiveRegex(), crawlMeta.getNegativeRegex());
                 if (fetchQueue.addSeed(meta)) {
+
+                    if (log.isDebugEnabled()) {
+                        log.debug("put into queue! parentUrl:{} url: {} depth: {}",
+                                crawlMeta.getUrl(),
+                                src,
+                                currentDepth + 1);
+                    }
+
                     count++;
                 }
             }
         } finally {
             // The parent level finished crawling one page, count it
             fetchQueue.finishJob(crawlMeta, count, maxDepth);
+
+            long end = System.currentTimeMillis();
+            if (log.isDebugEnabled()) {
+                log.debug("url {} subUrl counts: {}, filter result cost: {}ms, currentDepth: {} \n\n",
+                        crawlMeta.getUrl(),
+                        count, end - start,
+                        crawlMeta.getCurrentDepth());
+            }
         }
     }
 
 
diff --git a/core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java b/core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java
index cf8cb9e..c2c79ae 100644
--- a/core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java
+++ b/core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java
@@ -9,6 +9,7 @@
 import lombok.Getter;
 import lombok.NoArgsConstructor;
 import lombok.Setter;
+import lombok.extern.slf4j.Slf4j;
 import org.apache.http.HttpResponse;
 import org.apache.http.HttpStatus;
 import org.apache.http.util.EntityUtils;
@@ -24,6 +25,7 @@
 /**
  * Created by yihui on 2017/6/29.
  */
+@Slf4j
 @Getter
 @Setter
 @NoArgsConstructor
@@ -67,8 +69,23 @@ public DefaultAbstractCrawlJob(int depth) {
      * Fetch the page
      */
     void doFetchPage() throws Exception {
+        if (log.isDebugEnabled()) {
+            log.debug("start crawl url: {}", crawlMeta.getUrl());
+        }
+
+
+        long start = System.currentTimeMillis();
         HttpResponse response = HttpUtils.request(this.crawlMeta, httpConf);
         String res = EntityUtils.toString(response.getEntity(), httpConf.getCode());
+        long end = System.currentTimeMillis();
+        if (log.isDebugEnabled()) {
+            log.debug("crawl url:{} response code: {} cost time: {} ms\n",
+                    this.crawlMeta.getUrl(),
+                    response.getStatusLine().getStatusCode(),
+                    end - start);
+        }
+
+
         if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { // request failed
             this.crawlResult = new CrawlResult();
             this.crawlResult.setStatus(response.getStatusLine().getStatusCode(), response.getStatusLine().getReasonPhrase());
diff --git a/core/src/test/java/com/quick/hui/crawler/core/test/QueueCrawlerTest.java b/core/src/test/java/com/quick/hui/crawler/core/test/QueueCrawlerTest.java
index 4bdcf2f..7991388 100644
--- a/core/src/test/java/com/quick/hui/crawler/core/test/QueueCrawlerTest.java
+++ b/core/src/test/java/com/quick/hui/crawler/core/test/QueueCrawlerTest.java
@@ -22,7 +22,7 @@ public void beforeRun() {
 
         @Override
         protected void visit(CrawlResult crawlResult) {
-            System.out.println(Thread.currentThread().getName() + "___" + crawlMeta.getCurrentDepth() + "___" + crawlResult.getUrl());
+//            System.out.println(Thread.currentThread().getName() + "___" + crawlMeta.getCurrentDepth() + "___" + crawlResult.getUrl());
         }
     }
 
diff --git a/core/src/test/java/com/quick/hui/crawler/core/test/base/RegexTest.java b/core/src/test/java/com/quick/hui/crawler/core/test/base/RegexTest.java
index 48e0ee9..c7bd17e 100644
--- a/core/src/test/java/com/quick/hui/crawler/core/test/base/RegexTest.java
+++ b/core/src/test/java/com/quick/hui/crawler/core/test/base/RegexTest.java
@@ -1,5 +1,6 @@
 package com.quick.hui.crawler.core.test.base;
 
+import lombok.extern.slf4j.Slf4j;
 import org.junit.Test;
 
 import java.util.regex.Matcher;
@@ -8,6 +9,7 @@
 /**
  * Created by yihui on 2017/6/29.
  */
+@Slf4j
 public class RegexTest {
 
     @Test
@@ -24,7 +26,8 @@ public void testRegex() {
         Pattern patter = Pattern.compile(regex);
         for(String url: urls) {
             Matcher matcher = patter.matcher(url);
-            System.out.println(url + " " + matcher.find());
+//            System.out.println(url + " " + matcher.find());
+            log.debug("url:{} match result:{}", url, matcher.find());
         }
     }
 
diff --git a/core/src/test/resources/logback-test.xml b/core/src/test/resources/logback-test.xml
new file mode 100644
index 0000000..d2f4e25
--- /dev/null
+++ b/core/src/test/resources/logback-test.xml
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+
+    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%-4relative [%thread] %-5level %logger{35} - %msg %n</pattern>
+        </encoder>
+    </appender>
+
+
+    <root level="DEBUG">
+        <appender-ref ref="STDOUT"/>
+    </root>
+
+
+</configuration>
\ No newline at end of file
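
For reference, a minimal sketch of how the instrumented crawler can be exercised, in the style of QueueCrawlerTest above. The Fetcher(int maxDepth) constructor, CrawlMeta#setUrl, and Fetcher#addFeed are assumptions inferred from the surrounding code and tests rather than APIs shown in this patch, the seed URL is a placeholder, and visit(CrawlResult) is assumed to be the only abstract hook of DefaultAbstractCrawlJob:

    import com.quick.hui.crawler.core.entity.CrawlMeta;
    import com.quick.hui.crawler.core.entity.CrawlResult;
    import com.quick.hui.crawler.core.fetcher.Fetcher;
    import com.quick.hui.crawler.core.job.DefaultAbstractCrawlJob;
    import org.junit.Test;

    public class LoggingCrawlerTest {

        public static class LoggingJob extends DefaultAbstractCrawlJob {
            @Override
            protected void visit(CrawlResult crawlResult) {
                // The framework now logs per-URL timing and queue activity itself,
                // so the business callback needs no System.out tracing.
            }
        }

        @Test
        public void testCrawl() throws Exception {
            Fetcher fetcher = new Fetcher(2);         // assumed: crawl at most 2 levels deep

            CrawlMeta seed = new CrawlMeta();
            seed.setUrl("http://zbang.online:8080/"); // placeholder seed URL
            fetcher.addFeed(seed);                    // assumed seed-registration method

            // start() now fails fast with IllegalArgumentException when no seed
            // was added, and logs the total URL count and elapsed time when done.
            fetcher.start(LoggingJob.class);
        }
    }

With the root logger at DEBUG as in logback-test.xml, the "put into queue!" and "crawl url" traces appear on the console; at INFO only the "start crawl" and "crawl over!" summary lines remain.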