v0.006 Logging instrumentation
yihui committed Jul 8, 2017
1 parent a30aaf0 commit 0732630
Showing 9 changed files with 103 additions and 5 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -42,8 +42,14 @@
- Added JobCount to record how many crawl jobs have completed, and use it to decide when the whole task is finished


### 6. [v0.006](https://github.com/liuyueyi/quick-crawler/releases/tag/v0.006)

> Add logging instrumentation
## Related blog posts

- [Writing a Java Crawler by Hand, Part 1: Implementing the Simplest Crawler](http://zbang.online:8080/articles/2017/07/05/1499239054423.html)
- [Writing a Java Crawler by Hand, Part 2: Crawling by Depth](http://zbang.online:8080/articles/2017/07/05/1499239349163.html)
- [Writing a Java Crawler by Hand, Part 3: The Crawl Queue](http://zbang.online:8080/articles/2017/07/07/1499401540323.html)


16 changes: 16 additions & 0 deletions core/pom.xml
@@ -49,6 +49,22 @@
<artifactId>commons-collections</artifactId>
<version>3.2.2</version>
</dependency>



<!-- logging -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.21</version>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.1.7</version>
</dependency>


</dependencies>
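slf4j-api is the facade the code compiles against, and logback-classic is the runtime binding that does the actual writing. The classes changed in this commit obtain their logger through Lombok's @Slf4j; below is a minimal sketch of the equivalent hand-written boilerplate the annotation generates (LoggerDemo is an illustrative name, not a class in this project):

```java
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LoggerDemo {

    // This field is exactly what Lombok's @Slf4j generates on a class,
    // so log.info(...) works the same way in the annotated classes below
    private static final Logger log = LoggerFactory.getLogger(LoggerDemo.class);

    public static void main(String[] args) {
        log.info("slf4j facade, logback backend: {}", "hello");
    }
}
```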


core/src/main/java/com/quick/hui/crawler/core/fetcher/FetchQueue.java
@@ -1,6 +1,7 @@
package com.quick.hui.crawler.core.fetcher;

import com.quick.hui.crawler.core.entity.CrawlMeta;
import lombok.extern.slf4j.Slf4j;

import java.util.Map;
import java.util.Queue;
@@ -13,6 +14,7 @@
* <p>
* Created by yihui on 2017/7/6.
*/
@Slf4j
public class FetchQueue {

public static FetchQueue DEFAULT_INSTANCE = newInstance("default");
@@ -44,7 +46,7 @@ public class FetchQueue {
/**
* Set of every URL crawled so far, used for deduplication
*/
-private Set<String> urls = ConcurrentHashMap.newKeySet();
+public Set<String> urls = ConcurrentHashMap.newKeySet();


private FetchQueue(String tag) {
@@ -57,6 +59,11 @@ public static FetchQueue newInstance(String tag) {
}


public int size() {
return toFetchQueue.size();
}


/**
* Enqueue a URL only when it has not been crawled before; this avoids crawling the same page twice
*
@@ -89,7 +96,6 @@ public CrawlMeta pollSeed() {
public void finishJob(CrawlMeta crawlMeta, int count, int maxDepth) {
if (finishOneJob(crawlMeta, count, maxDepth)) {
isOver = true;
-System.out.println("============ finish crawl! ======");
}
}
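The dedup described above hinges on the urls key set: ConcurrentHashMap.newKeySet() returns a thread-safe Set whose add() reports whether the element was new. The addSeed body itself is collapsed in this diff, so the following is only a plausible sketch of that contract, not the project's actual code:

```java
// Hypothetical sketch -- the real addSeed body is collapsed in this diff.
// urls.add() returns false when the URL is already present, so the
// "seen before?" test and the "mark as seen" write happen in one atomic
// step, with no separate contains() race between worker threads.
public boolean addSeed(CrawlMeta crawlMeta) {
    if (!urls.add(crawlMeta.getUrl())) {
        return false;                  // already queued or crawled: skip it
    }
    toFetchQueue.offer(crawlMeta);     // first sighting: enqueue for fetching
    return true;
}
```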

15 changes: 15 additions & 0 deletions core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java
@@ -3,13 +3,15 @@
import com.quick.hui.crawler.core.entity.CrawlMeta;
import com.quick.hui.crawler.core.job.DefaultAbstractCrawlJob;
import lombok.*;
import lombok.extern.slf4j.Slf4j;

import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;

/**
* Created by yihui on 2017/6/27.
*/
@Slf4j
public class Fetcher {

private int maxDepth;
@@ -58,8 +60,17 @@ private void initExecutor() {


public <T extends DefaultAbstractCrawlJob> void start(Class<T> clz) throws Exception {
long start = System.currentTimeMillis();
CrawlMeta crawlMeta;

if (fetchQueue.size() == 0) {
throw new IllegalArgumentException("please choose one seed to start crawling!");
}


log.info(">>>>>>>>>>>> start crawl <<<<<<<<<<<<");


while (!fetchQueue.isOver) {
crawlMeta = fetchQueue.pollSeed();
if (crawlMeta == null) {
@@ -75,6 +86,10 @@ public <T extends DefaultAbstractCrawlJob> void start(Class<T> clz) throws Exception {

executor.execute(job);
}


long end = System.currentTimeMillis();
log.info(">>>>>>>>>>>> crawl over! total url num: {}, cost: {}ms <<<<<<<<<<<<", fetchQueue.urls.size(), end - start);
}
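start() now brackets the whole crawl with a wall-clock measurement and reports it through the logger instead of System.out. A self-contained sketch of that timing pattern (TimingDemo is an illustrative name, not part of the project):

```java
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TimingDemo {

    private static final Logger log = LoggerFactory.getLogger(TimingDemo.class);

    public static void main(String[] args) throws InterruptedException {
        long start = System.currentTimeMillis();
        Thread.sleep(100); // stand-in for the crawl loop
        long end = System.currentTimeMillis();

        // slf4j fills the {} placeholders only when INFO is enabled,
        // so nothing is formatted when the level is turned off
        log.info("crawl over! cost: {}ms", end - start);
    }
}
```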


ResultFilter.java
@@ -4,6 +4,7 @@
import com.quick.hui.crawler.core.entity.CrawlResult;
import com.quick.hui.crawler.core.fetcher.FetchQueue;
import com.quick.hui.crawler.core.fetcher.JobCount;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

@@ -13,6 +14,7 @@
/**
* Created by yihui on 2017/7/6.
*/
@Slf4j
public class ResultFilter {


@@ -21,6 +23,7 @@ public static void filter(CrawlMeta crawlMeta,
FetchQueue fetchQueue,
int maxDepth) {
int count = 0;
long start = System.currentTimeMillis();
try {
// Parse the links in the returned page and enqueue those that meet the crawl criteria
int currentDepth = crawlMeta.getCurrentDepth();
@@ -49,12 +52,28 @@ public static void filter(CrawlMeta crawlMeta,
crawlMeta.getPositiveRegex(),
crawlMeta.getNegativeRegex());
if (fetchQueue.addSeed(meta)) {

if (log.isDebugEnabled()) {
log.debug("put into queue! parentUrl:{} url: {} depth: {}",
crawlMeta.getUrl(),
src,
currentDepth + 1);
}

count++;
}
}

} finally { // this page is done: increment the finished count for its level
fetchQueue.finishJob(crawlMeta, count, maxDepth);

long end = System.currentTimeMillis();
if (log.isDebugEnabled()) {
log.debug("url {} subUrl counts: {}, filter result cost: {}ms, currentDepth: {} \n\n",
crawlMeta.getUrl(),
count, end - start,
crawlMeta.getCurrentDepth());
}
}

}
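Note the log.isDebugEnabled() guards above. With slf4j's {} placeholders a guard adds nothing for cheap arguments, since formatting is already deferred, but it still pays off when computing an argument is itself costly. A sketch of both cases (GuardDemo and expensiveSnapshot are illustrative names):

```java
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class GuardDemo {

    private static final Logger log = LoggerFactory.getLogger(GuardDemo.class);

    public static void main(String[] args) {
        // Cheap arguments: the placeholder form alone is enough;
        // formatting is skipped when DEBUG is off
        log.debug("url: {} depth: {}", "http://example.com", 1);

        // Expensive arguments: guard so the work is not done at all
        // when DEBUG is off
        if (log.isDebugEnabled()) {
            log.debug("queue snapshot: {}", expensiveSnapshot());
        }
    }

    private static String expensiveSnapshot() {
        return String.join(",", "a", "b", "c"); // stand-in for costly work
    }
}
```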
core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java
@@ -9,6 +9,7 @@
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.util.EntityUtils;
@@ -24,6 +25,7 @@
/**
* Created by yihui on 2017/6/29.
*/
@Slf4j
@Getter
@Setter
@NoArgsConstructor
@@ -67,8 +69,23 @@ public DefaultAbstractCrawlJob(int depth) {
* 执行抓取网页
*/
void doFetchPage() throws Exception {
if (log.isDebugEnabled()) {
log.debug("start crawl url: {}", crawlMeta.getUrl());
}


long start = System.currentTimeMillis();
HttpResponse response = HttpUtils.request(this.crawlMeta, httpConf);
String res = EntityUtils.toString(response.getEntity(), httpConf.getCode());
long end = System.currentTimeMillis();
if (log.isDebugEnabled()) {
log.debug("crawl url:{} response code: {} cost time: {} ms\n",
this.crawlMeta.getUrl(),
response.getStatusLine().getStatusCode(),
end - start);
}


if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { // request failed
this.crawlResult = new CrawlResult();
this.crawlResult.setStatus(response.getStatusLine().getStatusCode(), response.getStatusLine().getReasonPhrase());
@@ -22,7 +22,7 @@ public void beforeRun() {

@Override
protected void visit(CrawlResult crawlResult) {
-System.out.println(Thread.currentThread().getName() + "___" + crawlMeta.getCurrentDepth() + "___" + crawlResult.getUrl());
+// System.out.println(Thread.currentThread().getName() + "___" + crawlMeta.getCurrentDepth() + "___" + crawlResult.getUrl());
}
}

core/src/test/java/com/quick/hui/crawler/core/test/base/RegexTest.java
@@ -1,5 +1,6 @@
package com.quick.hui.crawler.core.test.base;

import lombok.extern.slf4j.Slf4j;
import org.junit.Test;

import java.util.regex.Matcher;
@@ -8,6 +9,7 @@
/**
* Created by yihui on 2017/6/29.
*/
@Slf4j
public class RegexTest {

@Test
@@ -24,7 +26,8 @@ public void testRegex() {
Pattern patter = Pattern.compile(regex);
for(String url: urls) {
Matcher matcher = patter.matcher(url);
-System.out.println(url + " " + matcher.find());
+// System.out.println(url + " " + matcher.find());
+log.debug("url:{} match result:{}", url, matcher.find());
}
}

16 changes: 16 additions & 0 deletions core/src/test/resources/logback-test.xml
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>

<configuration>
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%-4relative [%thread] %-5level %logger{35} - %msg %n</pattern>
</encoder>
</appender>

<logger name="com.quick.hui.crawler" level="DEBUG"/>


<root level="INFO">
<appender-ref ref="STDOUT"/>
</root>
</configuration>
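This test configuration turns the com.quick.hui.crawler logger up to DEBUG while the root stays at INFO; logback resolves a logger's effective level from its nearest configured ancestor by name. A small sketch of the resulting behavior (the org.apache.http.client logger name is just an example of an unconfigured package):

```java
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LevelDemo {
    public static void main(String[] args) {
        // Falls under the configured com.quick.hui.crawler logger -> DEBUG prints
        Logger crawler = LoggerFactory.getLogger("com.quick.hui.crawler.core.fetcher.Fetcher");
        crawler.debug("visible: inherits DEBUG from com.quick.hui.crawler");

        // No more specific logger configured -> inherits the root's INFO level
        Logger other = LoggerFactory.getLogger("org.apache.http.client");
        other.debug("suppressed: DEBUG is below the inherited INFO threshold");
        other.info("visible: INFO passes the root threshold");
    }
}
```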
