From f900c554e734208a4590430e57eaf9e1465ba805 Mon Sep 17 00:00:00 2001
From: yihui
Date: Thu, 29 Jun 2017 23:36:02 +0800
Subject: [PATCH] v0.003 Implement depth crawling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                          |  11 +-
 .../hui/crawler/core/entity/CrawlMeta.java         |  42 ++++-
 .../hui/crawler/core/entity/CrawlResult.java       |   2 +-
 .../hui/crawler/core/job/AbstractJob.java          |  10 ++
 .../core/job/DefaultAbstractCrawlJob.java          | 157 ++++++++++++++++++
 .../hui/crawler/core/job/SimpleCrawlJob.java       |  66 ++------
 .../hui/crawler/core/storage/IStorage.java         |  26 +++
 .../crawler/core/storage/StorageWrapper.java       |  88 ++++++++++
 .../crawler/core/storage/ram/RamStorage.java       |  31 ++++
 .../hui/crawler/core/utils/HttpUtils.java          |  12 +-
 .../crawler/core/test/BaseCrawlerTest.java         |  71 ++++++++
 .../hui/crawler/core/test/JsoupTest.java           |  33 ++++
 .../hui/crawler/core/test/base/LockTest.java       |  58 +++++++
 .../hui/crawler/core/test/base/RegexTest.java      |  31 ++++
 14 files changed, 575 insertions(+), 63 deletions(-)
 create mode 100644 core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java
 create mode 100644 core/src/main/java/com/quick/hui/crawler/core/storage/IStorage.java
 create mode 100644 core/src/main/java/com/quick/hui/crawler/core/storage/StorageWrapper.java
 create mode 100644 core/src/main/java/com/quick/hui/crawler/core/storage/ram/RamStorage.java
 create mode 100644 core/src/test/java/com/quick/hui/crawler/core/test/JsoupTest.java
 create mode 100644 core/src/test/java/com/quick/hui/crawler/core/test/base/LockTest.java
 create mode 100644 core/src/test/java/com/quick/hui/crawler/core/test/base/RegexTest.java

diff --git a/README.md b/README.md
index 9fb3273..9353bdc 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,15 @@
 
 > Implements the simplest, most basic crawler; just about usable
 
-### 2. [v.0.002](https://github.com/liuyueyi/quick-crawler/releases/tag/v0.002)
+### 2. [v0.002](https://github.com/liuyueyi/quick-crawler/releases/tag/v0.002)
 
 > Replace the JDK HTTP request with HttpClient; add HTTP parameter configuration
+
+
+### 3. [v0.003](https://github.com/liuyueyi/quick-crawler/releases/tag/v0.003)
+
+> Implement depth crawling of web pages
+
+- Support positive and negative link filtering
+- Keep crawl records in memory for deduplication
+- Provide a post-crawl callback for result handling
\ No newline at end of file
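
A minimal usage sketch of the three v0.003 features above, mirroring the
testDepthFetch case added further down in this patch (the entry URL, regex,
and thread name are just that test's values, not a required convention):

    CrawlMeta meta = new CrawlMeta();
    meta.setUrl("http://chengyu.911cha.com/zishu_3_p1.html");
    // positive filter: only follow pagination links of the same listing
    meta.addPositiveRegex("http://chengyu.911cha.com/zishu_3_p([0-9]+).html");

    // depth 1: the entry page plus every page it links to
    SimpleCrawlJob job = new SimpleCrawlJob(1);
    job.setCrawlMeta(meta);

    Thread thread = new Thread(job, "crawlerDepth-test");
    thread.start();
    thread.join();

    // the visit callback collected one CrawlResult per deduplicated page
    job.getCrawlResults().forEach(r -> System.out.println(r.getUrl()));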
diff --git a/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java
index 1e2d2ba..5dd066e 100644
--- a/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java
+++ b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java
@@ -1,17 +1,18 @@
 package com.quick.hui.crawler.core.entity;
 
-import lombok.Getter;
-import lombok.Setter;
-import lombok.ToString;
+import lombok.*;
 
 import java.util.HashSet;
 import java.util.Set;
+import java.util.regex.Pattern;
 
 
 /**
  * Created by yihui on 2017/6/27.
  */
 @ToString
+@NoArgsConstructor
+@AllArgsConstructor
 public class CrawlMeta {
 
     /**
      * Rules for extracting specific pieces of content; one page may yield
      * several different pieces, so the rules are kept in a set
      */
     @Setter
-    private Set<String> selectorRules;
+    @Getter
+    private Set<String> selectorRules = new HashSet<>();
+
+
+    /**
+     * Positive filter rules
+     */
+    @Setter
+    @Getter
+    private Set<Pattern> positiveRegex = new HashSet<>();
+
+
+    /**
+     * Negative filter rules
+     */
+    @Setter
+    @Getter
+    private Set<Pattern> negativeRegex = new HashSet<>();
 
-    public Set<String> getSelectorRules() {
-        return selectorRules != null ? selectorRules : new HashSet<>();
+    public Set<String> addSelectorRule(String rule) {
+        this.selectorRules.add(rule);
+        return selectorRules;
     }
+
+
+    public Set<Pattern> addPositiveRegex(String regex) {
+        this.positiveRegex.add(Pattern.compile(regex));
+        return this.positiveRegex;
+    }
+
+
+    public Set<Pattern> addNegativeRegex(String regex) {
+        this.negativeRegex.add(Pattern.compile(regex));
+        return this.negativeRegex;
+    }
 }
diff --git a/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlResult.java b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlResult.java
index f9572d4..d928b8a 100644
--- a/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlResult.java
+++ b/core/src/main/java/com/quick/hui/crawler/core/entity/CrawlResult.java
@@ -14,7 +14,7 @@
  */
 @Getter
 @Setter
-@ToString(callSuper = true)
+@ToString
 public class CrawlResult {
diff --git a/core/src/main/java/com/quick/hui/crawler/core/job/AbstractJob.java b/core/src/main/java/com/quick/hui/crawler/core/job/AbstractJob.java
index 6b7d83b..1bedfa1 100644
--- a/core/src/main/java/com/quick/hui/crawler/core/job/AbstractJob.java
+++ b/core/src/main/java/com/quick/hui/crawler/core/job/AbstractJob.java
@@ -1,5 +1,7 @@
 package com.quick.hui.crawler.core.job;
 
+import com.quick.hui.crawler.core.entity.CrawlResult;
+
 /**
  * Created by yihui on 2017/6/27.
  */
@@ -12,6 +14,14 @@
     public void afterRun() {
     }
 
 
+    /**
+     * Callback invoked after a page has been parsed
+     *
+     * @param crawlResult the result of crawling one page
+     */
+    protected abstract void visit(CrawlResult crawlResult);
+
+
     @Override
     public void run() {
         this.beforeRun();
diff --git a/core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java b/core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java
new file mode 100644
index 0000000..7dbec66
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java
@@ -0,0 +1,157 @@
+package com.quick.hui.crawler.core.job;
+
+import com.quick.hui.crawler.core.entity.CrawlHttpConf;
+import com.quick.hui.crawler.core.entity.CrawlMeta;
+import com.quick.hui.crawler.core.entity.CrawlResult;
+import com.quick.hui.crawler.core.storage.StorageWrapper;
+import com.quick.hui.crawler.core.utils.HttpUtils;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpStatus;
+import org.apache.http.util.EntityUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Created by yihui on 2017/6/29.
+ */
+@Getter
+@Setter
+@NoArgsConstructor
+public abstract class DefaultAbstractCrawlJob extends AbstractJob {
+    /**
+     * Crawl configuration
+     */
+    private CrawlMeta crawlMeta;
+
+
+    /**
+     * HTTP configuration
+     */
+    private CrawlHttpConf httpConf = new CrawlHttpConf();
+
+
+    /**
+     * Crawl depth; defaults to 0, i.e. only the current page is crawled
+     */
+    protected int depth = 0;
+
+
+    public DefaultAbstractCrawlJob(int depth) {
+        this.depth = depth;
+    }
+
+
+    /**
+     * Fetch the entry page
+     */
+    public void doFetchPage() throws Exception {
+        doFetchNextPage(0, this.crawlMeta.getUrl());
+    }
+
+
+    // fixme not thread safe
+    private void doFetchNextPage(int currentDepth, String url) throws Exception {
+        // skip URLs that were crawled already; otherwise the URL is locked and we proceed
+        if (StorageWrapper.getInstance().ifUrlFetched(url)) {
+            return;
+        }
+
+        CrawlResult result = null;
+        try {
+            CrawlMeta subMeta = new CrawlMeta(url, this.crawlMeta.getSelectorRules(), this.crawlMeta.getPositiveRegex(), this.crawlMeta.getNegativeRegex());
+            HttpResponse response = HttpUtils.request(subMeta, httpConf);
+            String res = EntityUtils.toString(response.getEntity());
+            if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { // request failed
+                result = new CrawlResult();
+                result.setStatus(response.getStatusLine().getStatusCode(), response.getStatusLine().getReasonPhrase());
+                result.setUrl(url);
+                this.visit(result);
+                return;
+            }
+
+
+            // parse the page
+            result = doParse(res, subMeta);
+        } finally {
+            // add a crawl record and release the lock, even if fetching failed
+            StorageWrapper.getInstance().addFetchRecord(url, result);
+        }
+
+        // hand the parsed page to the user's callback
+        this.visit(result);
+
+
+        // stop once the maximum depth is reached
+        if (currentDepth >= depth) {
+            return;
+        }
+
+
+        Elements elements = result.getHtmlDoc().select("a[href]");
+        String src;
+        for (Element element : elements) {
+            // make sure relative links are resolved to absolute ones
+            src = element.attr("abs:href");
+            if (matchRegex(src)) {
+                doFetchNextPage(currentDepth + 1, src);
+            }
+        }
+    }
+
+
+    private CrawlResult doParse(String html, CrawlMeta meta) {
+        Document doc = Jsoup.parse(html, meta.getUrl());
+
+        Map<String, List<String>> map = new HashMap<>(meta.getSelectorRules().size());
+        for (String rule : meta.getSelectorRules()) {
+            List<String> list = new ArrayList<>();
+            for (Element element : doc.select(rule)) {
+                list.add(element.text());
+            }
+
+            map.put(rule, list);
+        }
+
+
+        CrawlResult result = new CrawlResult();
+        result.setHtmlDoc(doc);
+        result.setUrl(meta.getUrl());
+        result.setResult(map);
+        result.setStatus(CrawlResult.SUCCESS);
+        return result;
+    }
+
+
+    private boolean matchRegex(String url) {
+        Matcher matcher;
+        for (Pattern pattern : crawlMeta.getPositiveRegex()) {
+            matcher = pattern.matcher(url);
+            if (matcher.find()) {
+                return true;
+            }
+        }
+
+
+        for (Pattern pattern : crawlMeta.getNegativeRegex()) {
+            matcher = pattern.matcher(url);
+            if (matcher.find()) {
+                return false;
+            }
+        }
+
+
+        return crawlMeta.getPositiveRegex().size() == 0;
+    }
+}
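
Note the precedence matchRegex implements: a positive hit accepts the URL
before the negative rules are ever consulted, a negative hit then rejects
it, and when no positive rules are configured every URL that clears the
negative list is accepted. A hedged illustration (the example.com rules and
URLs are made up):

    // meta.addPositiveRegex("http://example.com/list_p[0-9]+\\.html");
    // meta.addNegativeRegex(".*logout.*");
    //
    // "http://example.com/list_p2.html"  -> true  (positive hit)
    // "http://example.com/logout"        -> false (negative hit, no positive hit)
    // "http://example.com/detail/42.htm" -> false (positive rules exist, none hit)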
diff --git a/core/src/main/java/com/quick/hui/crawler/core/job/SimpleCrawlJob.java b/core/src/main/java/com/quick/hui/crawler/core/job/SimpleCrawlJob.java
index 0bbe8cf..3163364 100644
--- a/core/src/main/java/com/quick/hui/crawler/core/job/SimpleCrawlJob.java
+++ b/core/src/main/java/com/quick/hui/crawler/core/job/SimpleCrawlJob.java
@@ -1,21 +1,12 @@
 package com.quick.hui.crawler.core.job;
 
-import com.quick.hui.crawler.core.entity.CrawlHttpConf;
-import com.quick.hui.crawler.core.entity.CrawlMeta;
 import com.quick.hui.crawler.core.entity.CrawlResult;
-import com.quick.hui.crawler.core.utils.HttpUtils;
 import lombok.Getter;
+import lombok.NoArgsConstructor;
 import lombok.Setter;
-import org.apache.http.HttpResponse;
-import org.apache.http.util.EntityUtils;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
 
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 
 /**
  * The simplest crawl job
@@ -24,61 +15,38 @@
  */
 @Getter
 @Setter
-public class SimpleCrawlJob extends AbstractJob {
+@NoArgsConstructor
+public class SimpleCrawlJob extends DefaultAbstractCrawlJob {
 
     /**
-     * Crawl configuration
+     * The crawl result
      */
-    private CrawlMeta crawlMeta;
+    private CrawlResult crawlResult;
 
 
     /**
-     * HTTP configuration
+     * Results of a batch crawl
      */
-    private CrawlHttpConf httpConf = new CrawlHttpConf();
-
+    private List<CrawlResult> crawlResults = new ArrayList<>();
 
-    /**
-     * The crawl result
-     */
-    private CrawlResult crawlResult = new CrawlResult();
 
-    /**
-     * Fetch the page
-     */
-    public void doFetchPage() throws Exception {
-        HttpResponse response = HttpUtils.request(crawlMeta, httpConf);
-        String res = EntityUtils.toString(response.getEntity());
-        if (response.getStatusLine().getStatusCode() == 200) { // request succeeded
-            doParse(res);
-        } else {
-            this.crawlResult = new CrawlResult();
-            this.crawlResult.setStatus(response.getStatusLine().getStatusCode(), response.getStatusLine().getReasonPhrase());
-            this.crawlResult.setUrl(crawlMeta.getUrl());
-        }
+    public SimpleCrawlJob(int depth) {
+        super(depth);
     }
 
+    @Override
+    protected void visit(CrawlResult crawlResult) {
+        crawlResults.add(crawlResult);
+    }
 
-    private void doParse(String html) {
-        Document doc = Jsoup.parse(html);
-
-        Map<String, List<String>> map = new HashMap<>(crawlMeta.getSelectorRules().size());
-        for (String rule : crawlMeta.getSelectorRules()) {
-            List<String> list = new ArrayList<>();
-            for (Element element : doc.select(rule)) {
-                list.add(element.text());
-            }
-            map.put(rule, list);
+    public CrawlResult getCrawlResult() {
+        if (crawlResults.size() == 0) {
+            return null;
         }
-
-        this.crawlResult = new CrawlResult();
-        this.crawlResult.setHtmlDoc(doc);
-        this.crawlResult.setUrl(crawlMeta.getUrl());
-        this.crawlResult.setResult(map);
-        this.crawlResult.setStatus(CrawlResult.SUCCESS);
+        return crawlResults.get(0);
     }
 }
diff --git a/core/src/main/java/com/quick/hui/crawler/core/storage/IStorage.java b/core/src/main/java/com/quick/hui/crawler/core/storage/IStorage.java
new file mode 100644
index 0000000..ed7e51a
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/storage/IStorage.java
@@ -0,0 +1,26 @@
+package com.quick.hui.crawler.core.storage;
+
+import com.quick.hui.crawler.core.entity.CrawlResult;
+
+/**
+ * Created by yihui on 2017/6/29.
+ */
+public interface IStorage {
+
+    /**
+     * Write the record if the crawled URL is not in storage yet; otherwise ignore it
+     *
+     * @param url the crawled URL
+     * @return true if the record was written, i.e. the URL was not seen before;
+     *         false if a record already existed
+     */
+    boolean putIfNotExist(String url, CrawlResult result);
+
+
+    /**
+     * Check whether a record for the URL exists
+     *
+     * @param url the crawled URL
+     * @return true if the URL has been recorded
+     */
+    boolean contains(String url);
+
+}
diff --git a/core/src/main/java/com/quick/hui/crawler/core/storage/StorageWrapper.java b/core/src/main/java/com/quick/hui/crawler/core/storage/StorageWrapper.java
new file mode 100644
index 0000000..2876ea8
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/storage/StorageWrapper.java
@@ -0,0 +1,88 @@
+package com.quick.hui.crawler.core.storage;
+
+import com.quick.hui.crawler.core.entity.CrawlResult;
+import com.quick.hui.crawler.core.storage.ram.RamStorage;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+/**
+ * Created by yihui on 2017/6/29.
+ */
+public class StorageWrapper {
+
+    private static StorageWrapper instance = new StorageWrapper();
+
+
+    private IStorage storage;
+
+    private Map<String, Lock> lockMap = new ConcurrentHashMap<>();
+
+    public static StorageWrapper getInstance() {
+        return instance;
+    }
+
+
+    private StorageWrapper() {
+        storage = new RamStorage();
+    }
+
+
+    /**
+     * Check whether the URL has been crawled; if so return true,
+     * otherwise return false and keep the URL locked for the caller
+     *
+     * @param url the URL about to be crawled
+     * @return true if the URL was crawled before
+     */
+    public boolean ifUrlFetched(String url) {
+        if (storage.contains(url)) {
+            return true;
+        }
+
+        synchronized (this) {
+            if (!lockMap.containsKey(url)) {
+                // create a lock for a URL seen for the first time
+                lockMap.put(url, new ReentrantLock());
+            }
+        }
+
+        this.lock(url);
+
+        if (storage.contains(url)) {
+            // another thread finished this URL while we waited for the lock
+            this.unlock(url);
+            return true;
+        }
+//        System.out.println(Thread.currentThread() + " lock url: " + url);
+        return false;
+    }
+
+
+    /**
+     * Add a crawl record after the page has been fetched, and release the lock
+     *
+     * @param url         the crawled URL
+     * @param crawlResult the crawl result; may be null if fetching failed
+     */
+    public void addFetchRecord(String url, CrawlResult crawlResult) {
+        try {
+            if (crawlResult != null) {
+                storage.putIfNotExist(url, crawlResult);
+            }
+        } catch (Exception e) {
+            System.out.println(Thread.currentThread().getName() + " result: " + url + " e: " + e);
+        } finally {
+            // always release the lock taken in ifUrlFetched, even on failure
+            this.unlock(url);
+        }
+    }
+
+
+    private void lock(String url) {
+        lockMap.get(url).lock();
+    }
+
+
+    private void unlock(String url) {
+        lockMap.get(url).unlock();
+    }
+}
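
On Java 8 the synchronized block above, which only guards creation of the
per-URL lock, could also be collapsed into ConcurrentHashMap.computeIfAbsent.
A possible simplification rather than what this patch ships:

    private void lock(String url) {
        // atomically create the per-URL lock on first use, then acquire it
        lockMap.computeIfAbsent(url, k -> new ReentrantLock()).lock();
    }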
diff --git a/core/src/main/java/com/quick/hui/crawler/core/storage/ram/RamStorage.java b/core/src/main/java/com/quick/hui/crawler/core/storage/ram/RamStorage.java
new file mode 100644
index 0000000..c377d43
--- /dev/null
+++ b/core/src/main/java/com/quick/hui/crawler/core/storage/ram/RamStorage.java
@@ -0,0 +1,31 @@
+package com.quick.hui.crawler.core.storage.ram;
+
+import com.quick.hui.crawler.core.entity.CrawlResult;
+import com.quick.hui.crawler.core.storage.IStorage;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Created by yihui on 2017/6/29.
+ */
+public class RamStorage implements IStorage {
+
+    private Map<String, CrawlResult> map = new ConcurrentHashMap<>();
+
+
+    @Override
+    public boolean putIfNotExist(String url, CrawlResult result) {
+        if (map.containsKey(url)) {
+            return false;
+        }
+
+        map.put(url, result);
+        return true;
+    }
+
+    @Override
+    public boolean contains(String url) {
+        return map.containsKey(url);
+    }
+}
diff --git a/core/src/main/java/com/quick/hui/crawler/core/utils/HttpUtils.java b/core/src/main/java/com/quick/hui/crawler/core/utils/HttpUtils.java
index 124e4ab..f140a9d 100644
--- a/core/src/main/java/com/quick/hui/crawler/core/utils/HttpUtils.java
+++ b/core/src/main/java/com/quick/hui/crawler/core/utils/HttpUtils.java
@@ -25,16 +25,16 @@ public class HttpUtils {
     public static HttpResponse request(CrawlMeta crawlMeta, CrawlHttpConf httpConf) throws Exception {
         switch (httpConf.getMethod()) {
             case GET:
-                return doGet(crawlMeta, httpConf);
+                return doGet(crawlMeta.getUrl(), httpConf);
             case POST:
-                return doPost(crawlMeta, httpConf);
+                return doPost(crawlMeta.getUrl(), httpConf);
             default:
                 return null;
         }
     }
 
 
-    private static HttpResponse doGet(CrawlMeta crawlMeta, CrawlHttpConf httpConf) throws Exception {
+    private static HttpResponse doGet(String url, CrawlHttpConf httpConf) throws Exception {
 //        HttpClient httpClient = HttpClients.createDefault();
         SSLContextBuilder builder = new SSLContextBuilder();
         // trust all certificates; skip identity verification
         builder.loadTrustMaterial(null, (x509Certificates, s) -> true);
@@ -42,7 +42,7 @@ private static HttpResponse doGet(CrawlMeta crawlMeta, CrawlHttpConf httpConf) throws Exception {
         HttpClient httpClient = HttpClientBuilder.create().setSslcontext(builder.build()).build();
 
         // assemble the request parameters
-        StringBuilder param = new StringBuilder(crawlMeta.getUrl()).append("?");
+        StringBuilder param = new StringBuilder(url).append("?");
         for (Map.Entry<String, String> entry : httpConf.getRequestParams().entrySet()) {
             param.append(entry.getKey())
                     .append("=")
@@ -63,14 +63,14 @@
     }
 
 
-    private static HttpResponse doPost(CrawlMeta crawlMeta, CrawlHttpConf httpConf) throws Exception {
+    private static HttpResponse doPost(String url, CrawlHttpConf httpConf) throws Exception {
 //        HttpClient httpClient = HttpClients.createDefault();
         SSLContextBuilder builder = new SSLContextBuilder();
         // trust all certificates; skip identity verification
         builder.loadTrustMaterial(null, (x509Certificates, s) -> true);
         HttpClient httpClient = HttpClientBuilder.create().setSslcontext(builder.build()).build();
 
-        HttpPost httpPost = new HttpPost(crawlMeta.getUrl());
+        HttpPost httpPost = new HttpPost(url);
 
 
         // build a NameValuePair list holding the parameters to send
diff --git a/core/src/test/java/com/quick/hui/crawler/core/test/BaseCrawlerTest.java b/core/src/test/java/com/quick/hui/crawler/core/test/BaseCrawlerTest.java
index 23dfa82..c54c879 100644
--- a/core/src/test/java/com/quick/hui/crawler/core/test/BaseCrawlerTest.java
+++ b/core/src/test/java/com/quick/hui/crawler/core/test/BaseCrawlerTest.java
@@ -2,10 +2,12 @@
 
 import com.quick.hui.crawler.core.entity.CrawlMeta;
 import com.quick.hui.crawler.core.entity.CrawlResult;
+import com.quick.hui.crawler.core.job.DefaultAbstractCrawlJob;
 import com.quick.hui.crawler.core.job.SimpleCrawlJob;
 import org.junit.Test;
 
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
 /**
@@ -43,4 +45,73 @@
         System.out.println(result);
     }
 
+
+    /**
+     * Depth crawl
+     *
+     * @throws InterruptedException
+     */
+    @Test
+    public void testDepthFetch() throws InterruptedException {
+        String url = "http://chengyu.911cha.com/zishu_3_p1.html";
+        CrawlMeta crawlMeta = new CrawlMeta();
+        crawlMeta.setUrl(url);
+        crawlMeta.addPositiveRegex("http://chengyu.911cha.com/zishu_3_p([0-9]+).html");
+
+
+        SimpleCrawlJob job = new SimpleCrawlJob(1);
+        job.setCrawlMeta(crawlMeta);
+        Thread thread = new Thread(job, "crawlerDepth-test");
+        thread.start();
+
+
+        thread.join();
+        List<CrawlResult> result = job.getCrawlResults();
+        System.out.println(result);
+    }
+
+
+
+    @Test
+    public void testSelfCrawlFetch() throws InterruptedException {
+        String url = "http://chengyu.t086.com/gushi/1.htm";
+        CrawlMeta crawlMeta = new CrawlMeta();
+        crawlMeta.setUrl(url);
+        crawlMeta.addPositiveRegex("http://chengyu.t086.com/gushi/[0-9]+\\.htm$");
+
+
+        DefaultAbstractCrawlJob job = new DefaultAbstractCrawlJob(1) {
+            @Override
+            protected void visit(CrawlResult crawlResult) {
+                System.out.println("job1 >>> " + crawlResult.getUrl());
+            }
+        };
+        job.setCrawlMeta(crawlMeta);
+
+
+
+        String url2 = "http://chengyu.t086.com/gushi/2.htm";
+        CrawlMeta crawlMeta2 = new CrawlMeta();
+        crawlMeta2.setUrl(url2);
+        crawlMeta2.addPositiveRegex("http://chengyu.t086.com/gushi/[0-9]+\\.htm$");
+        DefaultAbstractCrawlJob job2 = new DefaultAbstractCrawlJob(1) {
+            @Override
+            protected void visit(CrawlResult crawlResult) {
+                System.out.println("job2 >>> " + crawlResult.getUrl());
+            }
+        };
+        job2.setCrawlMeta(crawlMeta2);
+
+
+
+        Thread thread = new Thread(job, "crawlerDepth-test");
+        Thread thread2 = new Thread(job2, "crawlerDepth-test2");
+        thread.start();
+        thread2.start();
+
+
+        thread.join();
+        thread2.join();
+    }
+
+
 }
diff --git a/core/src/test/java/com/quick/hui/crawler/core/test/JsoupTest.java b/core/src/test/java/com/quick/hui/crawler/core/test/JsoupTest.java
new file mode 100644
index 0000000..0ba0810
--- /dev/null
+++ b/core/src/test/java/com/quick/hui/crawler/core/test/JsoupTest.java
@@ -0,0 +1,33 @@
+package com.quick.hui.crawler.core.test;
+
+import org.jsoup.Connection;
+import org.jsoup.helper.HttpConnection;
+import org.jsoup.nodes.Document;
+import org.jsoup.select.Elements;
+import org.junit.Test;
+
+import java.io.IOException;
+
+/**
+ * Created by yihui on 2017/6/29.
+ */
+public class JsoupTest {
+
+
+    // fetch every link in the page
+    @Test
+    public void testGetLink() throws IOException {
+        String url = "http://chengyu.911cha.com/zishu_3_p1.html";
+
+        Connection httpConnection = HttpConnection.connect(url)
+                .header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
+                .header("connection", "Keep-Alive")
+                .header("user-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36");
+
+        Document doc = httpConnection.get();
+        Elements links = doc.select("a[href]");
+
+        String str = links.get(181).attr("abs:href");
+        System.out.println(str);
+    }
+}
diff --git a/core/src/test/java/com/quick/hui/crawler/core/test/base/LockTest.java b/core/src/test/java/com/quick/hui/crawler/core/test/base/LockTest.java
new file mode 100644
index 0000000..d05bb37
--- /dev/null
+++ b/core/src/test/java/com/quick/hui/crawler/core/test/base/LockTest.java
@@ -0,0 +1,58 @@
+package com.quick.hui.crawler.core.test.base;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+/**
+ * Created by yihui on 2017/6/29.
+ */
+public class LockTest {
+
+    private static List<Lock> lock = new ArrayList<>();
+    static {
+        lock.add(new ReentrantLock());
+        lock.add(new ReentrantLock());
+    }
+
+    private static int i = 0;
+
+    public static class MyThread extends Thread {
+        private int index;
+
+        public MyThread(String str, int index) {
+            super(str);
+            this.index = index % 2;
+        }
+
+
+        public void run() {
+            lock.get(index).lock();
+            System.out.println(Thread.currentThread().getName() + " acquired the lock");
+            try {
+                i++;
+                Thread.sleep(20);
+            } catch (InterruptedException e) {
+                e.printStackTrace();
+            } finally {
+                System.out.println(Thread.currentThread().getName() + " released the lock");
+                lock.get(index).unlock();
+            }
+        }
+    }
+
+
+    public static void main(String[] args) throws InterruptedException {
+        for (int t = 0; t < 100; t++) {
+            MyThread thread1 = new MyThread("test1+" + t, t);
+            MyThread thread2 = new MyThread("test2+" + t, t);
+            MyThread thread3 = new MyThread("test3+" + t, t);
+
+            thread1.start();
+            thread2.start();
+            thread3.start();
+        }
+    }
+}
diff --git a/core/src/test/java/com/quick/hui/crawler/core/test/base/RegexTest.java b/core/src/test/java/com/quick/hui/crawler/core/test/base/RegexTest.java
new file mode 100644
index 0000000..48e0ee9
--- /dev/null
+++ b/core/src/test/java/com/quick/hui/crawler/core/test/base/RegexTest.java
@@ -0,0 +1,31 @@
+package com.quick.hui.crawler.core.test.base;
+
+import org.junit.Test;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Created by yihui on 2017/6/29.
+ */
+public class RegexTest {
+
+    @Test
+    public void testRegex() {
+
+        String regex = "https://my.oschina.net/u/566591/blog(.*)";
+        String[] urls = new String[]{
+                "https://my.oschina.net/u/566591/blog?search=java",
+                "https://my.oschina.net/u/566592/blog?search=java",
+                "https://my.oschina.net/u/566591/blog/12342"
+        };
+
+
+        Pattern pattern = Pattern.compile(regex);
+        for (String url : urls) {
+            Matcher matcher = pattern.matcher(url);
+            System.out.println(url + " " + matcher.find());
+        }
+    }
+
+}
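
The patterns exercised in RegexTest drop straight into the new filter API.
A hedged sketch (the URLs come from RegexTest; the negative rule is made up
for illustration). Since matchRegex consults negative rules only when no
positive rule matched, a pure blocklist works best with no positive rules
configured at all:

    CrawlMeta meta = new CrawlMeta();
    meta.setUrl("https://my.oschina.net/u/566591/blog");
    // no positive rules: follow every link that no negative rule rejects
    // hypothetical exclusion: skip the other user's pages
    meta.addNegativeRegex("https://my.oschina.net/u/566592/(.*)");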