-
Notifications
You must be signed in to change notification settings - Fork 1k
5 使用代理
Lewis Zou edited this page Feb 18, 2021
·
5 revisions
此接口(IProxySupplier)用于返回可用代理。一般代理供应商都会提供用于返回代理信息的 API。具体实现可参考源码中的 KuaidailiProxySupplier 或 FiddlerProxySupplier。
/// <summary>
/// Sample spider that starts from the first cnblogs news listing page, parses
/// each news block into a <see cref="CnblogsEntry"/> and hands the results to
/// the default storage.
/// </summary>
public class ProxySpider : Spider
{
    /// <summary>
    /// Creates the spider; all arguments are forwarded unchanged to the base <see cref="Spider"/>.
    /// </summary>
    public ProxySpider(
        IOptions<SpiderOptions> options,
        DependenceServices services,
        ILogger<Spider> logger)
        : base(options, services, logger)
    {
    }

    /// <summary>
    /// Registers the data flows (parser first, then storage) and enqueues the seed request.
    /// </summary>
    /// <param name="stoppingToken">Cancellation token supplied by the caller; not used by this sample.</param>
    protected override async Task InitializeAsync(CancellationToken stoppingToken = default)
    {
        AddDataFlow(new DataParser<CnblogsEntry>());
        AddDataFlow(GetDefaultStorage());

        // The "网站" entry is read back by CnblogsEntry.WebSite through an Environment selector.
        var seedProperties = new Dictionary<string, object> { ["网站"] = "博客园" };
        var seedRequest = new Request("https://news.cnblogs.com/n/page/1", seedProperties);
        await AddRequestsAsync(seedRequest);
    }

    /// <summary>
    /// Builds the spider identity from a freshly created object id and the fixed name "博客园".
    /// </summary>
    protected override SpiderId GenerateSpiderId() =>
        new(ObjectId.CreateId().ToString(), "博客园");

    /// <summary>
    /// One news item from a cnblogs listing page, selected per <c>news_block</c> div.
    /// </summary>
    [Schema("cnblogs", "news")]
    [EntitySelector(Expression = ".//div[@class='news_block']", Type = SelectorType.XPath)]
    [GlobalValueSelector(Expression = ".//a[@class='current']", Name = "类别", Type = SelectorType.XPath)]
    [GlobalValueSelector(Expression = "//title", Name = "Title", Type = SelectorType.XPath)]
    [FollowRequestSelector(Expressions = new[] {"//div[@class='pager']"})]
    class CnblogsEntry : EntityBase<CnblogsEntry>
    {
        /// <summary>
        /// Declares an index on Title and a composite index on (WebSite, Guid).
        /// </summary>
        protected override void Configure()
        {
            HasIndex(x => x.Title);

            // NOTE(review): the trailing `true` presumably marks the composite index as unique — confirm against EntityBase.HasIndex.
            HasIndex(x => new {x.WebSite, x.Guid}, true);
        }

        /// <summary>Integer identifier of the entry.</summary>
        public int Id { get; set; }

        /// <summary>Environment value "类别", supplied by the page-level selector of the same name.</summary>
        [Required]
        [StringLength(200)]
        [ValueSelector(Expression = "类别", Type = SelectorType.Environment)]
        public string Category { get; set; }

        /// <summary>Environment value "网站", supplied through the seed request's properties.</summary>
        [Required]
        [StringLength(200)]
        [ValueSelector(Expression = "网站", Type = SelectorType.Environment)]
        public string WebSite { get; set; }

        /// <summary>Environment value "Title" (page title) with the " - 博客园" suffix replaced by an empty string.</summary>
        [StringLength(200)]
        [ValueSelector(Expression = "Title", Type = SelectorType.Environment)]
        [ReplaceFormatter(NewValue = "", OldValue = " - 博客园")]
        public string Title { get; set; }

        /// <summary>Environment value "GUID" (presumably generated by the framework — verify).</summary>
        [StringLength(40)]
        [ValueSelector(Expression = "GUID", Type = SelectorType.Environment)]
        public string Guid { get; set; }

        /// <summary>Content of the headline link inside the news block.</summary>
        [ValueSelector(Expression = ".//h2[@class='news_entry']/a")]
        public string News { get; set; }

        /// <summary>The <c>href</c> attribute of the headline link.</summary>
        [ValueSelector(Expression = ".//h2[@class='news_entry']/a/@href")]
        public string Url { get; set; }

        /// <summary>Content of the summary div, passed through the trim formatter.</summary>
        [ValueSelector(Expression = ".//div[@class='entry_summary']")]
        [TrimFormatter]
        public string PlainText { get; set; }

        /// <summary>Environment value "DATETIME" (presumably the time of extraction — verify).</summary>
        [ValueSelector(Expression = "DATETIME", Type = SelectorType.Environment)]
        public DateTime CreationTime { get; set; }
    }
}
// Create the default builder for ProxySpider with Speed set to 1.
var builder = Builder.CreateDefaultBuilder<ProxySpider>(options => options.Speed = 1);
builder.UseDownloader<HttpClientDownloader>();
builder.UseSerilog();

// Open fiddler and make sure the listen port is 8866
builder.UseProxy<FiddlerProxySupplier, DefaultProxyValidator>(
    proxyOptions => proxyOptions.ProxyTestUrl = "http://localhost:8866");
builder.IgnoreServerCertificateError();
builder.UseQueueDistinctBfsScheduler<HashSetDuplicateRemover>();

// Build the host and run it until the spider finishes.
var host = builder.Build();
await host.RunAsync();
- 配置 FiddlerProxySupplier 和默认的代理测试器 DefaultProxyValidator。
- 通过 ProxyTestUrl 配置用于检测代理是否可用的链接,其端口为 Fiddler 的监听端口。
- 开启 Fiddler 作为代理测试
- 运行爬虫结果:
[21:18:31 INF]
_____ _ _ _____ _ _
| __ \ | | | | / ____| (_) | |
| | | | ___ | |_ _ __ ___| |_| (___ _ __ _ __| | ___ _ __
| | | |/ _ \| __| '_ \ / _ \ __|\___ \| '_ \| |/ _` |/ _ \ '__|
| |__| | (_) | |_| | | | __/ |_ ____) | |_) | | (_| | __/ |
|_____/ \___/ \__|_| |_|\___|\__|_____/| .__/|_|\__,_|\___|_| version: 5.0.8.0
| |
|_|
[21:18:31 INF] RequestedQueueCount: 1000
[21:18:31 INF] Depth: 0
[21:18:31 INF] RetriedTimes: 3
[21:18:31 INF] EmptySleepTime: 60
[21:18:31 INF] Speed: 1
[21:18:31 INF] Batch: 4
[21:18:31 INF] RemoveOutboundLinks: False
[21:18:31 INF] StorageType: DotnetSpider.MySql.MySqlEntityStorage, DotnetSpider.MySql
[21:18:31 INF] RefreshProxy: 30
[21:18:31 INF] Agent is starting
[21:18:32 INF] Agent started
[21:18:32 INF] Initialize spider 602e6928e146bd937eaf6d5f, 博客园
[21:18:32 INF] 602e6928e146bd937eaf6d5f DataFlows: DataParser`1 -> MySqlEntityStorage
[21:18:32 INF] 602e6928e146bd937eaf6d5f register topic DotnetSpider_602e6928e146bd937eaf6d5f
[21:18:32 INF] Statistics service starting
[21:18:32 INF] Statistics service started
[21:18:32 INF] Proxy http://localhost:8866/ is available
[21:18:32 INF] Find new 1 proxies
[21:18:34 INF] 602e6928e146bd937eaf6d5f download https://news.cnblogs.com/n/page/1, swErJw== completed
[21:18:35 INF] 602e6928e146bd937eaf6d5f download https://news.cnblogs.com/, WLsPow== completed
[21:18:36 INF] 602e6928e146bd937eaf6d5f download https://news.cnblogs.com/n/page/2/, 1blkyQ== completed
[21:18:37 INF] 602e6928e146bd937eaf6d5f total 11, speed: 0.65, success 3, failure 0, left 8
[21:18:37 INF] 602e6928e146bd937eaf6d5f download https://news.cnblogs.com/n/page/3/, gGhfFA== completed
可以看到两条关键信息:
[21:18:32 INF] Proxy http://localhost:8866/ is available
[21:18:32 INF] Find new 1 proxies
我们也可以从 Fiddler 监控中看到代理起作用了
ProxyService 是一个后台服务,每隔 30 秒会调用注册的 IProxySupplier 获取可用代理,若是未注册 IProxySupplier 则此后台服务会直接退出。获取到的代理并不会立即进入代理池,而是需要经过检测,检测的接口是 IProxyValidator,框架有一个默认实现 DefaultProxyValidator,其通过对 HttpClientFactory 的改造,使得测试相同的代理时不会产生额外的端口开销。使用时可以通过配置 ProxyTestUrl 来指定测试地址,比如设定为被采集的地址。
并发控制器把请求推送到消息队列前会判断是否配置使用代理,若要求使用代理,则会从 ProxyPool 中尝试获取一个可用代理(默认阻塞 70 秒尝试获取,用户可以自己设定),若是无可用代理,爬虫则会退出。成功分配到代理后,则会把代理信息添加到请求中,当下载代理器获取到请求后,通过请求中配置的代理信息来确认是否需要使用代理。