# -*- coding:UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import collections
import re
import os
import sys
"""
类说明:下载《笔趣看》网小说: url:http://www.biqukan.com/
Parameters:
target - 《笔趣看》网指定的小说目录地址(string)
Returns:
无
Modify:
2017-05-06
"""
class download(object):

    def __init__(self, target):
        self.__target_url = target
        # Request header with a browser User-Agent so requests look like they come from a regular (mobile) browser
        self.__head = {'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'}

"""
函数说明:获取下载链接
Parameters:
无
Returns:
novel_name + '.txt' - 保存的小说名(string)
numbers - 章节数(int)
download_dict - 保存章节名称和下载链接的字典(dict)
Modify:
2017-05-06
"""
    def get_download_url(self):
        # Matches chapter headings such as "第xxx章" ("弟" covers a common typo on the site)
        charter = re.compile(u'[第弟](.+)章', re.IGNORECASE)
        target_req = request.Request(url=self.__target_url, headers=self.__head)
        target_response = request.urlopen(target_req)
        # The site is served as GBK; ignore characters that cannot be decoded
        target_html = target_response.read().decode('gbk', 'ignore')
        listmain_soup = BeautifulSoup(target_html, 'lxml')
        chapters = listmain_soup.find_all('div', class_='listmain')
        download_soup = BeautifulSoup(str(chapters), 'lxml')
        # The <dt> text looks like "<dt>《书名》...", so cut off the tag prefix and everything after "》"
        novel_name = str(download_soup.dl.dt).split("》")[0][5:]
        # Chapters listed after this heading belong to the main text (正文卷)
        flag_name = "《" + novel_name + "》" + "正文卷"
        download_dict = collections.OrderedDict()
        begin_flag = False
        numbers = 1
        for child in download_soup.dl.children:
            if child != '\n':
                if child.string == u"%s" % flag_name:
                    begin_flag = True
                if begin_flag and child.a is not None:
                    download_url = "http://www.biqukan.com" + child.a.get('href')
                    download_name = child.string
                    names = str(download_name).split('章')
                    name = charter.findall(names[0] + '章')
                    if name:
                        download_dict['第' + str(numbers) + '章 ' + names[1]] = download_url
                        numbers += 1
        # numbers is incremented once past the last chapter, so subtract 1 for the real chapter count
        return novel_name + '.txt', numbers - 1, download_dict
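
    # For illustration, the return values have roughly this shape (all concrete
    # values below are hypothetical, not taken from a real catalogue page):
    #   name          -> '某小说.txt'
    #   numbers       -> 1000
    #   download_dict -> OrderedDict([('第1章 某章节名', 'http://www.biqukan.com/.../....html'), ...])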
"""
函数说明:爬取文章内容
Parameters:
url - 下载连接(string)
Returns:
soup_text - 章节内容(string)
Modify:
2017-05-06
"""
    def Downloader(self, url):
        download_req = request.Request(url=url, headers=self.__head)
        download_response = request.urlopen(download_req)
        download_html = download_response.read().decode('gbk', 'ignore')
        soup_texts = BeautifulSoup(download_html, 'lxml')
        # The chapter body lives in <div id="content" class="showtxt">
        texts = soup_texts.find_all(id='content', class_='showtxt')
        # Strip the non-breaking spaces (\xa0) used for indentation on the page
        soup_text = BeautifulSoup(str(texts), 'lxml').div.text.replace('\xa0', '')
        return soup_text
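
    # Note: Downloader returns the raw chapter text, which still contains the
    # page's '\r' line separators and, it appears, a trailing "http://..." site
    # link; Writer below filters both while appending the chapter to the file,
    # so the two methods are meant to be used together (as in the main block).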
"""
函数说明:将爬取的文章内容写入文件
Parameters:
name - 章节名称(string)
path - 当前路径下,小说保存名称(string)
text - 章节内容(string)
Returns:
无
Modify:
2017-05-06
"""
    def Writer(self, name, path, text):
        write_flag = True
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n\n')
            for each in text:
                # Once an 'h' shows up, stop writing: this apparently drops the
                # trailing "http://www.biqukan.com..." link appended to every chapter
                if each == 'h':
                    write_flag = False
                if write_flag and each != ' ':
                    f.write(each)
                # The page uses '\r' as the line separator; also emit '\n'
                if write_flag and each == '\r':
                    f.write('\n')
            f.write('\n\n')

if __name__ == "__main__":
    print("\n\t\t欢迎使用《笔趣看》小说下载小工具\n\n\t\t作者:Jack-Cui\t时间:2017-05-06\n")
    print("*************************************************************************")
    # Catalogue URL of the novel
    target_url = str(input("请输入小说目录下载地址:\n"))
    # Instantiate the downloader
    d = download(target=target_url)
    name, numbers, url_dict = d.get_download_url()
    # Start from a clean file if an old copy already exists
    if name in os.listdir():
        os.remove(name)
    index = 1
    # Downloading
    print("《%s》下载中:" % name[:-4])
    for key, value in url_dict.items():
        d.Writer(key, name, d.Downloader(value))
        # Progress as a percentage of the chapter count
        sys.stdout.write("已下载:%.3f%%" % (index / numbers * 100) + '\r')
        sys.stdout.flush()
        index += 1
    print("《%s》下载完成!" % name[:-4])