-
Notifications
You must be signed in to change notification settings - Fork 3
/
zhihu.py
127 lines (111 loc) · 5.14 KB
/
zhihu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# zhihu.py
# coding:utf-8
import urllib
import urllib2
import re
import thread
import HTMLParser
import sys
from bs4 import BeautifulSoup
from crawler import Crawler
# 解决UnicodeEncodeError: 'ascii' codec can't encode characte
reload(sys)
sys.setdefaultencoding("utf-8")
class ZhiHu:
#构造函数,在类初始化的时候调用
def __init__(self):
#存放程序是否继续运行的变量
self.enable = False
# 初始化问题
self.question = ''
self.user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
#初始化headers
self.headers = { 'User-Agent' : self.user_agent }
def getAnswer(self, input):
self.question = input
pageCode = self.getPage()
# 使用Beautiful soup格式化pageCode,并写入到formatPage.html,方便查看
formatPage = pageCode.prettify()
formatFo = open('formatPage.html', "wb")
formatFo.write(formatPage)
formatFo.close()
pageCode = str(pageCode)
patterCard = re.compile(r'<div class="Card" data-reactid="\d+">(.*?)</div><div class="SearchSideBar"')
cardContent = patterCard.findall(pageCode)[0]
# 将pageCode写进文件(无格式化)
fo = open('page.html', "wb")
fo.write(cardContent)
fo.close()
# 生成每一条显示的list
def generateList(lists):
# 这里必须声明content是使用的全局变量中的content
global content
for match in lists:
url = match[0]
if(url.find('zhuanlan.zhihu.com')>0):
url = 'https:' + url
else:
url = 'https://www.zhihu.com' + url
title = match[1].decode('utf-8')
# 删除<em></em>
title = re.sub(r'<em>|</em>', "", title)
vote = match[2]
comment = match[3]
# 生成一个Crawler对象
itemCotent = Crawler(title, vote, comment, url)
content = content + itemCotent.toString()
# 最后返回的值
global content
content = ''
# 获取搜索结果列表
# 问答:
patternAnswerItem = re.compile(r'<div class="List-item" data-reactid="\d*"><div class="ContentItem AnswerItem".*?<a data-reactid="\d+" href="(/question/\d+/answer/\d+)".*?<span class="Highlight" data-reactid="[0-9]*">(.*?)</span>.*?<button aria-label="赞同" class="Button VoteButton VoteButton--up".*?><svg.*?</svg><!-- react-text: \d+ -->(.*?)<!-- /react-text -->.*?</button>.*?<button class="Button ContentItem-action Button--plain Button--withIcon Button--withLabel".*?</svg></span><!-- react-text: \d+ -->(.*?)<!-- /react-text --></button>')
# 找出所有匹配到的问答
listItems = patternAnswerItem.findall(cardContent, re.M)
# print 'listItems:', listItems
if(len(listItems)>0):
content = '问答:\n'
generateList(listItems)
# 专栏:
# patternArticle = re.compile(r'<div class="List-item" data-reactid="\d*"><div class="ContentItem ArticleItem".*?<a data-reactid="\d+" href="(//zhuanlan.zhihu.com/p/\d+)".*?<span class="Highlight" data-reactid="[0-9]*">(.*?)</span>.*?<button class="Button LikeButton ContentItem-action".*?<svg.*?</svg><!-- react-text: \d+ -->(\d*?)<!-- /react-text --></button><button class="Button ContentItem-action Button--plain Button--withIcon Button--withLabel".*?</svg></span><!-- react-text: \d+ -->(.*?)<!-- /react-text --></button>')
patternArticle = re.compile(r'<div class="List-item" data-reactid="\d*"><div class="ContentItem ArticleItem".*?<a data-reactid="\d+" href="(//zhuanlan.zhihu.com/p/\d+)".*?<span class="Highlight" data-reactid="[0-9]*">(.*?)</span>.*?<button class="Button LikeButton ContentItem-action".*?<svg.*?</svg><!-- react-text: \d+ -->(.*?)<!-- /react-text -->.*?</button>.*?<button class="Button ContentItem-action Button--plain Button--withIcon Button--withLabel".*?</svg></span><!-- react-text: \d+ -->(.*?)<!-- /react-text --></button>')
# 找出所有匹配到的专栏
listArticles = patternArticle.findall(cardContent, re.M)
# print 'listArticles:', listArticles
if(len(listArticles)>0):
content = content + '专栏:\n'
generateList(listArticles)
print content
return content
#获取一次page从知乎
def getPage(self):
try:
url = 'https://www.zhihu.com/search?type=content&q='+urllib.quote(self.question)
# print url
print url
#构建请求的request
request = urllib2.Request(url,headers = self.headers)
#利用urlopen获取页面代码
response = urllib2.urlopen(request)
#将页面转化为UTF-8编码
html_parser = HTMLParser.HTMLParser()
pageCode = response.read().decode('utf-8')
pageCode = html_parser.unescape(pageCode)
# 使用BeautifulSoup格式化html
soup = BeautifulSoup(pageCode, 'lxml')
return soup
except urllib2.URLError, e:
if hasattr(e,"reason"):
print u"连接失败,错误原因",e.reason
return None
def start(self):
self.enable = True
while self.enable:
print 'please enter question which you want to know:'
input = raw_input()
if input == 'Q':
self.enable = False
else:
self.getAnswer(input)
# zhihu = ZhiHu()
# zhihu.start()