# This file contains the core classes for this simple web crawler.
# Author: Jiang Zhixiang
# Date: 06/18/2019
import requests
from urllib.parse import urljoin, urlparse  # join two URLs and parse a URL
from bs4 import BeautifulSoup


class Spider:
    def __init__(self, rooturl):
        self.to_visit = []         # crawl frontier, used as a FIFO queue
        self.visited = set()       # urls that have already been fetched
        self.extern_links = set()  # external urls seen but not crawled
        rooturl = self.clean_url(rooturl)
        self.parse_result = urlparse(rooturl)
        print(self.parse_result)
        # Let the domain include the path, since sometimes only the pages
        # under that path are wanted.
        self.domain = self.parse_result.netloc + self.parse_result.path
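        # For example, urlparse("http://example.com/docs") (a hypothetical
        # root) gives netloc "example.com" and path "/docs", so self.domain
        # becomes "example.com/docs" and only urls containing that string
        # are treated as internal by url_valid below.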

    def crawl(self, target_url):
        target_url = self.clean_url(target_url)  # clean target_url
        self.to_visit.append(target_url)  # seed the to_visit queue with target_url
        i = 0
        while len(self.to_visit) > 0:
            url = self.to_visit.pop(0)  # pop(0) takes the oldest url, giving breadth-first order
            i += 1
            print("The spider is visiting page %d: %s" % (i, url))
            urls = self.parser_url(url)  # fetch the page and collect all urls it links to
            self.visited.add(url)  # mark this url as visited
            # Add urls from the parser to the to_visit queue
            # when they are neither visited nor already queued.
            for url in urls:
                if url not in self.visited and url not in self.to_visit:
                    self.to_visit.append(url)
        print("OK, the spider has finished crawling the web at {url}".format(url=target_url))
        # Save the internal urls to results.log.
        with open("results.log", "w") as f:
            for url in self.visited:
                f.write(url + "\n")

    def parser_url(self, current_url):
        '''
        Fetch the page at current_url and return all urls found in it.
        '''
        urls = []
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36 QIHU 360SE'
        }
        try:
            # Testing showed some sites refuse requests without a User-Agent
            # header; the timeout makes the ReadTimeout handler below reachable.
            r = requests.get(current_url, headers=headers, timeout=10)
            if r.status_code != 200:
                print("status_code: %d" % r.status_code)
            html = r.text
            soup = BeautifulSoup(html, 'lxml')
            # ******
            # You can insert your own parse function here
            # to extract whatever you need, e.g. soup.get_text().
            # ******
            for link in soup('a'):
                get_a_url = link.get("href")
                if get_a_url is None:
                    continue  # skip <a> tags without an href attribute
                newurl = urljoin(current_url, get_a_url)  # resolve a relative path against the current url
                newurl = self.clean_url(newurl)  # clean up the url
                if self.url_valid(newurl, self.domain):
                    urls.append(newurl)  # append the url to the return list
        except requests.exceptions.ReadTimeout:
            print('Timeout')
        except requests.exceptions.ConnectionError:
            print('Connection error')
        except requests.exceptions.RequestException:
            print('Error')
        return urls

    def parse_html(self, html):
        # Placeholder for content extraction; an html2md or goose3
        # version could go here.
        pass
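
    # A minimal sketch of what parse_html might do, assuming plain
    # BeautifulSoup text extraction (the html2md / goose3 variants noted
    # above are left out). This helper is an illustration added here,
    # not part of the original crawler.
    def parse_html_example(self, html):
        soup = BeautifulSoup(html, 'lxml')
        # Collapse the page to its visible text, one line per block.
        return soup.get_text(separator="\n", strip=True)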

    def url_valid(self, url, domain):
        if url.startswith("http"):
            if domain in url:
                # the domain string appears in the url, so it is likely an internal link
                return True
            # otherwise it is likely an external link: record it but do not crawl it
            self.extern_links.add(url)
            return False
        return False
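
    # For example, with self.domain == "example.com/docs" (a hypothetical
    # root), "http://example.com/docs/page" is treated as internal and
    # crawled, while "http://other.com" is recorded in extern_links and
    # skipped.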

    def clean_url(self, url):
        '''
        Clean up a url by
        - making it always start with "http://" or "https://"
        - removing the "#" fragment used for in-page jumps
        - removing the trailing '/'
        @input:
            url : the url to be processed
        @output:
            url : the cleaned url
        '''
        # Deal with "http(s)://"
        if not url.startswith("http"):
            url = "http://" + url
        # Deal with "#"
        idx = url.find('#')
        if idx != -1:
            url = url[:idx]
        # Deal with the trailing "/"
        url = url.rstrip('/')
        return url
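

# A hypothetical entry point, shown here only as a usage example: crawl a
# site starting from its root and write the internal urls found to
# results.log. "https://example.com" stands in for a real target url.
if __name__ == "__main__":
    spider = Spider("https://example.com")
    spider.crawl("https://example.com")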