spider.py
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup as bs
from link_finder import getpage
from link_finder import getLinks
from crawler1 import *
class spider:
    # Static (class-level) variables shared by every spider instance
    directory = ""
    baseUrl = ""
    domainName = ""
    queueFile = ""
    crawledFile = ""
    queue = set()
    crawled = set()
    def __init__(self, directory, baseUrl, domainName=""):
        spider.directory = directory
        spider.baseUrl = baseUrl
        spider.domainName = domainName
        spider.queueFile = directory + '/' + 'queue.txt'
        spider.crawledFile = directory + '/' + 'crawled.txt'
        self.boot()  # initialise the project directory, queue file and crawled file
        self.crawl('First spider', spider.baseUrl)
    def boot(self):
        # Create the project directory and data files, then load them into the in-memory sets
        create_project_directory(spider.directory)
        create_datafiles(spider.directory, spider.baseUrl)
        spider.queue = file_to_set(spider.queueFile)
        spider.crawled = file_to_set(spider.crawledFile)
    def crawl(self, thread_name, url):
        if url not in spider.crawled:
            print(thread_name + " now crawling " + url)
            print(str(len(spider.queue)) + " links waiting to be crawled")
            print(str(len(spider.crawled)) + " links have been crawled")
            links = spider.gatherlinks(url, spider.baseUrl)
            if links is None:
                print("No links in current url " + url)
            spider.add_to_queue(links)
            spider.queue.discard(url)  # discard() will not raise if the url was never queued
            spider.crawled.add(url)
            spider.update_files()
    @staticmethod
    def gatherlinks(url, baseUrl):
        # Fetch the page and parse it; return its links, or None if the page could not be read
        page = getpage(url)
        if page is not None:
            html = page.read()
            if html is not None:
                bsObj = bs(html, "lxml")
                links = getLinks(bsObj, baseUrl)
                return links
        return None
    @staticmethod
    def add_to_queue(links):
        # Queue only links that are new, not yet crawled and within the target domain
        if links is not None:
            for link in links:
                if link in spider.queue:
                    continue
                if link in spider.crawled:
                    continue
                if spider.domainName not in link:
                    continue
                spider.queue.add(link)
    @staticmethod
    def update_files():
        # Persist the in-memory queue and crawled sets back to their files
        set_to_file(spider.queue, spider.queueFile)
        set_to_file(spider.crawled, spider.crawledFile)

"""if __name__ == "__main__":
spider1 = spider('crawlers' , "https://en.wikipedia.org")"""
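
The helper functions imported from crawler1 and link_finder are not shown in this file. Below is a minimal sketch of what they might look like, with names, signatures and behaviour inferred only from how spider.py calls them; the real modules in this project may differ.

# Hypothetical sketch of the helpers spider.py relies on (signatures inferred, not confirmed).
import os
from urllib.request import urlopen
from urllib.parse import urljoin

def create_project_directory(directory):
    # Make the project folder if it does not exist yet
    if not os.path.exists(directory):
        os.makedirs(directory)

def create_datafiles(directory, baseUrl):
    # Seed queue.txt with the base url and create an empty crawled.txt
    queue = directory + '/' + 'queue.txt'
    crawled = directory + '/' + 'crawled.txt'
    if not os.path.isfile(queue):
        with open(queue, 'w') as f:
            f.write(baseUrl)
    if not os.path.isfile(crawled):
        open(crawled, 'w').close()

def file_to_set(fileName):
    # Read a file into a set, one link per line
    links = set()
    with open(fileName, 'r') as f:
        for line in f:
            if line.strip():
                links.add(line.strip())
    return links

def set_to_file(links, fileName):
    # Write a set back to its file, one link per line
    with open(fileName, 'w') as f:
        for link in sorted(links):
            f.write(link + '\n')

def getpage(url):
    # Fetch a url, returning None instead of raising on failure
    try:
        return urlopen(url)
    except Exception:
        return None

def getLinks(bsObj, baseUrl):
    # Collect absolute hrefs from the parsed page
    links = set()
    for anchor in bsObj.find_all('a', href=True):
        links.add(urljoin(baseUrl, anchor['href']))
    return links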