-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsite_crawler.py
179 lines (161 loc) · 7.78 KB
/
site_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
:copyright: (c) 2014 by Vojtech Burian
:license: MIT, see LICENSE for more details.
"""
import ConfigParser
import time
import os
from requests.exceptions import HTTPError
import requests
from unittestzero import Assert
from selenium import webdriver
class TestSiteCrawler():
""" automated website link & image checking bot """
def setup_class(self):
# load crawler variables
config = ConfigParser.ConfigParser()
config_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'site_crawler.cfg')
config.read(config_file)
test_vars = dict(config.defaults())
# set crawler configuration variables
self.default_implicit_wait = test_vars.get('default_implicit_wait')
self.base_url = test_vars.get('base_url')
self.acceptable_url_substrings = [item for item in test_vars.get('acceptable_url_substrings').split(';')]
self.invalid_chars = [item for item in test_vars.get('invalid_chars').split(';')]
self.ignore_url_substrings = [item for item in test_vars.get('ignore_url_substrings').split(';')]
self.image_time_delay = int(test_vars.get('image_time_delay'))
self.accept_certs = bool(test_vars.get('accept_ssl_certificates'))
http_auth_username = test_vars.get('http_auth_username')
http_auth_password = test_vars.get('http_auth_password')
if http_auth_username == '' or http_auth_password == '':
self.http_auth = None
else:
self.http_auth = (http_auth_username, http_auth_password)
# crawler state variables
self.links_visited = [self.base_url]
self.invalid_urls = []
self.error_links = []
self.images_not_loaded = []
# set browser capabilities
capabilities = {}
if self.accept_certs:
capabilities['acceptSslCerts'] = True
# start browser
self.driver = webdriver.Chrome(desired_capabilities=capabilities)
self.driver.get(self.base_url)
def teardown_class(self):
self.driver.quit()
def test_unleash_bot(self):
""" tests links on page """
self.check_links()
self.report_failures()
def check_links(self):
""" recursively checks links on websites and checks whether images have been loaded properly """
links_objects = self.driver.find_elements_by_tag_name('a')
valid_links_on_page = []
# collect valid links on page for testing
for link in links_objects:
url = link.get_attribute('href')
if url is not None and self.is_url_valid(url):
valid_links_on_page.append(link.get_attribute('href'))
# start testing each link with valid url
for link in valid_links_on_page:
if link not in self.links_visited:
self.links_visited.append(link)
# test link if request does not return invalid response
if self.is_link_response_ok(link):
print 'Visiting: ' + link
self.driver.get(link)
self.wait_for_page_to_load()
self.check_images()
# recursively crawl test links found on this page
self.check_links()
self.driver.back()
def report_failures(self):
""" makes assertion fail if there had been any kind of failures reported """
result_invalid_urls = 'No invalid URLs detected.\n'
result_invalid_images = 'All images were loaded.\n'
result_error_links = 'All links return correct status code.\n'
report_failure = False
if len(self.invalid_urls) > 0:
report_failure = True
result_invalid_urls = 'Invalid URLs detected:\n\n'
for item in self.invalid_urls:
result_invalid_urls += item[1] + ' (' + item[0] + ')\n'
if len(self.images_not_loaded) > 0:
report_failure = True
result_invalid_images = 'Following images were not loaded:\n\n'
for item in self.images_not_loaded:
result_invalid_images += item[1] + ' (' + item[0] + ')\n'
if len(self.error_links) > 0:
report_failure = True
result_error_links = 'Following links returned bad status codes:\n\n'
for item in self.error_links:
result_error_links += item[1] + ' (' + item[0] + ')\n'
result_message = result_invalid_urls + result_error_links + result_invalid_images
if report_failure:
Assert.fail(result_message)
def is_url_valid(self, url):
""" checks whether url is valid and whether browser should try to load it """
url_valid = True
# excludes urls with values not required for testing
for invalid_item in self.ignore_url_substrings:
if invalid_item in url:
url_valid = False
# excludes urls that do not contain acceptable substrings (links leading to different domains)
if url_valid:
url_acceptable = False
for substring in self.acceptable_url_substrings:
if substring in url:
url_acceptable = True
if not url_acceptable:
url_valid = False
# reports urls with invalid characters
if url_valid:
for item in self.invalid_chars:
if item in url and url not in self.invalid_urls:
self.invalid_urls.append([self.driver.title + ' - ' + self.driver.current_url, url])
url_valid = False
# reports empty urls with invalid characters
if url_valid and url == '':
if url not in self.invalid_urls:
self.invalid_urls.append([self.driver.title + ' - ' + self.driver.current_url, url])
url_valid = False
return url_valid
def is_link_response_ok(self, url):
""" checks if request to link does not return invalid error code """
response = requests.get(url, auth=self.http_auth, verify=(not self.accept_certs))
is_ok = True
try:
response.raise_for_status()
except HTTPError:
self.error_links.append([self.driver.title + ' - ' + self.driver.current_url, url])
is_ok = False
return is_ok
def wait_for_page_to_load(self):
""" waits for page to load properly; important mainly for checking images """
# time.sleep(self.image_time_delay)
self.driver.implicitly_wait(self.default_implicit_wait)
def check_images(self):
""" checks all images on the pages and verifies if they have been properly loaded;
if some images are not loaded yet, script waits for certain amount of time and then tries again """
images_not_loaded = self.check_images_are_loaded()
if len(images_not_loaded) != 0:
time.sleep(self.image_time_delay)
images_not_loaded = self.check_images_are_loaded()
if len(images_not_loaded) != 0:
self.images_not_loaded.extend(images_not_loaded)
def check_images_are_loaded(self):
""" checks all images on the pages and verifies if they have been properly loaded """
images_not_loaded = []
for image in self.driver.find_elements_by_tag_name('img'):
script = 'return arguments[0].complete && typeof arguments[0].naturalWidth' \
' != "undefined" && arguments[0].naturalWidth > 0'
image_loaded = bool(self.driver.execute_script(script, image))
if not image_loaded:
if image.get_attribute('src') is not None:
images_not_loaded.append(
[self.driver.title + ' - ' + self.driver.current_url, str(image.get_attribute('src'))])
return images_not_loaded