Merge pull request #34 from uccser/release/1.4.1
Release 1.4.1
eAlasdair authored Jun 8, 2020
2 parents a48e5fc + 61ddbfe commit fd9bdda
Showing 3 changed files with 66 additions and 9 deletions.
7 changes: 7 additions & 0 deletions README.rst
@@ -81,6 +81,13 @@ more details.
Changelog
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

1.4.1
------------------------------------------------------------------------------

- Add logic to delay Linkie's requests to a domain if it responds with status code 429.
- Reorganise logging output so that the URL is printed last.
- Update dependencies.

1.4.0
------------------------------------------------------------------------------

66 changes: 58 additions & 8 deletions linkie/linkie.py
@@ -6,7 +6,9 @@
import yaml
import logging
import requests
import time
from multiprocessing.dummy import Pool as ThreadPool
from multiprocessing.dummy import Lock

# This isn't a perfect URL matcher, but should catch the large majority of URLs.
# This now matches URLs presented in the format defined in the CSU Writing Guide
@@ -18,17 +20,21 @@
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'
}
THREADS = 12
TIMEOUT = 60 #s


class Linkie:

def __init__(self, config=None, config_file_path=None):
self.file_count = 0
self.status_counts = {}
self.urls = dict()
self.unchecked_urls = set()
self.urls = dict() # Dictionary of URLs that have been checked, with their broken status and status code
self.domains = dict() # Dictionary of URL domains and when they were last requested from (429 code)
self.unchecked_urls = set() # Initial set of urls to be checked
self.delayed_urls = [] # List of urls to be checked later (429 code)
self.directory = '.'
self.pool = ThreadPool(THREADS)
self.lock = Lock()
if not config and config_file_path:
logging.info('Using Linkie configuration file {}'.format(config_file_path))
config = self.read_config(config_file_path)
@@ -90,6 +96,13 @@ def process_config(self, config):
file_types[i] = '.' + file_types[i]
config['file-types'] = tuple(file_types)
return config

def get_domain(self, url):
# Return everything before the third /
# e.g. https://example.com/subpage/?hello-there&general-kenobi
# becomes https://example.com
url_parts = url.split('/')
return '/'.join(url_parts[:3])

def count_broken_links(self):
count = 0
@@ -122,6 +135,19 @@ def traverse_directory(self):
self.search_file(os.path.join(directory_root, filename))
self.pool.map(self.check_link, self.unchecked_urls)

repeat_count = 1
max_repeats = 1000000
while len(self.delayed_urls) > 0 and repeat_count <= max_repeats:
# Many iterations are expected: while a domain's timeout is still running, the pool map returns immediately and the loop repeats
# Only uncomment this line if debugging locally
# print('Retrying delayed urls **MANY ITERATIONS ARE EXPECTED** #{}'.format(repeat_count), end='\r')
repeat_urls = self.delayed_urls[:]
self.delayed_urls = []
self.pool.map(self.check_link, repeat_urls)
repeat_count += 1
if repeat_count > max_repeats:
logging.critical("Infinite loop in retrying delayed urls. The timeout period can't have ended!")

def traverse_connection_errors(self):
connect_errors = []
for url, url_data in self.urls.items():
@@ -151,12 +177,26 @@ def search_file(self, file_path):
# [Wikipedia link](http://foo.com/blah_blah_(wikipedia))
if url.count('('):
url += url.count('(') * ')'
self.domains[self.get_domain(url)] = -1
self.unchecked_urls.add(url)

def check_link(self, url):
message = ' - Checking URL {} '.format(url)
domain = self.get_domain(url)
self.lock.acquire()
time_at_429 = self.domains[domain]
is_ready = time_at_429 < 0 or time.perf_counter() - time_at_429 > TIMEOUT
if is_ready:
self.domains[domain] = -1
self.lock.release()
if not is_ready:
# Put the url back to be checked later
self.lock.acquire()
self.delayed_urls.append(url)
self.lock.release()
return
message = ' - '
if url in self.config['skip-urls']:
message += '= skipping URL (as defined in config file)'
message += 'Skipping URL (as defined in config file)'
elif url not in self.urls:
try:
status_code = requests.head(url, headers=HEADERS).status_code
@@ -168,16 +208,26 @@ def check_link(self, url):
status_code = str(type(e).__name__)

if type(status_code) == str:
message += '= {}'.format(status_code)
message += '{}'.format(status_code)
else:
message += '= {} status'.format(status_code)
message += 'Status {}'.format(status_code)

if type(status_code) == str or status_code >= 400:
self.save_url(url, status_code, True)
if status_code == 429: # Too many requests
message += " => Delaying requests to the domain {} for {} seconds".format(domain, TIMEOUT)
self.lock.acquire()
# Save the time the request was made
self.domains[domain] = time.perf_counter()
# Put the url back to be checked again later
self.delayed_urls.append(url)
self.lock.release()
else:
self.save_url(url, status_code, True)
else:
self.save_url(url, status_code, False)
else:
message += '= {} (already checked)'.format(self.urls[url]['status'])
message += '{} (already checked)'.format(self.urls[url]['status'])
message += ' = {}'.format(url)
logging.info(message)

def save_url(self, url, status_code, broken):
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,2 +1,2 @@
PyYaml==5.1.1
PyYaml==5.3.1
requests==2.22.0
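
For illustration only: the 429 back-off described in the 1.4.1 changelog, and implemented in linkie/linkie.py above, amounts to remembering per domain when a 429 was last received and skipping that domain until a timeout has elapsed. Below is a minimal standalone sketch of that pattern; the names (DOMAIN_TIMEOUT, last_429, should_delay, note_429) are illustrative and are not part of this commit.

import time
from threading import Lock

DOMAIN_TIMEOUT = 60  # seconds to wait before retrying a domain that answered 429

last_429 = {}  # domain -> perf_counter() timestamp of its most recent 429 response
lock = Lock()

def get_domain(url):
    # Keep everything before the third '/', e.g.
    # 'https://example.com/subpage?x=1' -> 'https://example.com'
    return '/'.join(url.split('/')[:3])

def should_delay(url):
    """Return True if the URL's domain sent a 429 less than DOMAIN_TIMEOUT seconds ago."""
    with lock:
        stamp = last_429.get(get_domain(url), -1)
        return stamp >= 0 and time.perf_counter() - stamp <= DOMAIN_TIMEOUT

def note_429(url):
    """Record when a domain answered 429 so later checks of its URLs can be postponed."""
    with lock:
        last_429[get_domain(url)] = time.perf_counter()

In the commit itself, Linkie keeps these timestamps in self.domains, guards them with a Lock from multiprocessing.dummy, and re-queues postponed URLs in self.delayed_urls for the retry loop in traverse_directory.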
