diff --git a/app/tasks/parse_scans.py b/app/tasks/parse_scans.py
index 6897d88..16adeb2 100644
--- a/app/tasks/parse_scans.py
+++ b/app/tasks/parse_scans.py
@@ -13,13 +13,16 @@ def parse_scan(queue_id):
     with app.app_context():
         this_onion = this_page = this_url = None
-        # We pass the queue id from the tasker, so this runs immediately on the specific queued item.
+        # We pass the queue id from the tasker, so this runs immediately on
+        # the specific queued item.
        queue_item = ParseQueue.query.filter(ParseQueue.id == queue_id).first()
         if queue_item:
             try:
                 scan_result = json.loads(queue_item.parse_data)
             except:
-                app.logger.critical('Failed to parse the scan results for id {}. JSON loading error'.format(queue_id))
+                app.logger.critical(
+                    'Failed to parse the scan results for id {}. '
+                    'JSON loading error'.format(queue_id))
             try:
                 db.session.delete(queue_item)
                 db.session.commit()
@@ -28,7 +31,8 @@ def parse_scan(queue_id):
             # Don't continue to process if we don't have a scan_result.
             return False
         else:
-            # We don't have any data for some reason, something isn't right, but we'll move on.
+            # We don't have any data for some reason, something isn't right,
+            # but we'll move on.
             return False
 
         # Extract the values from scan_result.
@@ -57,16 +61,19 @@ def parse_scan(queue_id):
         if not this_onion:
             # We couldn't find the onion, which is strange, so we'll skip out
-            app.logger.critical('Could not find onion domain for parsing: {}'.format(domain))
+            app.logger.critical(
+                'Could not find onion domain for parsing: '
+                '{}'.format(domain))
             return False
         if not this_url:
             # We couldn't find the url, which is strange, so we'll skip out
-            app.logger.critical('Could not find url for parsing: {}'.format(url))
+            app.logger.critical(
+                'Could not find url for parsing: {}'.format(url))
             return False
 
         # Process the domain depending on whether the domain is online or not.
         if online:
-            # If the url is online, update onions and set last_online to scan_date,
-            # tries to 0, and offline_scans to 0.
+            # If the url is online, update onions and set last_online to
+            # scan_date, tries to 0, and offline_scans to 0.
             this_onion.last_online = scan_date
             this_onion.scan_date = scan_date
             this_onion.tries = 0
@@ -91,20 +98,30 @@ def parse_scan(queue_id):
                 this_onion.base_url = redirect
             elif not fault:
                 this_onion.base_url = url_page
-            if url_page == this_onion.base_url and not fault:
+            else:
+                # There was a fault, but no redirect.
+                this_onion.base_url = 'Unknown'
+
+            # If this is the onion's base url, set the onion's title.
+            if url_page == this_onion.base_url:
                 # If this is the base url, set the title of the page, but not
-                # if there was a fault with the page.
-                # NOTE: This could result in the domain never having a title,
-                # if the root domain has a fault and no redirect.
-                this_onion.title = this_url.title
+                # if there was a fault with the page. If there was a fault,
+                # set the title to Unknown.
+                if not fault:
+                    this_onion.title = this_url.title
+                else:
+                    this_onion.title = 'Unknown'
 
             # Update the page's hash if the hash is set.
             if hash:
                 this_url.hash = hash
 
+            # If we found a title, update it.
             if title:
                 # Update the url's title.
-                if this_url.title != 'Unknown' and this_url.title != '' and this_url.title != 'none':
+                if this_url.title != 'Unknown' \
+                        and this_url.title != '' \
+                        and this_url.title != 'none':
                     this_url.title = merge_titles(this_url.title, title)
                 else:
                     this_url.title = title
@@ -115,6 +132,7 @@ def parse_scan(queue_id):
                 db.session.commit()
             except:
                 db.session.rollback()
+            # If the url is online and there is no fault, process_url.
             if not fault:
                 process_url(url)
 
@@ -123,8 +141,11 @@ def parse_scan(queue_id):
             if title:
                 if this_page:
                     # Update the page's title.
-                    if this_page.title != 'Unknown' and this_page.title != '' and this_page.title != 'none':
-                        this_page.title = merge_titles(this_page.title, title)
+                    if this_page.title != 'Unknown' \
+                            and this_page.title != '' \
+                            and this_page.title != 'none':
+                        this_page.title = merge_titles(
+                            this_page.title, title)
                     # After merging, if we wind up with an empty title,
                     # just use the base onion's title.
                     if this_page.title == '' \
@@ -138,19 +159,23 @@ def parse_scan(queue_id):
                 db.session.commit()
             except:
                 db.session.rollback()
 
+            # For every new_url in the new_urls list, add_to_queue the url.
             for new_url in new_urls:
                 add_to_queue(new_url, domain)
+
         else:
             # If the url is offline, increment tries. If tries >= 3, set
-            # tries = 0 and onion as offline, then set offline_scans += 1. Then set
-            # the onion scan_date to the current date + offline_scans.
+            # tries = 0 and onion as offline, then set offline_scans += 1.
+            # Then set the onion scan_date to the current date + offline_scans.
             this_onion.tries += 1
             if this_onion.tries >= 3:
                 this_onion.offline_scans += 1
                 this_onion.tries = 0
+                this_onion.online = False
             # Set the scan date and last node
-            this_onion.scan_date = (scan_date + timedelta(days=this_onion.offline_scans)).strftime('%Y-%m-%d')
+            this_onion.scan_date = (scan_date + timedelta(
+                days=this_onion.offline_scans)).strftime('%Y-%m-%d')
             this_onion.last_node = last_node
             # Set the date of the url to scan_date.
             this_url.date = this_onion.scan_date
@@ -176,6 +201,7 @@ def add_to_queue(link_url, origin_domain):
         return
     add_onion(link_domain)
     add_url(link_domain, link_url)
+    add_url(link_domain, get_base(link_url))
     add_link(origin_domain, link_domain)
 
 
@@ -244,7 +270,8 @@ def add_form(page, field):
     insert_stmt = insert(Forms).values(
         page=page,
         field=field)
-    do_nothing_stmt = insert_stmt.on_conflict_do_nothing(index_elements=['page', 'field'])
+    do_nothing_stmt = insert_stmt.on_conflict_do_nothing(
+        index_elements=['page', 'field'])
     db.engine.execute(do_nothing_stmt)
     try:
         db.session.commit()
@@ -257,7 +284,8 @@ def add_onion(link_domain):
     # Only add a domain if the domain isn't already in the database.
     insert_stmt = insert(Onions).values(
         domain=link_domain)
-    do_nothing_stmt = insert_stmt.on_conflict_do_nothing(index_elements=['domain'])
+    do_nothing_stmt = insert_stmt.on_conflict_do_nothing(
+        index_elements=['domain'])
     db.engine.execute(do_nothing_stmt)
     try:
         db.session.commit()
@@ -285,7 +313,8 @@ def add_url(link_domain, link_url):
     insert_stmt = insert(Urls).values(
         domain=link_domain,
         url=link_url)
-    do_nothing_stmt = insert_stmt.on_conflict_do_nothing(index_elements=['domain', 'url'])
+    do_nothing_stmt = insert_stmt.on_conflict_do_nothing(
+        index_elements=['domain', 'url'])
     db.engine.execute(do_nothing_stmt)
     try:
         db.session.commit()
@@ -302,7 +331,8 @@ def add_link(origin_domain, link_domain):
     insert_stmt = insert(Links).values(
         domain_from=origin_domain,
         domain_to=link_domain)
-    do_nothing_stmt = insert_stmt.on_conflict_do_nothing(index_elements=['domain_from', 'domain_to'])
+    do_nothing_stmt = insert_stmt.on_conflict_do_nothing(
+        index_elements=['domain_from', 'domain_to'])
     db.engine.execute(do_nothing_stmt)
     try:
         db.session.commit()
@@ -320,7 +350,9 @@ def get_form(page, field):
 def update_form(page, field, examples):
     # Update the forms table, filling in examples for the specified field.
     try:
-        update = Forms.query.filter(Forms.page == page, Forms.field == field).first()
+        update = Forms.query.filter(
+            Forms.page == page,
+            Forms.field == field).first()
         update.examples = examples
         db.session.merge(update)
         db.session.commit()
diff --git a/app/tasks/populate_url_queue.py b/app/tasks/populate_url_queue.py
index 138be44..42a09aa 100644
--- a/app/tasks/populate_url_queue.py
+++ b/app/tasks/populate_url_queue.py
@@ -13,10 +13,25 @@ def repopulate_queue():
     if url_count > 100:
         # Do nothing
         return True
+    # Empty the current table and re-build the queue. We are emptying because
+    # some of the items in there may no longer be 'good'.
+    db.session.query(UrlQueue).delete()
     week_ago = (date.today() - timedelta(days=7))
     day_ago = (date.today() - timedelta(days=1))
+    # Force unscanned onions first, so we can get better statistics about
+    # which onions are online if we find new ones.
+    unscanned_onions = db.session.query(Onions).filter(
+        Onions.scan_date == date(1900, 1, 1)
+    ).limit(1000).all()
+    unscanned_list = []
+    for onion in unscanned_onions:
+        # Grab the first matching url for the onion and add it to the scan list.
+        url = db.session.query(Urls.url).filter(
+            Urls.domain == onion.domain).first()
+        if not url or not url.url.startswith('http'):
+            continue
+        unscanned_list.append(url.url)
     # It seems we don't need to populate the queue with millions of Urls if we repopulate it every 5 min or so.
-    # 10k should be ok. Can be adjusted later.
+    # 1k should be ok. Can be adjusted later.
     candidates = db.session.query(Urls.url).join(Onions).filter(
         or_(
             and_(
@@ -31,10 +46,14 @@ def repopulate_queue():
             )
         )
     ).order_by(db.func.random()).limit(1000).all()
-    # Empty the current table and re-build the queue.
-    # We are emptying because some of the items in there may no longer be 'good'
-    # Empty the queue
-    db.session.query(UrlQueue).delete()
+    for candidate in unscanned_list:
+        try:
+            # Rebuild the queue, unscanned urls first.
+            q = UrlQueue()
+            q.url = candidate
+            db.session.merge(q)
+        except:
+            db.session.rollback()
     for candidate in candidates:
         # Let's not queue non-http urls for now.
         if not candidate.url.startswith('http'):
diff --git a/app/useful_functions.py b/app/useful_functions.py
index 7fa010f..56b6de2 100644
--- a/app/useful_functions.py
+++ b/app/useful_functions.py
@@ -21,6 +21,12 @@ def fix_url(url):
     return url.replace('\x00', '')
 
 
+def get_base(url):
+    # Get the base url (scheme and netloc only) from the given url.
+    (scheme, netloc, path, query, fragment) = urlsplit(url)
+    return urlunsplit((scheme, netloc, '', '', ''))
+
+
 def get_domain(url):
     # Get the defragmented domain of the given url.
     # Omit subdomains. Rather than having separate records for urls
diff --git a/app/views/public_api.py b/app/views/public_api.py
new file mode 100644
index 0000000..d8c2477
--- /dev/null
+++ b/app/views/public_api.py
@@ -0,0 +1,116 @@
+from flask import jsonify
+from flask import request
+from flask import abort
+from app import app, db
+from app.helpers import check_api_auth
+from app.useful_functions import *
+from app.models import Onions, Urls, UrlQueue
+from urllib.parse import urlsplit
+import datetime
+import json
+
+# Public API calls. These are available without authentication, so that the
+# general public can access them.
+
+
+@app.route('/api/onion_info', methods=['GET'])
+def onion_info():
+    # Get the basic information about the specified onion.
+
+    # Are we authenticated?
+    # NOTE: As of right now, the returned information is the same regardless
+    # of authentication. However, in the future, we might want to return
+    # less data to clients without authentication.
+    authenticated = False
+    if check_api_auth():
+        # We'll provide more information in an authenticated request.
+        authenticated = True
+
+    try:
+        # Get the requested url.
+        onion_request = json.loads(request.args.get('q'))['node_name']
+    except:
+        # Invalid request.
+        abort(400)
+
+    # Get the base onion domain.
+    parts = onion_request.split('/')
+    for part in parts:
+        if part.endswith('.onion'):
+            onion_request = part
+            break
+
+    try:
+        # Get the Onion's data and send it off.
+        onion = Onions.query.filter(Onions.domain == onion_request).first()
+        return_value = {
+            'domain': onion.domain,
+            'online': onion.online,
+            'last_online': onion.last_online,
+            'scan_date': onion.scan_date,
+            'base_url': onion.base_url,
+            'title': onion.title
+        }
+        return json.dumps({'objects': return_value})
+    except:
+        # If there's an error, return nothing.
+        return jsonify({"objects": []})
+
+
+@app.route('/api/submit_url', methods=['PUT', 'POST'])
+def submit_url():
+    # Add a new url to the database if it doesn't already exist.
+
+    try:
+        # Get the requested url.
+        submitted_url = json.loads(request.args.get('q'))['node_name']
+    except:
+        # Invalid request.
+        abort(400)
+
+    try:
+        # Add the url and its base onion to the list of urls to be scanned.
+
+        # Add the url as-is.
+        new_url = Urls()
+        new_url.url = submitted_url
+
+        # Add the base onion as both http and https.
+        onion_url_http = Urls()
+        onion_url_https = Urls()
+        parts = submitted_url.split('/')
+        for part in parts:
+            if part.endswith('.onion'):
+                onion_url_http.url = 'http://{}/'.format(part)
+                onion_url_https.url = 'https://{}/'.format(part)
+                break
+
+        result = {
+            'URL': 'Success',
+            'HTTP': 'Success',
+            'HTTPS': 'Success'
+        }
+        # Try adding the new_url.
+        try:
+            db.session.add(new_url)
+            db.session.commit()
+        except:
+            db.session.rollback()
+            result['URL'] = 'Failure'
+        # Try adding the onion_url_http.
+        try:
+            db.session.add(onion_url_http)
+            db.session.commit()
+        except:
+            db.session.rollback()
+            result['HTTP'] = 'Failure'
+        # Try adding the onion_url_https.
+        try:
+            db.session.add(onion_url_https)
+            db.session.commit()
+        except:
+            db.session.rollback()
+            result['HTTPS'] = 'Failure'
+
+        return json.dumps({'objects': result})
+    except:
+        # If there's an error, return nothing.
+        return jsonify({"objects": []})
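
For reference, a rough sketch of how a client might exercise the two new endpoints added in app/views/public_api.py. The localhost:5000 base address, the example .onion name, and the use of the requests package are illustrative assumptions, not part of this change:

# Illustrative client sketch only; assumes the app is served at localhost:5000
# and that the requests package is installed. The .onion address is made up.
import json
import requests

BASE = 'http://localhost:5000'

# Look up the basic information stored for an onion domain.
query = json.dumps({'node_name': 'exampleonionaddress.onion'})
info = requests.get(BASE + '/api/onion_info', params={'q': query})
print(info.json())

# Submit a url; the endpoint also stores the bare onion over http and https.
query = json.dumps({'node_name': 'http://exampleonionaddress.onion/index.html'})
submitted = requests.post(BASE + '/api/submit_url', params={'q': query})
print(submitted.json())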