From 3d7fb1f2b0e1d102342995a6e08431b67db1be60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Sagnes?= Date: Sat, 27 Apr 2019 14:37:49 +0100 Subject: [PATCH] Handle domain parsing correctly #10 --- build_rules.py | 28 +++++++++++++++++++++------- test_build_rules.py | 5 +++++ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/build_rules.py b/build_rules.py index 24cfab4..eb4694e 100755 --- a/build_rules.py +++ b/build_rules.py @@ -13,6 +13,16 @@ OUTPUT_HOSTS_PATH = 'hosts.txt' OUTPUT_DOMAINS_PATH = 'domains.txt' BLACKHOLE_IP = '0.0.0.0' +DOMAIN_EXTENSIONS_URL = 'https://publicsuffix.org/list/public_suffix_list.dat' +DOMAIN_EXTENSIONS = None + +def get_domain_extensions(): + global DOMAIN_EXTENSIONS + + if DOMAIN_EXTENSIONS == None: + DOMAIN_EXTENSIONS = frozenset(parse_host_file(DOMAIN_EXTENSIONS_URL)) + + return DOMAIN_EXTENSIONS def download_file(url): request = urllib2.Request(url) @@ -33,12 +43,16 @@ def cleanup_domain_line(line): return line def is_domain(domain): - # Take .co.uk into account? - if domain.count('.') > 1: - return False - else: + if not '.' in domain: + return True + + extension = domain[domain.index('.') + 1:] + + if extension in get_domain_extensions(): return True + return False + def parse_domain_line(line): original_line = line line = cleanup_domain_line(line) @@ -71,7 +85,7 @@ def parse_host_file(url): yield domain if not found_domains: - raise Exception('Couldn\'t find any domains in that URL') + raise Exception('Couldn\'t find any domains in URL %s' % url) def output_hosts(ads_lists_ulrs=FIREBOG_CONSERVATIVE_URLS_LIST, output_hosts_path=OUTPUT_HOSTS_PATH, output_domains_path=OUTPUT_DOMAINS_PATH, blackhole_ip=BLACKHOLE_IP): ads_lists = download_ads_list_urls(ads_lists_ulrs) @@ -87,9 +101,9 @@ def output_hosts(ads_lists_ulrs=FIREBOG_CONSERVATIVE_URLS_LIST, output_hosts_pat with open(output_domains_path, 'w') as domains_file: for domain in domains: if is_domain(domain): - hosts_file.write('%s %s\n' % (blackhole_ip, domain)) - else: domains_file.write('%s %s\n' % (blackhole_ip, domain)) + else: + hosts_file.write('%s %s\n' % (blackhole_ip, domain)) print 'Wrote %d host names in %s and %s' % (len(domains), output_hosts_path, output_domains_path) diff --git a/test_build_rules.py b/test_build_rules.py index 59325f2..7664f97 100755 --- a/test_build_rules.py +++ b/test_build_rules.py @@ -55,6 +55,11 @@ def test_Hostsfile(self): self.assertEqual(parse_domain_line('127.0.0.1 005.free-counter.co.uk'), '005.free-counter.co.uk') self.assertEqual(parse_domain_line('127.0.0.1 118d654612df63bc8395-aecfeaabe29a34ea9a877711ec6d8aed.r37.cf2.rackcdn.com'), '118d654612df63bc8395-aecfeaabe29a34ea9a877711ec6d8aed.r37.cf2.rackcdn.com') + def test_get_domain_extensions(self): + domain_extensions = get_domain_extensions() + self.assertTrue('com' in domain_extensions) + self.assertTrue('co.uk' in domain_extensions) + def test_is_domain(self): self.assertTrue(is_domain('google.com')) self.assertFalse(is_domain('www.google.com'))