Skip to content

Commit

Permalink
Merge branch 'InternetHealthReport:main' into test
Browse files Browse the repository at this point in the history
  • Loading branch information
MAVRICK-1 authored Feb 7, 2024
2 parents 0dbf98c + c9ce0ed commit 310340a
Show file tree
Hide file tree
Showing 17 changed files with 396 additions and 113 deletions.
9 changes: 4 additions & 5 deletions config.json.example
Original file line number Diff line number Diff line change
Expand Up @@ -63,26 +63,26 @@
"iyp.crawlers.peeringdb.ix",
"iyp.crawlers.cloudflare.top100",
"iyp.crawlers.tranco.top1M",
"iyp.crawlers.cisco.umbrella_top1M",
"iyp.crawlers.openintel.tranco1m",
"iyp.crawlers.openintel.umbrella1m",
"iyp.crawlers.openintel.infra_ns",
"iyp.crawlers.openintel.infra_mx",
"iyp.crawlers.cisco.umbrella_top1M",
"iyp.crawlers.citizenlab.urldb",
"iyp.crawlers.inetintel.as_org",
"iyp.crawlers.pch.daily_routing_snapshots_v4",
"iyp.crawlers.pch.daily_routing_snapshots_v6",
"iyp.crawlers.emileaben.as_names",
"iyp.crawlers.ripe.atlas_probes",
"iyp.crawlers.ripe.atlas_measurements",
"iyp.crawlers.iana.root_zone",
"iyp.crawlers.alice_lg.amsix",
"iyp.crawlers.alice_lg.bcix",
"iyp.crawlers.alice_lg.decix",
"iyp.crawlers.alice_lg.ixbr",
"iyp.crawlers.alice_lg.linx",
"iyp.crawlers.alice_lg.megaport",
"iyp.crawlers.alice_lg.netnod",
"iyp.crawlers.openintel.dns_dependency_nl",
"iyp.crawlers.openintel.dns_dependency_jp",
"iyp.crawlers.cloudflare.dns_top_locations",
"iyp.crawlers.cloudflare.dns_top_ases"
],
Expand All @@ -91,8 +91,7 @@
"iyp.post.ip2prefix",
"iyp.post.address_family",
"iyp.post.country_information",
"iyp.post.dns_hierarchy",
"iyp.post.url2domain"
"iyp.post.url2hostname"
]
}
}
8 changes: 5 additions & 3 deletions iyp/crawlers/cisco/README.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
# Cisco Umbrella -- https://umbrella-static.s3-us-west-1.amazonaws.com/index.html

The popularity list contains most queried domains based on passive DNS usage across the Umbrella global network.
The popularity list contains the most queried domains (ranging from TLDs to FQDNs)
based on passive DNS usage across the Umbrella global network.

IYP uses this data to create and annotate DomainName nodes.
IYP uses this data to create and annotate DomainName and HostName nodes.

## Graph representation

The rank of the domain is indicated by the `rank` property of the relationship.

```Cypher
(:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
(:HostName {name: 'www.google.com'})-[:RANK {rank: 8}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
```

## Dependence

This crawler is not depending on other crawlers.
This crawler depends on `openintel.umbrella1m`.
61 changes: 53 additions & 8 deletions iyp/crawlers/cisco/umbrella_top1M.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from zipfile import ZipFile

import requests
import tldextract

from iyp import BaseCrawler, RequestStatusError

Expand All @@ -22,31 +23,75 @@ def run(self):

self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'})

sys.stderr.write('Downloading latest list...\n')
logging.info('Downloading latest list...')
req = requests.get(URL)
if req.status_code != 200:
raise RequestStatusError('Error while fetching Cisco Umbrella Top 1M csv file')

links = []
domains = set()
# open zip file and read top list
with ZipFile(io.BytesIO(req.content)) as z:
with z.open('top-1m.csv') as list:
for i, row in enumerate(io.TextIOWrapper(list)):
with z.open('top-1m.csv') as top_list:
for i, row in enumerate(io.TextIOWrapper(top_list)):
row = row.rstrip()
rank, domain = row.split(',')

domains.add(domain)
links.append({'src_name': domain, 'dst_id': self.cisco_qid,
'props': [self.reference, {'rank': int(rank)}]})

name_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains)
logging.info('Fetching DomainName/HostName nodes...')
domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name')
host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name')

# Umbrella mixes up domain and host names.
# In order of preference, we rank:
# 1) existing domain name
# 2) existing host name
# 3) do our best to figure out if it is a domain or host and create the
# corresponding node

new_domain_names = set()
new_host_names = set()
unprocessed_links = list()
processed_links = list()

logging.info('Building relationships...')
for link in links:
link['src_id'] = name_id[link['src_name']]
if link['src_name'] in domain_id:
link['src_id'] = domain_id[link['src_name']]
processed_links.append(link)
elif link['src_name'] in host_id:
link['src_id'] = host_id[link['src_name']]
processed_links.append(link)
else:
unprocessed_links.append(link)
ranked_thing = tldextract.extract(link['src_name'])
name = link['src_name']
if name == ranked_thing.registered_domain:
new_domain_names.add(name)
else:
new_host_names.add(name)

if new_domain_names:
logging.info(f'Pushing {len(new_domain_names)} additional DomainName nodes...')
domain_id.update(self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', new_domain_names, all=False))
if new_host_names:
logging.info(f'Pushing {len(new_host_names)} additional HostName nodes...')
host_id.update(self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', new_host_names, all=False))

for link in unprocessed_links:
if link['src_name'] in domain_id:
link['src_id'] = domain_id[link['src_name']]
elif link['src_name'] in host_id:
link['src_id'] = host_id[link['src_name']]
else:
logging.error(f'Missing DomainName/HostName node for name "{link["src_name"]}". Should not happen.')
continue
processed_links.append(link)

# Push all links to IYP
self.iyp.batch_add_links('RANK', links)
logging.info(f'Pushing {len(processed_links)} RANK relationships...')
self.iyp.batch_add_links('RANK', processed_links)


def main() -> None:
Expand Down
8 changes: 6 additions & 2 deletions iyp/crawlers/cloudflare/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Cloudflare Radar -- https://radar.cloudflare.com/
# Cloudflare Radar -- https://radar.cloudflare.com/

Cloudflare uses aggregated and anonymized DNS queries to their `1.1.1.1` public resolver service to
provide various datasets, including:
Expand All @@ -17,8 +17,12 @@ provide various datasets, including:
- [Top 100 ASes querying each of the 10,000 highest ranked domain
names](https://developers.cloudflare.com/api/operations/radar_get__top_ases): Same as above, but
fetch AS numbers instead.

All rankings are based on one week of data.
Cloudflare Radar's top locations and top ASes are available for both domain names
and host names. The results likely account for all NS, A, and AAAA queries made to
Cloudflare's resolver. Since NS queries for host names make no sense, IYP links these
results to `DomainName` nodes.

## Graph representation

Expand Down
5 changes: 5 additions & 0 deletions iyp/crawlers/cloudflare/dns_top_ases.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Cloudflare Radar's top locations and top ASes are available for both domain names
# and host names. The results likely account for all NS, A, and AAAA queries made to
# Cloudflare's resolver. Since NS queries for host names make no sense, it seems
# more intuitive to link these results to DomainName nodes.

import argparse
import logging
import os
Expand Down
5 changes: 5 additions & 0 deletions iyp/crawlers/cloudflare/dns_top_locations.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Cloudflare Radar's top locations and top ASes are available for both domain names
# and host names. The results likely account for all NS, A, and AAAA queries made to
# Cloudflare's resolver. Since NS queries for host names make no sense, it seems
# more intuitive to link these results to DomainName nodes.

import argparse
import glob
import json
Expand Down
3 changes: 3 additions & 0 deletions iyp/crawlers/cloudflare/ranking_bucket.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
class Crawler(BaseCrawler):
# Base Crawler provides access to IYP via self.iyp and sets up a dictionary with the
# org/url/today's date in self.reference
#
# Cloudflare ranks second and third level domain names (not host names).
# See https://blog.cloudflare.com/radar-domain-rankings/

def run(self):
"""Fetch data and push to IYP."""
Expand Down
3 changes: 3 additions & 0 deletions iyp/crawlers/cloudflare/top100.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
class Crawler(BaseCrawler):
# Base Crawler provides access to IYP via self.iyp
# and sets up a dictionary with the org/url/today's date in self.reference
#
# Cloudflare ranks second and third level domain names (not host names).
# See https://blog.cloudflare.com/radar-domain-rankings/

def run(self):
"""Fetch data and push to IYP."""
Expand Down
20 changes: 14 additions & 6 deletions iyp/crawlers/openintel/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,34 @@ The OpenINTEL measurement platform captures daily snapshots of the state of larg
global Domain Name System (DNS) by running a number of forward and reverse DNS measurements.

While OpenINTEL runs measurements to a variety of domain names, IYP currently only fetches data for
the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella
the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella
top 1 million list since it combines rankings.
IYP also gets the list of authoritative name servers seen by OpenINTEL.

IYP uses only `A` queries to add IP resolution for DomainName and AuthoritativeNameServer nodes.

A crawler of mail servers is also implemented but not used as it creates a very large number
of links and this dataset is currently not requested/needed by anyone.

## Graph representation

IP resolution for popular domain names:
IP resolution for popular host names:

```Cypher
(:DomainName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
(:HostName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
```

IP resolution of authoritative name servers:

```Cypher
(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'})
(:IP {ip: '216.239.32.10'})-[:SERVE]->(:Service {name: 'DNS'})
```

Domain names managed by name servers:

```Cypher
(:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'})
(:DomainName {name: 'google.com'})-[:MANAGED_BY]->(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})
```

## Dependence

This crawler does not depend on other crawlers.
Loading

0 comments on commit 310340a

Please sign in to comment.