connection_tree_v2.py
import pickle
import random
import re
import sys

import requests
# rich's print replaces the builtin print so the [green]/[red]/[dark_orange] markup below renders
from rich import print
# regex patterns
# url_extract_pattern finds http(s) URLs anywhere inside a block of text;
# url_pattern is the same expression anchored with ^...$ to test whole strings.
# The *_0 variants match URLs without requiring a scheme and are not used below.
url_extract_pattern = r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)"
url_pattern = r"^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$"
url_pattern_0 = r"^[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$"
url_extract_pattern_0 = r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)"
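# Illustrative example (not part of the original script) of how the two patterns
# used below behave:
#   re.findall(url_extract_pattern, "see https://example.com/page for details")
#       -> ["https://example.com/page"]
#   re.match(url_pattern, "https://example.com/page")  -> match object (truthy)
#   re.match(url_pattern, "not a url")                 -> None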
# Main Variables
website = sys.argv[1] # website to be scanned
num = int(sys.argv[2]) # number of layers to scan
DATA = {}
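# Example invocation (illustrative): python connection_tree_v2.py https://example.com 2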
def get_proxy():
    """
    Gets a free proxy from 'proxyscrape'.
    returns : dict -> {"http": "<proxy ip:port>"}
    """
    res = requests.get("https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all")
    # the API returns one proxy per line; pick one at random
    return {"http": random.choice(res.text.split("\r\n"))}
def scan(website, max_, it, parent_node):
    """
    Scans a page for sub-URLs and adds them to the DATA dictionary.
    website     : str  - URL to scan
    max_        : int  - maximum number of layers
    it          : int  - current layer (depth) of the scan
    parent_node : dict - node that discovered URLs are attached to as children
    """
    if max_ != it:
        print(" " * it + "[green]----" + website + ":[/green]")
    else:
        # deepest layer reached: print the leaf and stop recursing
        print(" " * it + "[green]----" + website + "[/green]")
        return None
    # Gets a proxy; falls back to a direct connection if none is available
    try:
        proxies = get_proxy()
    except Exception:
        proxies = {}
    try:
        website_txt = requests.get(website, headers={"user-agent": "open crawler Mapper v 0.0.1"}, proxies=proxies).text
    except Exception:
        website_txt = ""
        print(f"[red] [-] '{website}' Website Couldn't Be Loaded")
    sub_urls = []
    for x in re.findall(url_extract_pattern, website_txt):
        if re.match(url_pattern, x):
            if ".onion" in x:
                # skips onion sites
                continue
            if x[-1] == "/" or x.endswith(".html") or x.split("/")[-1].isalnum():
                # tries to filter out non-crawlable URLs
                sub_urls.append(x)
    # removes all duplicates
    sub_urls = set(sub_urls)
    if not parent_node.get("children"):
        parent_node["children"] = []
    for e in sub_urls:
        child_node = {"name": e}
        parent_node["children"].append(child_node)
        scan(e, max_, it + 1, child_node)
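# After scan() finishes, parent_node holds a nested tree, e.g. (illustrative):
#   {"name": "https://example.com",
#    "children": [{"name": "https://example.com/about"}, ...]}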
print(f"[dark_orange]Scanning :{website} | No. of Layers : {num} [/dark_orange]\n")
DATA[website] = {"name": website}
scan(website, num, 1, DATA[website])
with open(f".{website}_{num}".replace("/","o"), "wb") as f:
f.write(pickle.dumps(DATA))
print(DATA)
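# Loading the saved tree back later (illustrative sketch, not part of the original script):
#   import pickle
#   with open(f".{website}_{num}".replace("/", "o"), "rb") as f:
#       tree = pickle.load(f)
#   print(tree)  # same nested dict that was printed above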