app.py
import requests
import functools
import shutil
import codecs
import sys
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
# URL of the web page you want to extract data from
url = "https://google.com"
use_tor_network = False
if len(sys.argv) > 1: url = sys.argv[1]
output_folder = urlparse(url).netloc
# initialize a session
session = requests.session()
if use_tor_network:
    session.request = functools.partial(session.request, timeout=30)
    session.proxies = {'http': 'socks5h://localhost:9050',
                       'https': 'socks5h://localhost:9050'}
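# Note: the 'socks5h' scheme routes DNS resolution through the proxy as well
# (plain 'socks5' would resolve hostnames locally, leaking DNS lookups outside
# Tor). SOCKS support in requests requires PySocks: pip install requests[socks]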
# define workspace from script location
workspace = os.path.dirname(os.path.realpath(__file__))
class Extractor:
    def __init__(self, url):
        self.url = url
        content = self.get_page_content(url)
        if content is None:
            sys.exit(f"Failed to fetch {url}")
        self.soup = BeautifulSoup(content, "html.parser")
        self.scraped_urls = self.scrap_all_urls()

    def run(self):
        self.save_files(self.scraped_urls)
        self.save_html()

    def get_page_content(self, url):
        try:
            response = session.get(url)
            response.encoding = 'utf-8'
            return response.text
        except requests.RequestException:
            return None
    # get the script files
    def scrap_scripts(self):
        script_urls = []
        for script_tag in self.soup.find_all("script"):
            # skip tags without a 'src' attribute (inline scripts)
            script_url = script_tag.attrs.get("src")
            if not script_url: continue
            if not script_url.startswith('http'): script_url = urljoin(self.url, script_url)
            new_url = self.url_to_local_path(script_url, keepQuery=True)
            if new_url:
                script_tag['src'] = new_url
                script_urls.append(script_url.split('?')[0])
        # deduplicate while preserving order
        return list(dict.fromkeys(script_urls))
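    # the scrap_*_attr methods below all follow the same pattern: resolve the
    # attribute to an absolute URL, rewrite the tag to point at a local path,
    # and collect the original URL so the file can be downloaded later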
    # get the URLs referenced by tag attributes
    def scrap_form_attr(self):
        urls = []
        for form_tag in self.soup.find_all("form"):
            # if the tag has the attribute 'action'
            form_url = form_tag.attrs.get("action")
            if form_url:
                if not form_url.startswith('http'): form_url = urljoin(self.url, form_url)
                new_url = self.url_to_local_path(form_url, keepQuery=True)
                if new_url:
                    form_tag['action'] = new_url
                    urls.append(form_url.split('?')[0])
        return list(dict.fromkeys(urls))
    def scrap_a_attr(self):
        urls = []
        for link_tag in self.soup.find_all('a'):
            # if the tag has the attribute 'href'
            link_url = link_tag.attrs.get('href')
            if link_url:
                if not link_url.startswith('http'): link_url = urljoin(self.url, link_url)
                new_url = self.url_to_local_path(link_url, keepQuery=True)
                if new_url:
                    link_tag['href'] = new_url
                    urls.append(link_url.split('?')[0])
        return list(dict.fromkeys(urls))
    def scrap_img_attr(self):
        urls = []
        for img_tag in self.soup.find_all('img'):
            # if the tag has the attribute 'src'
            img_url = img_tag.attrs.get('src')
            if img_url:
                if not img_url.startswith('http'): img_url = urljoin(self.url, img_url)
                new_url = self.url_to_local_path(img_url, keepQuery=True)
                if new_url:
                    img_tag['src'] = new_url
                    urls.append(img_url.split('?')[0])
        return list(dict.fromkeys(urls))
    def scrap_link_attr(self):
        urls = []
        for link_tag in self.soup.find_all('link'):
            # if the tag has the attribute 'href'
            link_url = link_tag.attrs.get('href')
            if link_url:
                if not link_url.startswith('http'): link_url = urljoin(self.url, link_url)
                new_url = self.url_to_local_path(link_url, keepQuery=True)
                if new_url:
                    link_tag['href'] = new_url
                    urls.append(link_url.split('?')[0])
        return list(dict.fromkeys(urls))
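    # extract navigation targets from inline handlers of the form
    # onclick="location.href='/path'"; this is plain string matching,
    # not a JavaScript parser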
    def scrap_btn_attr(self):
        urls = []
        for button_tag in self.soup.find_all('button'):
            button_url = button_tag.attrs.get('onclick')
            if not button_url: continue
            button_url = button_url.replace(' ', '')
            if 'location.href=' not in button_url: continue
            button_url = button_url[button_url.find('location.href='):].replace('location.href=', '')
            button_url = button_url.replace('\'', '').replace('"', '').replace('`', '')
            if button_url.startswith('/'):
                button_url = urljoin(self.url, button_url)
                new_url = self.url_to_local_path(button_url, keepQuery=True)
                if new_url:
                    button_tag['onclick'] = new_url
                    urls.append(button_url.split('?')[0])
        return list(dict.fromkeys(urls))
    # get assets (img and more)
    def scrap_assets(self):
        assets_urls = []
        form_attr = self.scrap_form_attr()
        a_attr = self.scrap_a_attr()
        img_attr = self.scrap_img_attr()
        link_attr = self.scrap_link_attr()
        btn_attr = self.scrap_btn_attr()
        if form_attr: assets_urls = list(set(assets_urls + form_attr))
        if a_attr: assets_urls = list(set(assets_urls + a_attr))
        if img_attr: assets_urls = list(set(assets_urls + img_attr))
        if link_attr: assets_urls = list(set(assets_urls + link_attr))
        if btn_attr: assets_urls = list(set(assets_urls + btn_attr))
        return assets_urls
    # scrape every URL found on the page
    def scrap_all_urls(self):
        urls = []
        urls.extend(self.scrap_scripts())
        urls.extend(self.scrap_assets())
        return list(dict.fromkeys(urls))
    # convert a URL into a local relative path
    def url_to_local_path(self, url, keepQuery=False):
        try:
            new_url = urlparse(url).path
            query = urlparse(url).query
            if keepQuery and query: new_url += '?' + query
            # drop the leading slash so the path is relative
            if new_url[0] in ('/', '\\'): new_url = new_url[1:]
        except IndexError:
            # empty path (e.g. a bare domain): no local equivalent
            return None
        return new_url
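    # e.g. url_to_local_path('https://example.com/js/app.js?v=2', keepQuery=True)
    # returns 'js/app.js?v=2'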
    # download a file from a URL
    def download_file(self, url, output_path):
        # remove the query string from the URL
        url = url.split('?')[0]
        file_name = url.split('/')[-1]
        if len(file_name) == 0: return False
        # create the output directory if it doesn't exist yet
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # get the file content and save it
        response = session.get(url)
        with open(output_path, "wb") as file:
            file.write(response.content)
        print(f"Downloaded {file_name} to {os.path.relpath(output_path)}")
        return True
    def save_files(self, urls):
        # start from a clean output folder
        shutil.rmtree(os.path.join(workspace, output_folder), ignore_errors=True)
        for url in urls:
            output_path = self.url_to_local_path(url, keepQuery=False)
            if not output_path: continue
            output_path = os.path.join(workspace, output_folder, output_path)
            self.download_file(url, output_path)
        return True
    # save the HTML file
    def save_html(self):
        output_path = os.path.join(workspace, output_folder, 'index.html')
        prettyHTML = self.soup.prettify()
        with codecs.open(output_path, 'w', 'utf-8') as file:
            file.write(prettyHTML)
        print(f"Saved index.html to {os.path.relpath(output_path)}")
        return True
print(f"Extracting files from {url}\n")
extractor = Extractor(url)
extractor.run()
print(f"\nTotal extracted files: {len(extractor.scraped_urls)}")