run.py
# Tested on: Python 3.7.3 (macOS)
import os.path
import time

import requests
import schedule
import urllib3
from eliot import start_action, to_file

# Suppress the InsecureRequestWarning triggered by verify=False below
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Overwrites the log file on every program start.
# Replace "w+" with "a+" if you want to append instead.
to_file(open("log", "w+"))
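# Note: the log is newline-delimited JSON. If the eliot package's bundled
# eliot-prettyprint tool is on your PATH (an assumption about your setup),
# it can render the log in a readable form:
#
#   $ eliot-prettyprint < log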
# Exit early if urls.txt is missing
if not os.path.isfile('urls.txt'):
    print('[-] ERR: urls.txt missing\n')
    exit(-1)
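# urls.txt is expected to hold one URL per line, scheme included, e.g.
# (example.com/example.org are placeholder domains):
#
#   https://example.com
#   http://example.org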
def do_crawl():
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/80.0.3987.87 Safari/537.36')
    headers = {'User-Agent': user_agent}
    # Read urls.txt, one URL per line; strip newlines and skip blank lines
    with open('urls.txt', 'r') as fObj:
        urls = [line.strip() for line in fObj if line.strip()]
    # Crawl: resolve each root domain via HEAD, then GET the final URL
    with start_action(action_type="domains_iterator"):
        print('[+] Exec on: %s' % time.strftime("%c"))
        for url in urls:
            try:
                with start_action(action_type="root_domain", source=url):
                    # HEAD does not follow redirects by default, so enable it
                    # explicitly; otherwise response.url is just the input URL
                    response = requests.head(url, verify=False, timeout=10,
                                             headers=headers,
                                             allow_redirects=True)
                    final_url = response.url
                    with start_action(action_type="redirect_url", target=final_url):
                        http_req = requests.get(final_url, verify=False,
                                                timeout=10, headers=headers)
                        http_req.raise_for_status()
            except requests.exceptions.RequestException:
                # Connection errors, HTTP errors and timeouts end the eliot
                # action as failed; swallow them and move on to the next URL
                pass
            except Exception:
                pass
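# For a one-off test run, call the crawler directly here instead of
# waiting for the scheduler below:
#
#   do_crawl()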
# Weekday schedule: 14 runs per day, 40 minutes apart, 08:10 through 16:50
RUN_TIMES = ["08:10", "08:50", "09:30", "10:10", "10:50", "11:30", "12:10",
             "12:50", "13:30", "14:10", "14:50", "15:30", "16:10", "16:50"]
for weekday in ("monday", "tuesday", "wednesday", "thursday", "friday"):
    for run_time in RUN_TIMES:
        getattr(schedule.every(), weekday).at(run_time).do(do_crawl)
# Run pending jobs indefinitely (until stopped explicitly)
while True:
    schedule.run_pending()
    time.sleep(1)
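# To keep the scheduler alive after closing the terminal, one option
# (not part of the original setup) is to run the script detached:
#
#   $ nohup python3 run.py &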