-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
291 lines (231 loc) · 12.1 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
# Startup banner. It is emitted before the import statements that follow,
# presumably so the user sees feedback while the heavier packages load.
print("Hold on! Gathering required Files\n")
import os
import time
import random
import mail_system
import xpaths as x
import pandas as pd
import datetime as dt
import user_input as ui
import advanced_filter as af
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementNotInteractableException, StaleElementReferenceException
# ---- Collect the search criteria from the user ----
job_role = ui.role()
job_location = ui.geo()
job_post_time = ui.listing_time()

# Seniority labels the script recognises; a scraped label outside this list
# is handled separately when matching jobs later in the script.
experiences = [
    'Internship',
    'Entry level',
    'Associate',
    'Mid-senior level',
    'Director',
    'Executive',
    'Not applicable',
]
desired_experience_level = ui.job_level()
no_of_jobs = int(ui.list_length())
advanced_filtering = ui.filtering()

# The experience figure is only requested when advanced filtering is enabled.
if advanced_filtering:
    months_reply = input('Enter the number of months of experience you have: ')
    try:
        maximum_months_of_experience = int(months_reply)
    except ValueError:
        # A non-numeric answer ends the run; the user has to start again.
        print('Invalid Vlaue entered. Run the script again')
        exit()
# Speed factor used to scale every pause below.
# (Original note: change the value in line 94 of user_input.py if the
# internet is too slow.)
t = ui.internet_speed()

# Chrome options for the automated browser.
# NOTE(review): the original comment said this option detaches the browser so
# it stays open, but the value passed is False - confirm the intended setting.
browser_options = Options()
browser_options.add_experimental_option("detach", False)

# Pause lengths, scaled by the speed factor.
very_short_pause = 1 * t
short_pause = 2 * t
long_pause = 5 * t

# Make the bundled web driver discoverable. The directory has to be joined to
# the existing PATH with os.pathsep; appending it directly would fuse it with
# the last existing entry and the driver directory would never be searched.
os.environ['PATH'] += os.pathsep + r'E:/Coding/Projects/LinkedIn Job Scrapper/Selenium Driver'
driver = webdriver.Chrome(options=browser_options)  # Browser to be automated

# Target website
driver.maximize_window()  # Maximize the browser window
driver.get("https://www.linkedin.com/jobs/search?keywords=&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0")
# Dismiss the sign-in prompt when it appears. If it never becomes clickable
# within 15 seconds the script simply carries on.
prompt_wait = WebDriverWait(driver, 15)
prompt_is_clickable = ec.element_to_be_clickable((By.XPATH, x.prompt_close()))
try:
    close_sign_in_prompt = prompt_wait.until(prompt_is_clickable)
    close_sign_in_prompt.click()
except (TimeoutException, NoSuchElementException):
    pass
# SUBMITTING USER INPUT
# The steps below drive the live page strictly in order. The pauses between
# them use short_pause / long_pause, which are scaled by the speed factor t.
# -- Entering Title input
job_title_area = driver.find_element(By.XPATH, x.title_box())
job_title_area.send_keys(job_role) # Type the requested job title into the title box
time.sleep(short_pause)
# -- Entering Location input
location_area = driver.find_element(By.XPATH, x.location_box())
location_area.click() # Focus the box first so that its clear (cross) button becomes clickable
location_area.clear() # Remove whatever location is already in the box
time.sleep(short_pause) # Pausing to mitigate bot detection
location_area.send_keys(job_location) # Type the requested location
time.sleep(short_pause)
search_icon = driver.find_element(By.XPATH, x.search())
search_icon.click() # Submit the search with the values entered above
time.sleep(long_pause) # Give the site time to reload after the submission
# -- Selecting the job listing time
# ---- Find the "date posted" button among the filter buttons.
# The button is recognised by its caption, which shows one of these options.
match = ['Past 24 hours', 'Past week', 'Past month', 'Any time']
listing_time = None  # The filter button once found; None while still searching
filter_pos = 1  # Position of that button, reused later to build the option xpaths
for pos in range(1, 7):
    try:
        dummy_time = WebDriverWait(driver, 30).until(
            ec.element_to_be_clickable((By.XPATH, x.post_time(pos)))
        )
    except TimeoutException:
        # No clickable filter at this position: stop probing instead of
        # crashing with a traceback.
        break
    if dummy_time.text.strip().capitalize() in match:
        listing_time = dummy_time
        filter_pos = pos
        break

if listing_time is None:
    # Previously this case fell through to 'Null'.click() and died with an
    # AttributeError. Close the browser and stop with a readable message.
    print('Unable to locate the job listing time filter. Please check your internet connection and run the script again')
    driver.quit()
    raise SystemExit

listing_time.click()  # Expand the filter so its options can be chosen
time.sleep(short_pause)
# -- Selecting the user's listing-time option inside the expanded filter
desired_listing_time = None  # The matching option element, once found
select = None
networkError = True  # Stays True only if no option could be read at all
availableFilters = []  # Captions of the options that did not match
for s in range(1, 5):
    try:
        select = WebDriverWait(driver, 30).until(
            ec.element_to_be_clickable((By.XPATH, x.time_selection(filter_pos, s)))
        )
    except TimeoutException:
        continue
    networkError = False
    # NOTE: captions look like 'Any Time (24,458)', so keep the part before '('.
    option_label = select.text.split('(')[0].strip().capitalize()
    if option_label == job_post_time.strip():
        desired_listing_time = select
        break
    availableFilters.append(option_label)

if desired_listing_time is not None:
    desired_listing_time.click()  # Choose the desired option
    time.sleep(short_pause)
    time_done_button = driver.find_element(By.XPATH, x.time_done(filter_pos))
    time_done_button.click()  # Confirm with the done button
else:
    if networkError:
        print('Network problems. Please check your internet connection and run the script again')
    else:
        print(f'Selected job listing time filter is not availabe for the provided Job Title Keyword and Location. Availble Listing Times: {availableFilters}')
        print('Please run the script again and select other job listing time or use appropraite Keyword and Location')
    # Close the browser before stopping so no driver session is left behind.
    # SystemExit without a code keeps the same exit status as the old exit().
    driver.quit()
    raise SystemExit
# Advanced filtering on
def deep_filter(max_retries=3):
    """Decide whether the currently opened job passes the advanced filter.

    Expands the job description, reads its text and hands it to
    ``af.advanced_filter`` together with ``maximum_months_of_experience``.

    Args:
        max_retries: How many additional attempts are made when the page
            re-renders underneath us (stale element). Defaults to 3.

    Returns:
        True when the description is empty, otherwise whatever
        ``af.advanced_filter`` returns. False when the "show more" control
        never becomes clickable or when every retry hit a stale element.
    """
    # The original retried by calling itself without returning the result, so
    # a successful retry still yielded None; it also had no retry limit.
    for _ in range(max_retries + 1):
        try:
            show_more = WebDriverWait(driver, 30).until(
                ec.element_to_be_clickable((By.XPATH, x.show()))
            )
            show_more.click()
            job_description = driver.find_element(By.XPATH, x.description()).text
            if not job_description:
                return True
            return af.advanced_filter(job_description, maximum_months_of_experience)
        except TimeoutException:
            return False
        except StaleElementReferenceException:
            # Give the page time to settle before the next attempt.
            time.sleep(long_pause)
    return False
# Result columns: one independent list per field, filled in lockstep so that
# the same index in every list describes the same job.
company_name, job_title, location, experience_level, link = [], [], [], [], []
# Function to add one entry to the result columns
def add_job_detail():
    """Record the currently opened job in the result lists.

    Reads the company, title and location from the open job pane, takes the
    experience level from the module-level ``exp_level`` and the link from the
    module-level ``job`` element.

    Every field is read before anything is stored. If a lookup raises (for
    example ``NoSuchElementException``), none of the lists is modified, so
    they keep equal lengths and can still be combined into one table.
    """
    company = driver.find_element(By.XPATH, x.company()).text
    title = driver.find_element(By.XPATH, x.title()).text
    place = driver.find_element(By.XPATH, x.location()).text
    href = job.get_attribute("href")

    company_name.append(company)
    job_title.append(title)
    location.append(place)
    experience_level.append(exp_level)
    link.append(href)
# Accessing the job titles in the result list and reading each job's details


def timeout_recursion():
    """Return the experience-level text of the currently opened job.

    Waits for the level element in the detail pane. When it does not show up
    (timeout or stale element), a neighbouring job is clicked and the current
    one is clicked again to force the pane to reload, then the wait is
    repeated. Reads the module-level ``i`` (current position) and ``job``
    (current list element).

    Returns:
        The element text, or the string 'Null' once the attempts are used up.
        The function never returns None: the original only gave up when the
        counter was exactly equal to ``long_pause``, which is missed whenever
        ``long_pause`` (5 * t) is not a whole number of at least 2.
    """
    limit = 1
    while limit <= long_pause:  # Raise long_pause (via t) on slow networks
        try:
            xp = WebDriverWait(driver, 5).until(
                ec.element_to_be_clickable((By.XPATH, x.level()))
            )
            return xp.text
        except (TimeoutException, StaleElementReferenceException):
            # Use the previous job, or the next one when we are on the first.
            neighbour = i - 1 if i != 1 else i + 1
            prev_job = driver.find_element(By.XPATH, x.jobs(neighbour))
            prev_job.click()
            time.sleep(very_short_pause)
            job.click()
            time.sleep(very_short_pause)
            limit += 1
            if limit >= long_pause:
                print('Poor network connection. Unable to scan one job detail. Bypassing...')
                return 'Null'
    # Reached only when long_pause is below 1 and no attempt was made.
    print('Poor network connection. Unable to scan one job detail. Bypassing...')
    return 'Null'


i = 1  # Position of the job currently being scanned
unexpected_error = 0  # Guards against an endless loop of failed lookups
while len(company_name) < no_of_jobs:
    try:
        job = WebDriverWait(driver, 15).until(
            ec.element_to_be_clickable((By.XPATH, x.jobs(i)))
        )
        job.click()  # Open the job so its details load in the side pane

        # Experience level of the opened job
        try:
            exp_level = timeout_recursion()
        except TimeoutException:
            exp_level = 'Null'
            print('Poor network connection. Unable to scan one job detail. Bypassing...')

        # Matching the desired criteria
        normalized_level = exp_level.strip().capitalize()
        if normalized_level in desired_experience_level:
            unexpected_error = 0
            if advanced_filtering:
                if deep_filter():
                    time.sleep(very_short_pause)
                    add_job_detail()
                    print(f'{len(company_name)} jobs found from {i} scanned jobs.')
            else:
                time.sleep(very_short_pause)
                add_job_detail()
        elif (normalized_level not in experiences
              and 'Not applicable' in desired_experience_level
              and exp_level != 'Null'):
            # A level outside the known list counts as 'Not applicable'.
            unexpected_error = 0
            if advanced_filtering:
                if deep_filter():
                    add_job_detail()
                    print(f'{len(company_name)} jobs found from {i} scanned jobs.')
            else:
                add_job_detail()
        i += 1
    except (NoSuchElementException, TimeoutException):
        # Find out whether the list has ended by probing for a job further down.
        try:
            driver.find_element(By.XPATH, x.jobs(i + random.randint(5, 20)))
            # The probe succeeded, so the list continues: skip the current position.
            print('Unable to locate one Job Title, bypassing...')
            i += 1
            unexpected_error += 0.5
        except NoSuchElementException:
            # Nothing further down: try the "see more jobs" button.
            try:
                see_more_jobs = driver.find_element(By.XPATH, x.show_more())
                see_more_jobs.click()
                time.sleep(long_pause)
                i += 1  # Move on in case the next listing is still not interactable
                unexpected_error += 1  # Counts clicks that do not load more jobs
            except (ElementNotInteractableException, NoSuchElementException):
                # Button missing or not interactable: the list has probably
                # ended. A missing button used to raise out of the loop and
                # abort the run before anything was saved.
                print('Unable to locate one Job Title, bypassing...')
                unexpected_error += 1
                i += 1
    # Stop once too many consecutive problems have accumulated.
    if unexpected_error >= 15:
        break
# Scraping is over: tell the user, then turn the collected lists into a table.
print('All possible Job data scraped. Browser seesion will close now and data will be stored shortly')

# One CSV column per result list.
scraped_columns = {
    'Company Name': company_name,
    'Job Title': job_title,
    'Location': location,
    'Experience Level': experience_level,
    'Link': link,
}
df = pd.DataFrame(scraped_columns)

# UTF-8 encoded CSV kept in memory for the optional mail feature below.
csv_file = df.to_csv(index=False).encode('utf-8')

# Close the web browser at the end of the session.
driver.quit()

# Save the listing on the local device, named after today's date.
# (Comment this line out if you use the mail feature instead.)
df.to_csv(f'Job Listings {dt.date.today()}.csv', index=False)

# To send the listing by mail instead, uncomment the line below and set
# user_mail, passcode and recipent in mail_system.py.
# mail_system.mail_sub(csv_file)