AHA-selenium.py
from selenium import webdriver  # drives the browser
from selenium.webdriver.support.ui import Select  # for <select> menus (not used below)
from bs4 import BeautifulSoup  # to parse HTML
import csv  # to write the CSV
import pandas as pd  # to inspect the CSV afterwards
import time
import os
import random
import requests
driver = webdriver.Chrome()
driver.get("http://careers.historians.org/jobs/?page=1")

base_url = 'http://careers.historians.org'
all_rows = []
pages = ["http://careers.historians.org/jobs/?page=1",
         "http://careers.historians.org/jobs/?page=2"]
for p in pages:
    driver.get(p)
    soup = BeautifulSoup(driver.page_source, 'html5lib')
    rows = soup.find_all('div', {'class': 'bti-ui-job-detail-container'})
    for r in rows:
        # Summary fields from the listing page
        title = r.find('a').text.strip()
        link = base_url + r.find('a')['href']
        employer = r.find(
            'div', {'class': 'bti-ui-job-result-detail-employer'}).text.strip()
        location = r.find(
            'div', {'class': 'bti-ui-job-result-detail-location'}).text.strip()
        date_posted = r.find(
            'div', {'class': 'bti-ui-job-result-detail-age'}).text.strip()
        # Visit the individual job posting for the full details
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html5lib')
        try:
            job_description = soup.find(
                'div', {'class': 'bti-jd-description'}).text.strip()
            details = soup.find('div', {'class': 'bti-jd-details-container'})
            details_titles = [
                x.text.replace(':', '').lower().strip()
                for x in details.find_all('div', {'class': 'bti-jd-detail-title'})]
            details_text = [
                x.text.strip()
                for x in details.find_all('div', {'class': 'bti-jd-detail-text'})]
            # Pair up detail titles and values, normalizing a couple of labels
            details_dict = {}
            for i in range(len(details_titles)):
                t = details_titles[i]
                if 'categories' in t:
                    t = 'category'
                elif 'required' in t:
                    t = 'preferred education'
                details_dict[t] = details_text[i]
            details_dict['title'] = title
            details_dict['link'] = link
            details_dict['employer'] = employer
            details_dict['location'] = location
            details_dict['date_posted'] = date_posted
            details_dict['job_description'] = job_description
            try:
                details_dict['employer_about'] = soup.find(
                    'div', {'class': 'bti-jd-employer-info'}).text.strip()
            except AttributeError:
                details_dict['employer_about'] = ''
            all_rows.append(details_dict)
        except Exception:
            # Skip postings whose detail page doesn't match the expected layout
            pass
        time.sleep(1)  # be polite: pause between requests
header = ["title",
"employer",
"location",
"posted",
"date_posted",
"primary field",
"category",
"preferred education",
"salary",
"type",
"employment type",
"job_description",
"employer_about",
"link"
]
with open('AHA-data.csv', 'w', newline='') as f:
    # extrasaction='ignore' drops any detail field not listed in the header
    w = csv.DictWriter(f, header, extrasaction='ignore')
    w.writeheader()
    w.writerows(all_rows)

driver.quit()  # close the browser when the scrape is done
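
# Optional follow-up, since pandas is imported above to inspect the CSV:
# a minimal sketch of reading the output back and eyeballing a few columns.
# The column names come from the `header` list; nothing here is required
# by the scrape itself.
df = pd.read_csv('AHA-data.csv')
print(df.shape)  # postings scraped x columns
print(df[['title', 'employer', 'location', 'date_posted']].head())
print(df['employer'].value_counts().head(10))  # most frequent employers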