-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
45 lines (38 loc) · 1.51 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# importing the libraries
import os

import requests
from bs4 import BeautifulSoup
# Scrape the LTH course listing page of each program below and dump every
# table cell, one value per line, into scraperData/html_data_<program>.txt.
# Each table row is followed by the grouping heading of its table and an
# 'End of row' marker line.

# Not all the programs, but most.
programs = ['A', 'B', 'BI', 'BME', 'C', 'D', 'E', 'F', 'I', 'K', 'L', 'N', 'M', 'MD', 'V', 'W', 'Pi']
# Url for 19/20 Courses; the program code is appended as the `prog` value.
url = 'https://kurser.lth.se/lot/?lasar=19_20&val=program&prog='
# Directory the per-program text files are written to.
outputDir = 'scraperData'

# Create the output directory up front so open() below does not fail on a
# fresh checkout where the folder is missing.
os.makedirs(outputDir, exist_ok=True)

for program in programs:
    # Make a GET request to fetch the raw HTML content. The timeout keeps an
    # unresponsive server from hanging the script, and raise_for_status()
    # stops an HTTP error page from being parsed and saved as course data.
    response = requests.get(url + program, timeout=30)
    response.raise_for_status()
    html_content = response.content

    # Parse the html content
    soup = BeautifulSoup(html_content, "lxml")

    # Find all course tables using the HTML class that is currently used for those tables
    tb = soup.find_all('table', class_='CourseListView border hover lighter_table_head zebra')

    # Get the different grouping categories for the tables, i.e. year 1, specialization X..
    courseGrouping = []
    for headerText in soup.find_all('h3'):
        groupingName = headerText.text.replace('\n', ' ').strip()
        courseGrouping.append(groupingName)
        print(groupingName)

    # Find all the course data and store it in a flat list.
    # NOTE(review): tables are paired with <h3> headings purely by position;
    # confirm the page always renders exactly one heading per course table.
    courseData = []
    for i, table in enumerate(tb):
        # Fall back to an empty grouping line instead of raising IndexError
        # when the page has more course tables than headings; this keeps the
        # number of lines written per row unchanged.
        grouping = courseGrouping[i] if i < len(courseGrouping) else ''
        for row in table.find_all('tr'):
            for cell in row.find_all('td'):
                courseData.append(cell.text.replace('\n', ' ').strip())
            courseData.append(grouping)
            courseData.append('End of row')

    fileName = outputDir + '/html_data_' + program + '.txt'
    # Output to textfile. An explicit UTF-8 encoding makes the output
    # independent of the platform default, which may be unable to encode
    # non-ASCII text. The with-statement closes the file, so no explicit
    # close() call is needed.
    with open(fileName, 'w', encoding='utf-8') as filehandler:
        filehandler.writelines('%s\n' % listitem for listitem in courseData)