-
Notifications
You must be signed in to change notification settings - Fork 16
/
scrapping_coursera.py
83 lines (57 loc) · 3.1 KB
/
scrapping_coursera.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# -*- coding: utf-8 -*-
"""scrapping_coursera.ipynb

Scrapes course metadata (title, organization, certificate type, rating,
difficulty, enrollment) from the Coursera catalogue pages and writes the
result to a CSV file for later analysis.

Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1VUrzrnvtBcqlQ6OtaShGTbAuzlsttH9Z
"""
from bs4 import BeautifulSoup
import requests
# Required third-party libraries: requests for HTTP, BeautifulSoup for HTML parsing.
response = requests.get("https://www.coursera.org/courses")
html_soup = BeautifulSoup(response.content, 'html.parser')
# Fetch the catalogue landing page and parse the response body with the
# built-in 'html.parser' backend.
url = html_soup.find_all(href=True)
# Collect every element carrying an href attribute (i.e. all links).
# NOTE(review): `url` is never read again below — candidate for removal.
def auto_Scrapper(html_tag, course_case, pages=100, per_page=9):
    """Scrape the text of every ``html_tag`` element from Coursera
    catalogue pages and append it to ``course_case``.

    Parameters
    ----------
    html_tag : str
        Tag name handed to BeautifulSoup.find_all (e.g. ``'h2'``).
    course_case : list
        Output list mutated in place; matched element texts are appended
        in page order.
    pages : int, optional
        Exclusive upper bound on the page number; pages 1..pages-1 are
        fetched (default preserves the original 99-page sweep).
    per_page : int, optional
        Maximum matches kept per page (default preserves the original 9).
    """
    base = "https://www.coursera.org/courses?page={}&index=prod_all_products_term_optimization"
    for page_no in range(1, pages):
        try:
            # Timeout + status check: the original could hang forever on a
            # stalled connection and silently parsed error pages.
            page = requests.get(base.format(page_no), timeout=30)
            page.raise_for_status()
        except requests.RequestException:
            # Best effort: skip a page that fails rather than abort the run.
            continue
        soup = BeautifulSoup(page.content, 'html.parser')
        # Hoist find_all out of the inner loop: the original re-searched the
        # whole document once per element (O(n^2)) and raised IndexError
        # whenever a page held fewer than `per_page` matches.
        for element in soup.find_all(html_tag)[:per_page]:
            course_case.append(element.get_text())
def auto_Scrapper_Class(html_tag, course_case, tag_class, pages=100, per_page=9):
    """Scrape the text of every ``html_tag`` element bearing CSS class
    ``tag_class`` from Coursera catalogue pages, appending to ``course_case``.

    Parameters
    ----------
    html_tag : str
        Tag name handed to BeautifulSoup.find_all (e.g. ``'span'``).
    course_case : list
        Output list mutated in place; matched element texts are appended
        in page order.
    tag_class : str
        Value matched against the element's ``class`` attribute.
    pages : int, optional
        Exclusive upper bound on the page number; pages 1..pages-1 are
        fetched (default preserves the original 99-page sweep).
    per_page : int, optional
        Maximum matches kept per page (default preserves the original 9).
    """
    base = "https://www.coursera.org/courses?page={}&index=prod_all_products_term_optimization"
    for page_no in range(1, pages):
        try:
            # Timeout + status check: the original could hang forever on a
            # stalled connection and silently parsed error pages.
            page = requests.get(base.format(page_no), timeout=30)
            page.raise_for_status()
        except requests.RequestException:
            # Best effort: skip a page that fails rather than abort the run.
            continue
        soup = BeautifulSoup(page.content, 'html.parser')
        # Hoist find_all out of the inner loop: the original re-searched the
        # whole document once per element (O(n^2)) and raised IndexError
        # whenever a page held fewer than `per_page` matches.
        for element in soup.find_all(html_tag, class_=tag_class)[:per_page]:
            course_case.append(element.get_text())
course_title = []
course_organization = []
course_Certificate_type = []
course_rating = []
course_difficulty = []
course_students_enrolled = []
# One accumulator list per DataFrame column; each scraper call below
# appends its matches to the corresponding list in page order, so all
# six lists should stay index-aligned.
auto_Scrapper('h2',course_title)
auto_Scrapper_Class('span',course_organization,'partner-name m-b-1s')
auto_Scrapper_Class('div',course_Certificate_type,'_jen3vs _1d8rgfy3')
auto_Scrapper_Class('span',course_rating,'ratings-text')
auto_Scrapper_Class('span',course_difficulty,'difficulty')
auto_Scrapper_Class('span',course_students_enrolled,'enrollment-number')
# NOTE(review): the tag/class selectors above are hard-coded against a
# particular Coursera page layout (obfuscated classes like '_jen3vs'
# suggest generated CSS) — verify they still match the live site before
# relying on the output.
import pandas as pd

# Assemble the six scraped columns into a single table. The lists were
# filled in lockstep by the scraper calls above, so rows line up by index.
scraped_columns = {
    'course_title': course_title,
    'course_organization': course_organization,
    'course_Certificate_type': course_Certificate_type,
    'course_rating': course_rating,
    'course_difficulty': course_difficulty,
    'course_students_enrolled': course_students_enrolled,
}
courses_df = pd.DataFrame(scraped_columns)

# Order rows alphabetically by title for a stable, readable listing.
courses_df = courses_df.sort_values('course_title')

# Persist the result so the analysis stage can load it from disk.
courses_df.to_csv('UCoursera_Courses.csv')