-
Notifications
You must be signed in to change notification settings - Fork 16
/
scrapping_coursera.py
83 lines (57 loc) · 3.1 KB
/
scrapping_coursera.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# -*- coding: utf-8 -*-
"""scrapping_coursera.ipynb

Scrapes course metadata (title, organization, certificate type, rating,
difficulty, enrollment) from the Coursera catalogue pages and writes the
result to a CSV file for later analysis.

Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1VUrzrnvtBcqlQ6OtaShGTbAuzlsttH9Z
"""
from bs4 import BeautifulSoup
import requests
# Required third-party libraries: requests for HTTP, BeautifulSoup for HTML parsing.
response = requests.get("https://www.coursera.org/courses")
html_soup = BeautifulSoup(response.content, 'html.parser')
# Fetch the catalogue landing page and parse the response body with the
# built-in 'html.parser' backend.
url = html_soup.find_all(href=True)
# Collect every element carrying an href attribute (i.e. all links).
# NOTE(review): `url` is never read again below — candidate for removal.
def auto_Scrapper(html_tag, course_case, pages=100, per_page=9):
    """Scrape the text of every ``html_tag`` element from Coursera
    catalogue pages and append it to ``course_case``.

    Parameters
    ----------
    html_tag : str
        Tag name handed to BeautifulSoup.find_all (e.g. ``'h2'``).
    course_case : list
        Output list mutated in place; matched element texts are appended
        in page order.
    pages : int, optional
        Exclusive upper bound on the page number; pages 1..pages-1 are
        fetched (default preserves the original 99-page sweep).
    per_page : int, optional
        Maximum matches kept per page (default preserves the original 9).
    """
    base = "https://www.coursera.org/courses?page={}&index=prod_all_products_term_optimization"
    for page_no in range(1, pages):
        try:
            # Timeout + status check: the original could hang forever on a
            # stalled connection and silently parsed error pages.
            page = requests.get(base.format(page_no), timeout=30)
            page.raise_for_status()
        except requests.RequestException:
            # Best effort: skip a page that fails rather than abort the run.
            continue
        soup = BeautifulSoup(page.content, 'html.parser')
        # Hoist find_all out of the inner loop: the original re-searched the
        # whole document once per element (O(n^2)) and raised IndexError
        # whenever a page held fewer than `per_page` matches.
        for element in soup.find_all(html_tag)[:per_page]:
            course_case.append(element.get_text())
def auto_Scrapper_Class(html_tag, course_case, tag_class, pages=100, per_page=9):
    """Scrape the text of every ``html_tag`` element bearing CSS class
    ``tag_class`` from Coursera catalogue pages, appending to ``course_case``.

    Parameters
    ----------
    html_tag : str
        Tag name handed to BeautifulSoup.find_all (e.g. ``'span'``).
    course_case : list
        Output list mutated in place; matched element texts are appended
        in page order.
    tag_class : str
        Value matched against the element's ``class`` attribute.
    pages : int, optional
        Exclusive upper bound on the page number; pages 1..pages-1 are
        fetched (default preserves the original 99-page sweep).
    per_page : int, optional
        Maximum matches kept per page (default preserves the original 9).
    """
    base = "https://www.coursera.org/courses?page={}&index=prod_all_products_term_optimization"
    for page_no in range(1, pages):
        try:
            # Timeout + status check: the original could hang forever on a
            # stalled connection and silently parsed error pages.
            page = requests.get(base.format(page_no), timeout=30)
            page.raise_for_status()
        except requests.RequestException:
            # Best effort: skip a page that fails rather than abort the run.
            continue
        soup = BeautifulSoup(page.content, 'html.parser')
        # Hoist find_all out of the inner loop: the original re-searched the
        # whole document once per element (O(n^2)) and raised IndexError
        # whenever a page held fewer than `per_page` matches.
        for element in soup.find_all(html_tag, class_=tag_class)[:per_page]:
            course_case.append(element.get_text())
course_title = []
course_organization = []
course_Certificate_type = []
course_rating = []
course_difficulty = []
course_students_enrolled = []
# One accumulator list per DataFrame column; each scraper call below
# appends its matches to the corresponding list in page order, so all
# six lists should stay index-aligned.
auto_Scrapper('h2',course_title)
auto_Scrapper_Class('span',course_organization,'partner-name m-b-1s')
auto_Scrapper_Class('div',course_Certificate_type,'_jen3vs _1d8rgfy3')
auto_Scrapper_Class('span',course_rating,'ratings-text')
auto_Scrapper_Class('span',course_difficulty,'difficulty')
auto_Scrapper_Class('span',course_students_enrolled,'enrollment-number')
# NOTE(review): the tag/class selectors above are hard-coded against a
# particular Coursera page layout (obfuscated classes like '_jen3vs'
# suggest generated CSS) — verify they still match the live site before
# relying on the output.
import pandas as pd

# Assemble the six scraped columns into a single table. The lists were
# filled in lockstep by the scraper calls above, so rows line up by index.
scraped_columns = {
    'course_title': course_title,
    'course_organization': course_organization,
    'course_Certificate_type': course_Certificate_type,
    'course_rating': course_rating,
    'course_difficulty': course_difficulty,
    'course_students_enrolled': course_students_enrolled,
}
courses_df = pd.DataFrame(scraped_columns)

# Order rows alphabetically by title for a stable, readable listing.
courses_df = courses_df.sort_values('course_title')

# Persist the result so the analysis stage can load it from disk.
courses_df.to_csv('UCoursera_Courses.csv')