Skip to content

Commit

Permalink
Code for Scraping Python blog page
Browse files Browse the repository at this point in the history
  • Loading branch information
MrPrajwal12 committed Jul 3, 2024
1 parent 475ef13 commit c030f12
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 0 deletions.
12 changes: 12 additions & 0 deletions SJEC_session1_CS106_Prajwal/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
FROM python:3.10.2-alpine3.15

# Run everything from the project directory; WORKDIR creates it when missing,
# so no separate `mkdir` layer is needed.
WORKDIR /root/workspace/src

# Install dependencies before copying the script so these layers stay cached
# when only the script changes. `beautifulsoup4` is the real distribution
# behind the `bs4` import; --no-cache-dir keeps pip's download cache out of
# the image.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir requests beautifulsoup4 html5lib psycopg2-binary

COPY ./python_web_scrape.py ./

# Runs `python python_web_scrape.py` by default; arguments passed to
# `docker run` replace the script name.
ENTRYPOINT ["python"]
CMD ["python_web_scrape.py"]
9 changes: 9 additions & 0 deletions SJEC_session1_CS106_Prajwal/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Services are declared under the top-level `services` key (Compose
# Specification layout).
services:
  psql-db:
    image: 'postgres:14'
    container_name: psql-db
    environment:
      - PGPASSWORD=123456
      - POSTGRES_USER=postgres
      - POSTGRES_PASSWORD=123456
      # python_web_scrape.py connects to the "webdemo" database; without this
      # the image only creates a database named after POSTGRES_USER.
      - POSTGRES_DB=webdemo
    ports:
      # Host port 5434 -> container port 5432 (the scraper uses port 5434).
      - '5434:5432'
121 changes: 121 additions & 0 deletions SJEC_session1_CS106_Prajwal/python_web_scrape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import requests
from bs4 import BeautifulSoup
import re
import psycopg2
from psycopg2 import Error

# Front page of the blog; main() requests this URL first.
url = 'https://blog.python.org/'

def create_connection(db_name, db_user, db_password, db_host, db_port):
    """Open a PostgreSQL connection and report the outcome on stdout.

    Returns the psycopg2 connection on success, or ``None`` when
    ``psycopg2.connect`` raises a psycopg2 ``Error``.
    """
    settings = {
        "database": db_name,
        "user": db_user,
        "password": db_password,
        "host": db_host,
        "port": db_port,
    }
    try:
        conn = psycopg2.connect(**settings)
    except Error as exc:
        print(f"The error '{exc}' occurred")
        return None

    print("Connection to PostgreSQL DB successful")
    return conn

def execute_query(connection, data):
    """Insert one article row into ``python_blog_articles``.

    Args:
        connection: Open psycopg2 connection.
        data: Sequence ``(date, title, body, author)`` bound to the four
            placeholders of the INSERT statement.

    A psycopg2 ``Error`` raised while executing or committing is printed
    instead of propagated. The transaction is rolled back in that case so
    the connection remains usable for the next insert, and the cursor is
    always closed.
    """
    query = """
        INSERT INTO python_blog_articles (date, title, body, author)
        VALUES (%s, %s, %s, %s)
    """
    cursor = connection.cursor()
    try:
        cursor.execute(query, data)
        connection.commit()
        print("Query executed successfully")
    except Error as e:
        print(f"The error '{e}' occurred")
        # PostgreSQL refuses further statements inside a failed transaction
        # until it is rolled back; without this every later insert fails too.
        connection.rollback()
    finally:
        cursor.close()

def create_table(connection):
    """Create the ``python_blog_articles`` table if it does not exist yet.

    Args:
        connection: Open psycopg2 connection.

    A psycopg2 ``Error`` raised while executing or committing is printed
    instead of propagated. The transaction is rolled back in that case so
    later statements on the same connection are not rejected, and the
    cursor is always closed.
    """
    create_table_query = """
        CREATE TABLE IF NOT EXISTS python_blog_articles (
            id SERIAL PRIMARY KEY,
            date VARCHAR(100),
            title TEXT,
            body TEXT,
            author VARCHAR(100)
        );
    """
    cursor = connection.cursor()
    try:
        cursor.execute(create_table_query)
        connection.commit()
        print("Table created successfully or already exists")
    except Error as e:
        print(f"The error '{e}' occurred")
        # Leave the connection out of the aborted-transaction state so the
        # inserts that follow can still run.
        connection.rollback()
    finally:
        cursor.close()

def process_page(soup, date, titletext, bodytext, author):
    """Append the articles found on one parsed blog page to the output lists.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` / ``find_all``
            API.
        date, titletext, bodytext, author: Lists extended in place.

    Exactly one entry is appended to *each* list for every ``post-outer``
    div, so the four lists always stay the same length and index ``i``
    refers to the same article in all of them. The date is taken from the
    header of the enclosing ``date-outer`` div and repeated for every post
    in that div. A field whose element is missing is stored as ``''``.
    """
    for day_block in soup.find_all('div', class_='date-outer'):
        date_header = day_block.find('h2', class_='date-header')
        date_span = date_header.find('span') if date_header is not None else None
        date_text = date_span.get_text(strip=True) if date_span is not None else ''

        for post in day_block.find_all('div', class_='post-outer'):
            title_head = post.find('h3', class_='post-title entry-title')
            title_text = title_head.text.strip() if title_head is not None else ''

            content_div = post.find('div', class_='post-body entry-content')
            if content_div is not None:
                body_text = ' '.join(
                    p.text.strip() for p in content_div.find_all('p')
                )
            else:
                body_text = ''

            footer_head = post.find('div', class_='post-footer')
            author_span = (
                footer_head.find('span', class_='post-author vcard')
                if footer_head is not None
                else None
            )
            author_text = author_span.text.strip() if author_span is not None else ''

            # Append all four fields together so the lists never drift apart.
            date.append(date_text)
            titletext.append(title_text)
            bodytext.append(body_text)
            author.append(author_text)

def main():
    """Scrape the Python blog and store the collected articles in PostgreSQL.

    Starting at ``url``, pages are fetched and parsed with ``process_page``,
    following the "Older Posts" link until at least ``min_articles`` titles
    have been collected or no such link is found. The table is then created
    if needed and one row is inserted per article. The connection is always
    closed before returning.

    A failed HTTP request (network error, timeout, or error status) is
    printed and stops pagination; whatever was collected so far is still
    stored.
    """
    db_name = 'webdemo'
    db_user = 'postgres'
    db_password = '123456'
    db_host = 'localhost'
    db_port = '5434'

    min_articles = 50
    request_timeout = 30  # seconds; without a timeout requests may block forever

    connection = create_connection(db_name, db_user, db_password, db_host, db_port)
    if connection is None:
        # create_connection has already printed the reason.
        return

    older_posts_pattern = re.compile(r'Older Posts', re.IGNORECASE)

    try:
        date = []
        titletext = []
        bodytext = []
        author = []

        page_url = url
        while page_url:
            try:
                res = requests.get(page_url, timeout=request_timeout)
                # Do not parse an HTTP error page as if it were blog content.
                res.raise_for_status()
            except requests.RequestException as e:
                print(f"Error: {e}")
                break

            soup = BeautifulSoup(res.content, 'html5lib')
            process_page(soup, date, titletext, bodytext, author)
            if len(titletext) >= min_articles:
                break

            older_posts_link = soup.find('a', string=older_posts_pattern)
            page_url = older_posts_link['href'] if older_posts_link is not None else None

        create_table(connection)
        # zip stops at the shortest list, so unequal list lengths can never
        # raise IndexError here.
        for data in zip(date, titletext, bodytext, author):
            execute_query(connection, data)

    except Error as e:
        print(f"Error: {e}")

    finally:
        connection.close()
        print("PostgreSQL connection is closed")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions SJEC_session1_CS106_Prajwal/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Pinned third-party packages used by python_web_scrape.py.
psycopg2-binary==2.9.3
beautifulsoup4==4.11.1
requests==2.27.1
html5lib==1.1

0 comments on commit c030f12

Please sign in to comment.