-
Notifications
You must be signed in to change notification settings - Fork 0
/
419_email_Scraping.py
71 lines (67 loc) · 2.75 KB
/
419_email_Scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 6 00:39:20 2023
@author: Lydia
"""
## Scrape the 419 scam emails from website "https://www.419scam.org/emails/index.htm"
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
INDEX_URL = "https://www.419scam.org/emails/index.htm"
ARCHIVE_ROOT = "https://www.419scam.org/emails/"
MONTH_FILTER = "2019-02"  # Change this text in demo
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever


def fetch_html(page_url):
    """Download *page_url* and return its body as text.

    Returns None (after printing a diagnostic) when the request raises a
    network error or the server answers with a status other than 200, so a
    single bad page does not abort the whole scrape.
    """
    try:
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print("Failed to retrieve the webpage:", page_url, exc)
        return None
    if response.status_code != 200:
        print("Failed to retrieve the webpage. Status code:", response.status_code)
        return None
    return response.text


def is_month_link(href):
    """Return True when *href* passes the archive-index link filter.

    A link is kept when it is non-empty, does not contain "419", contains
    "20" and is longer than 17 characters. NOTE(review): these thresholds
    presumably select the per-month index pages - confirm against the live
    index markup.
    """
    return bool(href) and "419" not in href and "20" in href and len(href) > 17


def collect_month_urls(index_url=INDEX_URL):
    """Return the absolute URLs of the links on the index page that pass
    :func:`is_month_link`.

    The raw index HTML is also written to ``urls.txt`` in the working
    directory. Returns an empty list when the index cannot be downloaded.
    """
    page = fetch_html(index_url)
    if page is None:
        return []
    # Explicit encoding: the platform default may not be able to encode the page.
    with open("urls.txt", "w", encoding="utf-8") as f:
        f.write(page)
    soup = BeautifulSoup(page, "html.parser")
    return [
        ARCHIVE_ROOT + anchor.get("href")
        for anchor in soup.find_all("a", href=is_month_link)
    ]


def parse_email_markup(markup):
    """Extract sender, subject and body from serialized blockquote markup.

    *markup* is the string form of the ``<blockquote>`` elements of one email
    page. It is split on ``<br/>``; a fragment is treated as a header only
    when the text before its first ``</b>`` names the header, so body lines
    that merely mention "Reply-To" or "Subject" stay in the body.

    Returns a dict with the keys ``"Sender"``, ``"Subject"`` and
    ``"content"``, or None when either header is missing.
    """
    record = {}
    body_parts = []
    for fragment in markup.split("<br/>"):
        pieces = fragment.split("</b>")
        label = pieces[0] if len(pieces) > 1 else ""
        if "Reply-To" in label:
            sender = pieces[1]
            if ";" in sender:
                # Keep the text between the first and second ";" minus its
                # last three characters. Presumably this unwraps an escaped
                # "&lt;address&gt;" - TODO confirm against real pages.
                sender = sender.split(";")[1][:-3]
            record["Sender"] = sender
        elif "Subject" in label:
            # Drop the single character that follows "</b>".
            record["Subject"] = pieces[1][1:]
        elif "<b>" not in fragment and "blockquote" not in fragment:
            body_parts.append(fragment)
    if "Sender" not in record or "Subject" not in record:
        return None
    record["content"] = "".join(body_parts)
    return record


def scrape_month(month_url):
    """Yield ``(email_url, record)`` for every parseable email on a month page.

    Links whose ``href`` starts with a digit are followed, resolved relative
    to *month_url*. Pages that fail to download, or that lack a Reply-To or
    Subject header, are reported and skipped.
    """
    page = fetch_html(month_url)
    if page is None:
        return
    soup = BeautifulSoup(page, "html.parser")
    links = soup.find_all("a", href=lambda href: href and href[0].isdigit())
    for link in links:
        email_url = urljoin(month_url, link.get("href"))
        print(email_url)
        email_page = fetch_html(email_url)
        if email_page is None:
            continue
        quotes = BeautifulSoup(email_page, "html.parser").find_all("blockquote")
        print(quotes)
        record = parse_email_markup(str(quotes))
        if record is None:
            print("Skipped (no Reply-To or Subject header):", email_url)
            continue
        yield email_url, record


if __name__ == "__main__":
    urls = collect_month_urls()
    scam = {}  # email URL -> parsed record
    dfBuilder = []
    for url in urls:
        if MONTH_FILTER not in url:
            continue
        for email, d in scrape_month(url):
            scam[email] = d
            dfBuilder.append([d["Sender"], d["Subject"], d["content"]])
    df = pd.DataFrame(dfBuilder, columns=["Sender", "Subject", "Content"])
    # To persist the result: df.to_csv("419_2019_02.csv")