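"""Scrape r/UIUC submissions from the Pushshift API one day at a time and
count how many carry the "COVID-19" flair.

Outputs, written to DATA_DIR:
  - timestamped_post_count_data.csv    per-day totals and COVID-19 counts
  - timestamped_example_post_data.csv  up to 3 example posts per day
  - timestamped_example_post_data.json the same examples, keyed by timestamp
"""
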
from datetime import datetime
from time import sleep
import requests
import csv
import json


def epoch_time(time: datetime):
    t0 = datetime(1970, 1, 1)
    return (time - t0).total_seconds()
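
# Note: epoch_time treats its input as a naive UTC datetime; for
# timezone-aware datetimes, datetime.timestamp() is the safer equivalent.
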
def scrape(lower_time, upper_time):
    sleep(.3)  # Back off briefly to avoid Pushshift rate limits
    url = f"https://api.pushshift.io/reddit/search/submission/?subreddit=uiuc&after={lower_time}&before={upper_time}&sort_type=created_utc&sort=asc&fields=created_utc,link_flair_text,title,full_link&size=100"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        data: list = data["data"]  # The payload nests the results under the "data" key
        if len(data) == 100:
            # A full page means there may be more posts. Trigger another scrape
            # and append its results (this can happen multiple times recursively).
            last_timestamp = data[-1]["created_utc"]
            data += scrape(last_timestamp, upper_time)
        return data
    else:
        print(f"Error: {response.status_code}")
        if response.status_code == 429:
            # We were rate limited :(
            # Just try again, nudging the window forward a second
            return scrape(lower_time + 1, upper_time)
        return []  # On any other error, return no posts rather than None
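
# Illustrative usage (timestamps are made up, not from a real run):
#   posts = scrape(1596240000, 1596326400)  # one day's window, in epoch seconds
# Each element is a dict with the requested fields:
# created_utc, link_flair_text (may be absent), title, full_link.
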
SECONDS_PER_DAY = 86400
FIRST_DAY = datetime(2020, 8, 1)
LAST_DAY = datetime(2021, 2, 23)
DATA_DIR = "./reddit-data"
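
# Accumulators: one count row per day, plus up to three example COVID-19
# posts per day (kept as CSV rows and as a timestamp-keyed JSON mapping).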
count_result_lines = []
example_result_lines = []
example_json_data = {}
# Loop through each day in the range, scraping one day's window at a time
prev_timestamp = None
for timestamp in range(int(epoch_time(FIRST_DAY)), int(epoch_time(LAST_DAY)) + SECONDS_PER_DAY, SECONDS_PER_DAY):
    if prev_timestamp is None:
        prev_timestamp = timestamp
        continue
    # Get the day's posts
    data = scrape(prev_timestamp + 1, timestamp)
    filtered_covid_posts = [post for post in data if ("link_flair_text" in post and post["link_flair_text"] == "COVID-19")]
    # Get up to 3 examples
    num_examples = min(len(filtered_covid_posts), 3)
    example_json_data[prev_timestamp] = [[post["title"], post["full_link"]] for post in filtered_covid_posts[:num_examples]]
    example_result_lines += [[prev_timestamp, post["title"], post["full_link"]] for post in filtered_covid_posts[:num_examples]]
    count_result_lines.append([prev_timestamp, len(data), len(filtered_covid_posts)])
    print(count_result_lines[-1])
    prev_timestamp = timestamp
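
# Each count row is [day_start_epoch, total_posts, covid_flaired_posts];
# each example row is [day_start_epoch, title, full_link].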
# Save the results to CSV and JSON
with open(f"{DATA_DIR}/timestamped_post_count_data.csv", newline='', mode='w', encoding="utf-8") as data_file:
    writer = csv.writer(data_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerows(count_result_lines)
with open(f"{DATA_DIR}/timestamped_example_post_data.csv", newline='', mode='w', encoding="utf-8") as data_file:
    writer = csv.writer(data_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerows(example_result_lines)
with open(f"{DATA_DIR}/timestamped_example_post_data.json", mode='w', encoding="utf-8") as data_file:
    json.dump(example_json_data, data_file, indent=4)
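
# Reading the counts back later (a minimal sketch):
#   with open(f"{DATA_DIR}/timestamped_post_count_data.csv", newline='', encoding="utf-8") as f:
#       rows = list(csv.reader(f))  # each row: [day_start_epoch, total, covid]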