-
Notifications
You must be signed in to change notification settings - Fork 0
/
Reddit_functions.py
112 lines (86 loc) · 4.59 KB
/
Reddit_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import requests
import datetime
from anytree import Node
from functions import *
def getNestedComments(replies, root_node, convo_dict):
    """Recursively record a list of replies and attach them to the comment tree.

    Every usable reply is printed, appended to ``convo_dict`` and added as an
    ``anytree.Node`` child of ``root_node``; its own replies are then
    processed the same way with the new node as their parent.

    Args:
        replies: Iterable of reply objects, each a mapping with a ``'data'``
            mapping inside (presumably the ``children`` list of a Reddit
            comment listing -- confirm against the caller).
        root_node: Tree node the replies in this list are attached to.
        convo_dict: Mapping with the list-valued keys ``'user_name'``,
            ``'id'``, ``'timestamp'``, ``'reply_to'`` and ``'comment'``.
            It is mutated in place.

    Returns:
        None. Results are delivered through ``convo_dict`` and the tree.
    """
    required_keys = ('id', 'author', 'created_utc', 'parent_id', 'body')
    for reply in replies:
        data = reply['data']
        # Entries missing any field we record, or whose body contains
        # '[deleted]', are skipped together with anything nested under them.
        if any(key not in data for key in required_keys):
            continue
        if '[deleted]' in data['body']:
            continue

        created = datetime.datetime.fromtimestamp(data['created_utc'])
        # Drops the first three characters of parent_id (presumably a type
        # prefix such as "t1_" -- confirm against the API response).
        reply_to = data['parent_id'][3:]

        # NOTE(review): the 'User:' label is printed with the comment id,
        # while the author is what gets stored as user_name -- confirm intent.
        print('User:', data['id'])
        print('Time:', created)
        print('In reply to:', reply_to)
        print(data['body'], '\n')

        convo_dict['user_name'].append(data['author'])
        convo_dict['id'].append(data['id'])
        convo_dict['timestamp'].append(created)
        convo_dict['reply_to'].append(reply_to)
        convo_dict['comment'].append(data['body'])

        child = Node(data['id'], parent=root_node)

        # An absent or empty-string 'replies' value means there is nothing
        # nested here. Keep looping instead of returning, so the remaining
        # sibling replies in this list are still processed.
        nested = data.get('replies')
        if nested:
            getNestedComments(nested['data']['children'], child, convo_dict)
def createTree(res, post_id):
    """Build a comment tree and a flat conversation record from a post response.

    Prints the post's upvote count and upvote ratio, then walks the top-level
    comments: each usable comment is printed, appended to the conversation
    record, attached under the root node, and its replies are handed to
    ``getNestedComments``.

    Args:
        res: Response object whose ``json()`` returns a two-element sequence:
            element 0 holds the post under ``['data']['children'][0]`` and
            element 1 holds the top-level comments under
            ``['data']['children']``.
        post_id: Identifier used as the name of the tree's root node.

    Returns:
        A ``(conversation_dict, root)`` tuple. ``conversation_dict`` maps
        ``'user_name'``, ``'id'``, ``'timestamp'``, ``'reply_to'`` and
        ``'comment'`` to parallel lists; ``root`` is the ``anytree.Node`` at
        the top of the tree.
    """
    # Parse the body once instead of re-decoding it for every lookup.
    payload = res.json()
    post_data = payload[0]['data']['children'][0]['data']
    comments = payload[1]['data']['children']

    root = Node(post_id)
    conversation_dict = {'user_name': [], 'id': [], 'timestamp': [], 'reply_to': [], 'comment': []}

    print('-----About the post-----')
    print('No. of Upvotes:', post_data['ups'])
    print('Upvote Ratio:', post_data['upvote_ratio'], '\n')

    for comment in comments:
        try:
            data = comment['data']
            if '[deleted]' in data['body']:
                continue

            # Read every field before printing or appending, so a malformed
            # entry cannot leave the parallel lists with different lengths.
            comment_id = data['id']
            author = data['author']
            body = data['body']
            created = datetime.datetime.fromtimestamp(data['created_utc'])
            # Drops the first three characters of parent_id (presumably a
            # type prefix such as "t3_" -- confirm against the API response).
            reply_to = data['parent_id'][3:]

            print('User:', comment_id,
                  'Time:', created)
            print('In reply to:', reply_to)
            print(body, '\n')

            conversation_dict['user_name'].append(author)
            conversation_dict['id'].append(comment_id)
            conversation_dict['timestamp'].append(created)
            conversation_dict['reply_to'].append(reply_to)
            conversation_dict['comment'].append(body)

            child = Node(comment_id, parent=root)
            # An empty string in 'replies' means the comment has no replies.
            if data['replies'] != '':
                replies = data['replies']['data']['children']
                getNestedComments(replies, child, conversation_dict)
        except (KeyError, TypeError):
            # Best effort: entries without the expected fields or shape are
            # skipped. Unrelated errors and KeyboardInterrupt now propagate.
            continue

    return conversation_dict, root
def getRedditPosts(subreddit):
    """Print the id, creation time and title of submissions in a subreddit.

    Sends one GET request to the Pushshift submission search endpoint asking
    for up to 50 submissions sorted by ``created_utc`` in descending order,
    then prints each returned submission.

    Args:
        subreddit: Subreddit name passed as the ``subreddit`` query parameter.

    Returns:
        None. Output goes to stdout only.

    Raises:
        requests.RequestException: If the request fails or times out.
        KeyError: If the JSON response has no ``'data'`` key or a post lacks
            ``'id'``, ``'created_utc'`` or ``'title'``.
    """
    response = requests.get(
        'https://api.pushshift.io/reddit/search/submission/',
        # Let requests build and URL-encode the query string.
        params={
            'subreddit': subreddit,
            'sort': 'desc',
            'sort_type': 'created_utc',
            'size': 50,
        },
        # Without a timeout, a stalled connection would block indefinitely.
        timeout=30,
    )
    # Decode the body once; the previous version re-parsed it per iteration.
    posts = response.json()['data']
    for post in posts:
        print('Post ID:', post['id'],
              'Time:', datetime.datetime.fromtimestamp(post['created_utc']))
        print(post['title'], '\n')
        # NOTE: writing each post to the database is currently commented out:
        # command = (
        #     '''
        #     INSERT INTO reddit_data
        #     VALUES ('%s', '%s', '%s');
        #     ''' % (post['id'], datetime.datetime.fromtimestamp(post['created_utc']),
        #     post['title'])
        # )
        # setUpDB(command, uri)
def _escapeSqlLiteral(value):
    """Return ``value`` as text with single quotes doubled for a quoted SQL literal."""
    return str(value).replace("'", "''")


def processRedditDataframe(conversation_dict, post_id):
    """Turn a conversation record into a DataFrame and store it if it is new.

    Builds a DataFrame from ``conversation_dict``, tags every row with
    ``post_id`` and the ``'Reddit'`` label, and inserts the rows into
    ``reddit_posts_for_analysis`` when no row with that ``head_id`` is found.
    Two columns, ``'url'`` and ``'link_title'``, are then derived from the
    comment text.

    ``pd``, ``getData``, ``setUpDB``, ``getLinks``, ``getURLfromList`` and
    ``uri`` are not defined in this block; they are presumably supplied by
    the module's wildcard import from ``functions`` -- confirm.

    Args:
        conversation_dict: Mapping with the keys ``'user_name'``, ``'id'``,
            ``'timestamp'``, ``'reply_to'`` and ``'comment'`` holding
            parallel lists. The lists may be empty.
        post_id: Identifier of the post, stored in the ``head_id`` column.

    Returns:
        The DataFrame, with ``head_id``, ``social_media``, ``url`` and
        ``link_title`` columns added.
    """
    df = pd.DataFrame.from_dict(conversation_dict)
    df['head_id'] = post_id
    df['social_media'] = 'Reddit'

    # Use post_id directly: df['head_id'][0] raises when the conversation is
    # empty, and for a non-empty frame it is the same value anyway.
    query = ('''
        select * from reddit_posts_for_analysis where head_id = '%s'
        ''' % _escapeSqlLiteral(post_id))
    data = getData(query, uri)

    # len() is kept rather than truthiness because the type getData returns
    # is not visible here (a DataFrame would reject a truth test).
    if len(data) == 0:
        columns = ('id', 'user_name', 'timestamp', 'reply_to',
                   'comment', 'social_media', 'head_id')
        for _, row in df.iterrows():
            # The statement is assembled as text, so every value is escaped,
            # not only the comment body.
            values = tuple(_escapeSqlLiteral(row[column]) for column in columns)
            command = ('''
                INSERT INTO reddit_posts_for_analysis
                VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s');
                ''' % values)
            setUpDB(command, uri)
    else:
        print('Conversation has been updated in the database.')

    df['url'] = df['comment'].apply(getLinks)
    df['link_title'] = df['url'].apply(getURLfromList)
    return df