-
Notifications
You must be signed in to change notification settings - Fork 106
/
ticker_count_reddit.py
30 lines (26 loc) · 1.72 KB
/
ticker_count_reddit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import mysql.connector
import pandas as pd
## Connect to mysql server
mydb = mysql.connector.connect(
host ="",
user ="",
passwd = ""
)
mycursor = mydb.cursor()
df = pd.read_sql("select * from reddit_data.reddit_data_sentiment where date_time >= '2021-02-20 05:00:00' and date_time <= '2021-02-26 05:00:00' ", mydb) ## Put in the time period of your analysis
df_tickers = pd.read_csv('/home/nobjos/ticker_list_updated.csv')
dollar_sign = "$"
output = pd.DataFrame({"ticker":['sample'], "count":[0],"dollar_count":[0], "sentiment":[0]})
### There is multiple reasons why analysis is done in the following way
# You have check the number of times a ticker is present - a ticker can be present either by just ticker or after a dollar sign.
# Especially in reddit, in comments you very rarely see a dollar sign
# why we are calcualting the dollar sign is not to have false positives
# if there are a lot of mentions for the ticker but not succeeding a $ sign it might be false positive
# A lot of comments and tweets will only have zero sentiment which will pull down the average - so remove the zero sentiment data before calculating the sentiment
for ticker in df_tickers['Ticker']:
a = df['body'].str.contains(" " + ticker + " ", case=False).sum()
b = df['body'].str.contains(dollar_sign + ticker + " ", case=False, regex=False).sum()
c = df[(df['body'].str.contains(" " + ticker + " ", case=False)) & (df['sentiment'] !=0)]['sentiment'].mean()
df_one_ticker = pd.DataFrame({"ticker":[ticker], "count":[a], "dollar_count":[b], "sentiment":[c]})
output = output.append(df_one_ticker)
output.to_csv('/home/nobjos/feb20_26_reddit_count.csv') ##location where you want the analysis output to be stored