-
Notifications
You must be signed in to change notification settings - Fork 2
/
filter_youtube_data.py
54 lines (48 loc) · 2.11 KB
/
filter_youtube_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import codecs, csv, re, json, os
# Filters YT comments by terms in dictionary.csv and outputs a new csv with
# only comments mentioning the wanted terms.

# Header of the output CSV, in column order. Columns this script never fills
# in are written as empty cells.
# 'discourse' is appended at the end: every row carries that key, and
# csv.DictWriter raises ValueError for keys missing from fieldnames. Appending
# keeps the position of all pre-existing columns unchanged.
# NOTE(review): 'username (wanted_termstag value)' looks like a
# search-and-replace accident; it is kept verbatim because it is part of the
# emitted header -- confirm the intended name before changing it.
columns = [
    'platform',
    'day-month-year-hour',
    'post containing anti-un term',
    'agenda',
    'anti-un term',
    'original un fb, yt or tw post',
    'username (wanted_termstag value)',
    'user category (authentic/inauthentic)',
    'mentions',
    'retweets',
    'likes/loves',
    'account still up?',
    'discourse',
]


def _load_dictionary(path):
    """Read the ';'-delimited term dictionary at *path*.

    The file must have 'term', 'agenda' and 'discourse' columns. Returns a
    dict mapping each term to ``(compiled_pattern, agenda, discourse)`` in
    file order; a term listed twice keeps its last agenda/discourse.
    """
    terms = {}
    # utf-8-sig also accepts plain UTF-8 and strips a BOM that would
    # otherwise end up glued to the first header name.
    with open(path, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f, delimiter=';'):
            term = row['term']
            if not term:
                # A blank term would compile to r'\b\b' and match nearly
                # every comment.
                continue
            # Terms are matched literally as whole words; escaping keeps
            # characters such as '.' or '+' from acting as regex operators.
            pattern = re.compile(r'\b%s\b' % re.escape(term.lower()))
            terms[term] = (pattern, row['agenda'] or '', row['discourse'] or '')
    return terms


def _iter_comments(comments_dir):
    """Yield the top-level comment snippet dicts from every file in *comments_dir*.

    Each file is parsed as JSON; files whose top-level object has no 'items'
    key contribute nothing.
    """
    # Sorted so the output row order does not depend on directory order.
    for name in sorted(os.listdir(comments_dir)):
        with open(os.path.join(comments_dir, name), 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(data.keys())
        for item in data.get('items', []):
            yield item['snippet']['topLevelComment']['snippet']


def _build_row(comment, found, terms):
    """Build one output row for *comment*, which mentions the terms in *found*."""
    row = dict.fromkeys(columns)
    row['platform'] = 'Youtube'
    row['day-month-year-hour'] = comment['publishedAt']
    row['post containing anti-un term'] = comment['textOriginal']
    row['anti-un term'] = ','.join(found)
    row['agenda'] = ','.join(terms[term][1] for term in found)
    row['discourse'] = ','.join(terms[term][2] for term in found)
    row['likes/loves'] = comment['likeCount']
    return row


def main(dictionary_path='dictionary.csv', comments_dir='../comments',
         output_path='youtube_comments.csv'):
    """Filter the downloaded comments and write the matching ones to a CSV.

    dictionary_path: ';'-delimited CSV with 'term', 'agenda', 'discourse'.
    comments_dir: directory of JSON files holding comment threads under 'items'.
    output_path: ';'-delimited CSV to (over)write, one row per matching comment.

    Raises KeyError if a dictionary row or a comment lacks an expected key.
    """
    terms = _load_dictionary(dictionary_path)
    rows = []
    for comment in _iter_comments(comments_dir):
        # Lower-case the comment once instead of once per term.
        lowered = comment['textOriginal'].lower()
        found = [term for term, entry in terms.items() if entry[0].search(lowered)]
        if found:
            rows.append(_build_row(comment, found, terms))
    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, delimiter=';', fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)


if __name__ == '__main__':
    main()