get_train_dev_data.py
"""
=========================================================
Title: ImageArg Shared Task Code - Dataset Downloader
---------------------------------------------------------
Warning: The dataset to download should only be used for
scientific or research purposes. Any other use is
explicitly prohibited. Any participants are not allowed
to redistribute the dataset per Twitter Developer Policy:
https://developer.twitter.com/en/developer-terms/policy.
---------------------------------------------------------
Notice: This code is managed by ImageArg Shared Task
(https://imagearg.github.io/). Parts of the code are
adapted from https://github.com/markowanga/stweet.
---------------------------------------------------------
Date: 2023-10-05
=========================================================
"""
from tqdm import tqdm
import pandas as pd
import stweet as st
import requests
import argparse
import json
import os
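

# Fetch a single tweet by ID with stweet and return the raw-output collector
# holding the scraped JSON.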
def try_tweet_by_id_scrap(tweet_id):
id_task = st.TweetsByIdTask(tweet_id)
output_print = st.PrintRawOutput()
output_collector = st.CollectorRawOutput()
st.TweetsByIdRunner(tweets_by_id_task=id_task,
raw_data_outputs=[output_print, output_collector]).run()
return output_collector
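

# Download the tweet text and image for every metadata record of the given
# topic, then write <topic>_train.csv and <topic>_dev.csv under args.data_dir.
# Relies on the module-level `args` parsed in the __main__ block.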
def run(topic="abortion"):
    # Create the output directory tree, e.g. <data_dir>/images/<topic>/.
    os.makedirs(os.path.join(args.data_dir, "images", topic), exist_ok=True)
text_exist_ids = []
df_train_exist = pd.DataFrame()
df_dev_exist = pd.DataFrame()
    if os.path.exists(os.path.join(args.data_dir, f"{topic}_train.csv")):
        df_train_exist = pd.read_csv(os.path.join(args.data_dir, f"{topic}_train.csv"))
        text_exist_ids += df_train_exist["tweet_id"].tolist()
    if os.path.exists(os.path.join(args.data_dir, f"{topic}_dev.csv")):
        df_dev_exist = pd.read_csv(os.path.join(args.data_dir, f"{topic}_dev.csv"))
        text_exist_ids += df_dev_exist["tweet_id"].tolist()
    # Fetch the shared-task metadata (tweet IDs, labels, and splits) from the
    # remote JSON endpoint.
    headers = {'Accept': 'application/json'}
    lines = requests.get(args.meta_data, headers=headers, timeout=10).json()
lines = [line for line in lines if line["topic"] == topic]
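    # Accumulators, one entry per successfully downloaded tweet; they become the
    # columns of the output CSVs.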
tweetid_list = []
tweeturl_list = []
tweettext_list = []
stance_list = []
persuasiveness_list = []
split_list = []
for item in tqdm(lines):
tweetid = item['tweet_id']
stance = item["stance"]
persuasiveness = item["persuasiveness"]
tweeturl = item["tweet_url"]
split = item["split"]
        # Skip tweets whose image and text were already downloaded on a previous run.
        image_exist_path = os.path.join(args.data_dir, "images", topic, f"{tweetid}.jpg")
        if os.path.exists(image_exist_path) and (tweetid in text_exist_ids):
continue
try:
            output_collector = try_tweet_by_id_scrap(tweetid)
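            # The collector holds the raw tweet JSON; the text and attached media
            # live under raw_value.legacy.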
jsonline = json.loads(output_collector.get_raw_list()[0].to_json_line())
text = jsonline['raw_value']['legacy']['full_text']
img_url = jsonline['raw_value']['legacy']['entities']['media'][0]['media_url_https']
img_data = requests.get(img_url, timeout=10).content
            with open(image_exist_path, 'wb') as handler:
handler.write(img_data)
tweettext_list.append(text)
tweetid_list.append(tweetid)
tweeturl_list.append(tweeturl)
stance_list.append(stance)
persuasiveness_list.append(persuasiveness)
split_list.append(split)
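
            # Rebuild the DataFrame and rewrite both CSVs after every successful
            # download so progress is checkpointed if the scrape is interrupted.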
df = pd.DataFrame()
df['tweet_id'] = tweetid_list
df["tweet_url"] = tweeturl_list
df['tweet_text'] = tweettext_list
df['stance'] = stance_list
df["persuasiveness"] = persuasiveness_list
df["split"] = split_list
df_train = df[df["split"] == "train"]
df_dev = df[df["split"] == "dev"]
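            # Merge with rows downloaded on previous runs so reruns append rather than overwrite.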
if len(df_train_exist) > 0:
df_train = pd.concat([df_train_exist, df_train])
if len(df_dev_exist) > 0:
df_dev = pd.concat([df_dev_exist, df_dev])
            df_train.to_csv(os.path.join(args.data_dir, f"{topic}_train.csv"), index=False)
            df_dev.to_csv(os.path.join(args.data_dir, f"{topic}_dev.csv"), index=False)
        except Exception as e:
            print(f"Skipping tweet {tweetid}: {e}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ImageArg shared task dataset downloader')
    parser.add_argument('--meta-data', default='https://people.cs.pitt.edu/~zhexiong/data/meta_data.json',
                        help='URL of the metadata JSON')
    parser.add_argument('--data-dir', default='./data', help='directory to save downloaded data')
args = parser.parse_args()
for topic in ["gun_control", "abortion"]:
run(topic=topic)
print("Done!")