-
Notifications
You must be signed in to change notification settings - Fork 34
/
subset-geolocated-tweets.py
79 lines (67 loc) · 2.52 KB
/
subset-geolocated-tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
'''
subset-geolocated-tweets.py
Takes a large file with geolocated tweets and returns another file with
tweets sent from within a given bounding box
@p_barbera
Usage:
### extract tweets sent from Scotland
python subset-geolocated-tweets.py -f 'geotweets.json' -o 'scotland-tweets.json' \
-swlat 54.184 -swlong -5.734 -nelat 59.03 -nelong 1.120
Note: it keeps BOTH geolocated tweets (with exact coordinates) and tweets that
only carry 'place' information (in that case, the centroid of the place's
bounding box is used to decide whether the tweet falls inside the box)
'''
import sys
import json
import argparse
# arguments
parser = argparse.ArgumentParser()

# input and output paths
parser.add_argument(
    '-f', '--file',
    help='name of file with tweets in json format',
    required=True,
)
parser.add_argument(
    '-o', '--output',
    help='name of file where subset of tweets will be stored',
    required=True,
)

# the four bounding-box options differ only in flag name and help text,
# so they are registered from a table (same order as they appear in --help)
corner_options = (
    ('-swlat', 'south west corner of bounding box (latitude)'),
    ('-swlong', 'south west corner of bounding box (longitude)'),
    ('-nelat', 'north east corner of bounding box (latitude)'),
    ('-nelong', 'north east corner of bounding box (longitude)'),
)
for option, description in corner_options:
    parser.add_argument(option, type=float, required=True, help=description)

# parsed command line; read by the rest of the script as a module-level global
args = parser.parse_args()
# function to subset file
def parse_file(filename, bbox=None):
    '''
    Read a file with one JSON tweet per line and keep the tweets whose
    position lies strictly inside a bounding box.

    filename: path of the file to read.
    bbox: optional (swlat, swlong, nelat, nelong) tuple. When omitted, the
        corners given on the command line (module-level `args`) are used.

    Returns a list with the decoded tweets (dicts as produced by json.loads)
    that fall inside the box, in file order.

    Lines that are not valid JSON and tweets without usable coordinates are
    reported on stdout and skipped. Progress is printed every 10000 lines.
    '''
    if bbox is None:
        # default to the corners supplied on the command line
        bbox = (args.swlat, args.swlong, args.nelat, args.nelong)
    swlat, swlong, nelat, nelong = bbox

    tweets = []
    # the context manager closes the input file even if processing fails
    with open(filename, 'r') as f:
        for i, line in enumerate(f, 1):
            if i % 10000 == 0:
                print(str(i) + ' tweets processed')
            try:
                t = json.loads(line)
            except ValueError:
                # json decoding errors are ValueError (or a subclass of it)
                print('Error parsing json')
                continue
            try:
                lat, lon = _tweet_position(t)
            except (KeyError, IndexError, TypeError, ValueError):
                print('Error extracting coordinates')
                continue
            # strict inequalities: points exactly on the border are excluded
            if swlat < lat < nelat and swlong < lon < nelong:
                tweets.append(t)
    return tweets


def _tweet_position(t):
    '''
    Return (lat, lon) for a decoded tweet.

    The exact point in t['geo']['coordinates'] (read as [lat, lon]) is
    preferred. If it is missing or unusable, the centre of the place's
    bounding box is used instead.

    Raises KeyError, IndexError, TypeError or ValueError when neither
    source yields a position.
    '''
    try:
        point = t['geo']['coordinates']
        return float(point[0]), float(point[1])
    except (KeyError, IndexError, TypeError, ValueError):
        # first ring of the place polygon; each corner is read as [lon, lat]
        # NOTE(review): presumably ordered SW, NW, NE, SE as in Twitter
        # place objects, so corners 0/2 span longitude and 0/1 span
        # latitude -- confirm against the data source
        ring = t['place']['bounding_box']['coordinates'][0]
        lon = float(ring[0][0] + ring[2][0]) / 2
        lat = float(ring[0][1] + ring[1][1]) / 2
        return lat, lon
# subsetting tweets
tweets = parse_file(args.file)
print(str(len(tweets)) + ' tweets in bounding box')
# write one JSON document per line; the context manager flushes and closes
# the output file even if serialising or writing a tweet fails
with open(args.output, 'w') as out:
    for tweet in tweets:
        out.write(json.dumps(tweet) + '\n')