-
Notifications
You must be signed in to change notification settings - Fork 1
/
location_parse.py
90 lines (78 loc) · 3.98 KB
/
location_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
from lxml import etree
import csv
# Default locations used when the script is run directly.
ROOTDIR = '/home/alex/git/IATI-Registry-Refresher/data'  # Your IATI XML repository here
ACTIVITIES_FILE = '/home/alex/git/IATI-VCE-Chad-Geo/activities_of_interest.csv'  # List of IATI identifiers
OUTPUT_FILE = 'location_data.csv'


def parse_point(pos_text):
    """Split the text of a ``point/pos`` element into ``(lat, long)`` strings.

    The value is split on any run of whitespace. ``str.split()`` without
    arguments also treats the no-break space (U+00A0) as whitespace, so
    values separated by it, by several spaces, or by tabs/newlines are all
    handled.

    Returns ``None`` when fewer than two tokens are present, so the caller
    can skip the malformed point. Tokens beyond the second are ignored.
    """
    tokens = pos_text.split()
    if len(tokens) < 2:
        return None
    return tokens[0], tokens[1]


def load_identifiers(activities_file):
    """Return the set of non-blank, stripped lines found in *activities_file*.

    A set is used because membership is tested once per parsed activity.
    Blank lines are dropped so that an activity whose identifier is empty
    after stripping cannot match by accident.
    """
    with open(activities_file, 'r') as act_file:
        stripped = (line.strip() for line in act_file)
        return {identifier for identifier in stripped if identifier}


def _location_rows(identifier, location):
    """Yield the output rows for a single ``location`` element.

    One row is produced per ``administrative`` child and one per well-formed
    ``point/pos`` value. Only when the location has neither administrative
    children nor point values is one row produced per ``name/narrative`` text.
    """
    admins = location.xpath("administrative")
    for admin in admins:
        yield [
            identifier,
            "",
            "",
            admin.attrib.get("code", ""),
            admin.attrib.get("level", ""),
            admin.attrib.get("vocabulary", ""),
            "",
        ]

    points = location.xpath("point/pos/text()")
    for point in points:
        coords = parse_point(point)
        if coords is None:
            # Malformed position (fewer than two values): skip it.
            continue
        yield [identifier, coords[0], coords[1], "", "", "", ""]

    # The name fallback keys off the presence of point text, not on whether
    # that text could be parsed.
    if not admins and not points:
        for location_name in location.xpath("name/narrative/text()"):
            # lxml XPath string results keep a reference to their parent
            # element; store a plain str so the parsed tree can be released.
            yield [identifier, "", "", "", "", "", str(location_name)]


def _release(activity):
    """Clear a processed activity element and drop its preceding siblings.

    This keeps the partially built tree small while iterparse streams
    through a large file.
    """
    activity.clear()
    for ancestor in activity.xpath('ancestor-or-self::*'):
        while ancestor.getprevious() is not None:
            try:
                del ancestor.getparent()[0]
            except TypeError:
                # getparent() returned None (no parent to delete from).
                break


def _activity_rows(filepath, wanted):
    """Yield output rows for every activity in *filepath* listed in *wanted*.

    The file is streamed one ``iati-activity`` element at a time. Each
    element is released after it has been inspected, whether or not it
    matched. ``etree.XMLSyntaxError`` propagates to the caller; rows yielded
    before the error have already been delivered.
    """
    context = etree.iterparse(filepath, tag='iati-activity', huge_tree=True)
    for _, activity in context:
        identifiers = activity.xpath("iati-identifier/text()")
        if identifiers:
            identifier = identifiers[0].strip()
            if identifier in wanted:
                for location in activity.xpath("location"):
                    yield from _location_rows(identifier, location)
        _release(activity)


def main(rootdir=ROOTDIR, activities_file=ACTIVITIES_FILE, output_file=OUTPUT_FILE):
    """Extract location data for the activities of interest into a CSV file.

    Every file below *rootdir* is parsed as IATI XML. For each activity whose
    identifier appears in *activities_file*, rows describing its locations
    are collected and finally written to *output_file* under the header
    ``iati_identifier, lat, long, code, level, vocabulary, name``.
    """
    acts = load_identifiers(activities_file)

    header = ["iati_identifier", "lat", "long", "code", "level", "vocabulary", "name"]
    output = []
    for subdir, _dirs, files in os.walk(rootdir):
        for filename in files:
            filepath = os.path.join(subdir, filename)
            print(filename)
            try:
                # Append row by row so that rows gathered before a syntax
                # error in the same file are kept.
                for row in _activity_rows(filepath, acts):
                    output.append(row)
            except etree.XMLSyntaxError:
                # Best effort: files that are not well-formed XML are skipped.
                continue

    # Location names may contain non-ASCII text; write UTF-8 explicitly so
    # the result does not depend on the platform's default encoding.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(output)


if __name__ == '__main__':
    main()