-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_fetch.py
45 lines (39 loc) · 1.88 KB
/
data_fetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# -*- coding: utf-8 -*-
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import codecs
# World Bank API download endpoints, one per indicator code
# (EN.ATM.CO2E.KT and SP.POP.TOTL), both requested with `downloadformat=csv`.
DATASET_CO2_URL = (
    "http://api.worldbank.org/v2/en/indicator/"
    "EN.ATM.CO2E.KT?downloadformat=csv"
)
DATASET_POP_URL = (
    "http://api.worldbank.org/v2/en/indicator/"
    "SP.POP.TOTL?downloadformat=csv"
)
def _download_archive(url):
    """Download *url* and return the complete response body as bytes."""
    # The context manager closes the HTTP response (and its connection) as
    # soon as the payload has been read, instead of leaving it open.
    with urlopen(url) as response:
        return response.read()


def _first_member(archive, prefix):
    """Return the first member name of *archive* that starts with *prefix*.

    Raises:
        IndexError: if no member of the archive matches *prefix*.
    """
    for name in archive.namelist():
        if name.startswith(prefix):
            return name
    raise IndexError(
        "no file starting with {!r} found in downloaded archive".format(prefix)
    )


def _open_data_csv(archive, prefix):
    """Open the data CSV matching *prefix* and return an iterator of text lines."""
    member = archive.open(_first_member(archive, prefix))
    for _ in range(4):
        # The first four lines are a preamble that breaks the CSV reader,
        # so they are consumed here before the stream is handed out.
        member.readline()
    # Archive members are read in byte mode, which csv.DictReader can't parse;
    # iterdecode lazily turns the byte lines into text lines.
    return codecs.iterdecode(member, "utf-8")


def get_dataset_files():
    """Fetch the CO2 and population archives and unpack the files they contain.

    Both .zip archives are downloaded fully into memory. From the CO2 archive
    the data CSV (``API_EN*``) and the country metadata file
    (``Metadata_Country*``) are extracted; from the population archive only
    the data CSV (``API_SP*``) is extracted.

    Returns:
        A tuple ``(data_pop_file, data_co2_file, meta_country_file)``. Each
        element is a lazy iterator of decoded text lines. The two data
        iterators start after the four-line preamble.

    Raises:
        IndexError: if an expected file is missing from an archive.
    """
    with ZipFile(BytesIO(_download_archive(DATASET_CO2_URL))) as archive:
        data_co2_file = _open_data_csv(archive, "API_EN")
        # This one is encoded in utf-8-sig; the data files seem to be utf-8.
        meta_country_file = codecs.iterdecode(
            archive.open(_first_member(archive, "Metadata_Country")),
            "utf-8-sig",
        )

    with ZipFile(BytesIO(_download_archive(DATASET_POP_URL))) as archive:
        data_pop_file = _open_data_csv(archive, "API_SP")

    return data_pop_file, data_co2_file, meta_country_file