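"""scraper.py

Scrapes South African dam water level tables from Internet Archive snapshots
and writes one CSV per province. The fetching and parsing logic lives in the
scraper_helpers module imported below.
"""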
import argparse
import os

import pandas as pd

import scraper_helpers

parser = argparse.ArgumentParser(
    description="Scrape South African dam water level data from the Internet Archive")
parser.add_argument("--output", "-o", default="dam_data",
                    help="directory where the CSVs will be written")
parser.add_argument("--provinces", "-p", nargs='+',
                    help="ids of the specific provinces to download (e.g. WC KN)")
parser.add_argument("--no_override", "-n", action='store_true',
                    help="do not redownload data that is already present")
args = parser.parse_args()
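
# Example invocations (hypothetical paths and ids, for illustration only):
#   python scraper.py                      # all provinces into ./dam_data
#   python scraper.py -o out -p WC KN -n   # only Western Cape and KwaZulu-Natal,
#                                          # skipping CSVs already present in ./out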

# Create the output folder (and any missing parents) if it doesn't exist.
os.makedirs(args.output, exist_ok=True)

# Hardcoding the provinces because those aren't changing anytime soon.
provinces = {"EC": "Eastern_Cape",
             "FS": "Free_State",
             "G": "Gauteng",
             "KN": "KwaZulu_Natal",
             "LP": "Limpopo",
             "M": "Mpumalanga",
             "NC": "Northern_Cape",
             "NW": "North_West",
             "WC": "Western_Cape"}

# Check that the requested provinces exist in the dict and keep only those.
if args.provinces is not None:
    new_provs = dict()
    for p in args.provinces:
        if p not in provinces:
            raise ValueError("invalid province id: " + p)
        new_provs[p] = provinces[p]
    provinces = new_provs
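
# The scraper_helpers interface assumed below (a sketch inferred from the call
# sites, not a documented contract):
#   getArchivedURLs(province_id) -> iterable of (date, url) pairs, one per
#       Internet Archive snapshot of that province's dam-levels page
#   getTableData(url, date)     -> list of rows matching the DataFrame columns
#       used below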

for p, name in provinces.items():
    csvPath = os.path.join(args.output, name + "_dam_levels.csv")
    if os.path.isfile(csvPath) and args.no_override:
        continue  # this province's CSV already exists and --no_override is set
    provinceData = list()
    for pair in scraper_helpers.getArchivedURLs(p):
        provinceData += scraper_helpers.getTableData(pair[1], pair[0])
    df = pd.DataFrame(data=provinceData,
                      columns=["Date", "Dam", "River", "FSC", "This week", "Last Week", "Last Year"])
    df.to_csv(csvPath, sep=",", index=False)
    print("Exported data to " + csvPath)