forked from maschinenmensch/edifice
-
Notifications
You must be signed in to change notification settings - Fork 0
/
setup_edifice.py
428 lines (357 loc) · 17.2 KB
/
setup_edifice.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
# coding: utf-8
import argparse
import os
import subprocess
from subprocess import call, check_call, Popen, CalledProcessError
import glob
from datasets import datasets
import sys
import re
import string
import zipfile
import csv
import httplib, json, psycopg2
import data_portal
EDIFICE_USER = 'edifice'
EDIFICE_DB = 'edifice'
POSTGRES_BINDIRNAME = None
POSTGRES_SUPERUSER = 'postgres'
POSTGRES_HOST = 'localhost'
DELETE_DOWNLOADS = False
# Start out with no psycopg2 connection, make it during data import ('--data')
DB_CONN = None
# Optional argument for directory location of pg_config
def get_postgres_version(bindir=None):
pgconfig_str = 'pg_config'
try:
if (bindir):
pgconfig = os.path.join(bindir, pgconfig_str)
cmd = '%s --version' % pgconfig_str
pg_config_out = Popen(cmd.split(), stdout=subprocess.PIPE)
line = pg_config_out.stdout.readline()
line = line.rstrip()
# regex match for, e.g., "PostgresSQL 9.0.11"
m= re.match('PostgreSQL (\d+)\\.?(\d+)\\.?(\\*|\d+)$', line)
try:
major_version = int(m.group(1))
minor_version = int(m.group(2))
if (major_version < 9):
print "PostgreSQL version 9.0.x or greater is required."
sys.exit(1)
else:
return (major_version, minor_version)
except IndexError as index_error:
print "Can't parse version string:", l
sys.exit(1)
except OSError as os_error:
raise
except CalledProcessError as e:
print e.output
sys.exit(1)
def get_postgres_database_list():
db_names = []
cmd = Popen('psql --list'.split(), stdout=subprocess.PIPE)
lines = cmd.stdout.readlines()[3:]
for line in lines:
line = line.strip()
line_split = line.split()
if (line_split[0] == '|' and (len(line_split[0]) == 1)): # assume there is no database titled '|'
return db_names
curr_db_name = line_split[0]
db_names.append(curr_db_name)
return db_names
def get_postgres_sharedir():
cmd = Popen('pg_config --sharedir', shell=True, stdout=subprocess.PIPE)
line = cmd.stdout.readline()
return line.rstrip()
def call_args_or_fail(cmd_args_list):
print ' '.join(cmd_args_list)
try:
retval = call(cmd_args_list,stderr=subprocess.STDOUT)
except CalledProcessError as e:
print e.output
sys.exit(1)
except KeyboardInterrupt:
print "\n" + "Ctrl-C'd by user"
sys.exit(1)
def call_or_fail(cmd, user=None, interactive=False, template=None, encoding=None, database=None, fname=None, sql_command=None, argument=None):
global POSTGRES_HOST
psql_split = cmd.split()
if (user):
psql_split.extend(["-U",user])
# add host flag automatically
psql_split.extend(["-h", POSTGRES_HOST])
if (interactive):
psql_split.append("--interactive")
if (template):
psql_split.extend(["-T",template])
if (encoding):
psql_split.extend(["-E",encoding])
if (database):
psql_split.extend(["-d",database])
if (fname):
psql_split.extend(["-f",fname])
if (sql_command):
psql_split.extend(["-c","%s" % sql_command])
if ((cmd == 'dropdb') or (cmd == 'createdb') or (cmd=='dropuser')):
if not argument:
print "Argument not specified in call to %s" % cmd
sys.exit(1)
else:
psql_split.append(argument)
call_args_or_fail(psql_split)
# This just takes a command without wrapping the args as in call_or_fail() above.
def call_raw_or_fail(cmd):
call_args_or_fail(cmd.split())
def process_data(dataset):
name = dataset[0]
domain = dataset[1]
data_type = dataset[2]
socrata_id = dataset[3]
options = dataset[4]
url = ''
# format: https://data.cityofchicago.org/download/5gv8-ktcg/application/zip
if (domain == 'Chicago'):
if data_type == 'shp':
url = "http://data.cityofchicago.org/download/%s/application/zip" % (socrata_id,)
if 'platform' in options and options['platform'] == 'mondara':
url = "https://data.cityofchicago.org/api/geospatial/%s?method=export&format=Shapefile" % (socrata_id,)
import_shp(name, url, options.setdefault('encoding',''))
elif data_type == 'json':
hostname = 'data.cityofchicago.org'
import_json(name, hostname, socrata_id, options.setdefault('encoding',''))
else:
print 'ERROR: unknown domain or data type', data_type, 'for', name
def import_json (name, hostname, socrata_id, options):
# Get the header information
conn = httplib.HTTPConnection(hostname)
conn.request('GET','/api/views/%s.json' % (socrata_id,))
r1 = conn.getresponse()
# Make sure it succeeded
assert r1.status==200
resp=json.loads(r1.read())
# Make sure it has columns
assert u'columns' in resp.keys()
# Start building the db command
db_command_args=[name]
db_command='CREATE TABLE %s ( '
# Loop through columns, adding appropriate arguments to db command.
for column in resp[u'columns']:
db_command=db_command+'%s %s,'
db_command_args.append(column[u'fieldName'])
field_type=column[u'dataTypeName']
if field_type==u'number':
if ('.' not in column[u'cachedContents'][u'smallest']) and \
('.' not in column[u'cachedContents'][u'largest']) and \
('.' not in column[u'cachedContents'][u'sum']):
db_command_args.append('integer')
else:
db_command_args.append('double precision')
elif field_type==u'calendar_date':
db_command_args.append('date')
elif field_type==u'checkbox':
db_command_args.append('boolean')
else:
db_command_args.append('text')
# Cut off the last comma and close off the command.
db_command=db_command[:-1]+');'
cur=DB_CONN.cursor()
cur.execute(db_command,db_command_args)
# Function to wget, unzip, shp2pgsql | psql, and rm in the subdirectory 'import/'
def import_shp (name, url, encoding):
global EDIFICE_USER
global EDIFICE_DB
global DELETE_DOWNLOADS
global DB_CONN
name_zip = "%s.zip" % os.path.join('downloads', name)
if not os.path.exists(name_zip) :
cmd_split = "wget --no-check-certificate -O".split()
cmd_split.append(name_zip)
cmd_split.append(url)
call_args_or_fail(cmd_split)
else :
print '%s exists, skipping download' % name_zip
try:
zip_file = zipfile.ZipFile(name_zip, 'r')
except zipfile.BadZipfile as e:
print "ERROR:", str(e)
print "Try removing file %s and running --data again." % name_zip
sys.exit(1)
first_bad_file = zip_file.testzip()
if (first_bad_file):
print "Error in %s: first bad file is %s" % name_zip, first_bad_file
sys.exit(1)
print 'extracting %s' % name_zip
zip_file_contents = zip_file.namelist()
for f in zip_file_contents:
zip_file.extract(f, 'import')
zip_file.close()
shapefile = None
for fname in glob.glob("import/*.shp"):
shapefile_name = fname
print 'Importing ', shapefile_name
if encoding:
encoding = '-W ' + encoding
shp2pgsql_cmd= 'shp2pgsql -d -s 3435 %s -g the_geom -I %s dataportal.%s' % (encoding, shapefile_name, name)
shp2pgsql_cmd_list = shp2pgsql_cmd.split()
psql_cmd = "psql -q -U %s -d %s" % (EDIFICE_USER, EDIFICE_DB)
p1 = Popen(shp2pgsql_cmd_list, stdout=subprocess.PIPE)
print shp2pgsql_cmd, "|", psql_cmd
p2 = Popen(psql_cmd.split(), stdin=p1.stdout, stdout=subprocess.PIPE)
stdout = p2.communicate()[0]
# Now do the specialized import for the given dataset
data_portal.do_import(name, DB_CONN)
# Great. Now delete all the files in zip_file_contents
for fname in glob.glob('import/*'):
print "Deleting:", fname
os.remove(fname)
if DELETE_DOWNLOADS:
print "deleting %s" % name_zip
os.remove(name_zip)
def main():
global EDIFICE_USER
global EDIFICE_DB
global POSTGRES_BINDIRNAME
global POSTGRES_SUPERUSER
global POSTGRES_HOST
global DELETE_DOWNLOADS
global DB_CONN
parser = argparse.ArgumentParser(description='Setup the PostGIS Edifice database and populate it with open datasets.')
parser.add_argument('--create_template', action='store_true',
help="Run only once to create a base postgis template ('base_postgis') as the postgres superuser")
parser.add_argument('--create', action='store_true',
help='Drop existing edifice database and create from scratch based on the base_postgis database created with --create_template')
parser.add_argument('--data', action='store_true',
help='Download and import City of Chicago data to edifice database (as listed in datasets.py)')
parser.add_argument('--bindir', nargs='?', type=str,
help='Directory location of PostgreSQL binaries (e.g. pg_config, psql)')
parser.add_argument('--user', nargs='?', type=str,
help="Postgres username for accessing edifice database (e.g. during --create or --data) [default: 'edifice']")
parser.add_argument('--superuser', nargs='?', type=str,
help="Postgres superuser name for creating edifice database (e.g. during --create_template) [default: 'postgres']")
parser.add_argument('--host', nargs='?', type=str,
help="Postgres host [default: 'localhost']")
parser.add_argument('--database', nargs='?', type=str,
help="Name for edifice database [default: 'edifice']")
parser.add_argument('--delete_downloads', action='store_true',
help="Keep files downloaded from the various data portals after they have been imported [default: True]")
args = parser.parse_args()
# Handle --bindir [directory w/ postgres binaries]
if args.bindir:
POSTGRES_BINDIRNAME = args.bindir
# Check if we can find pg_config in this directory. If this fails, we can't find it.
try:
(major_version, minor_version) = get_postgres_version(POSTGRES_BINDIRNAME)
except OSError as e:
print "Cannot find pg_config in specified directory: %s" % POSTGRES_BINDIRNAME
sys.exit(1)
# We know we have the right directory, let's just modify this process' PATH to have this directory first.
# Not exactly kosher but more straightforward than pasting POSTGRES_BINDIRNAME everywhere we exec 'psql'
os.environ['PATH'] = POSTGRES_BINDIRNAME + ":" + os.environ['PATH']
if args.user:
EDIFICE_USER = args.user
if args.superuser:
POSTGRES_SUPERUSER = args.superuser
if args.database:
EDIFICE_DB = args.database
if args.delete_downloads:
DELETE_DOWNLOADS = args.delete_downloads
# Handle --create_template [reconstruction of base_postgis template using the postgres superuser account]
if args.create_template:
try:
(major_version, minor_version) = get_postgres_version()
except OSError as e:
print "Cannot find pg_config. Please include the location of your pg_config binary in the flag --bindir."
sys.exit(1)
print 'Initializing postgres with a basic PostGIS template using the postgres superuser.'
# See if database base_postgis exists
db_names = get_postgres_database_list()
print "db_names is ", db_names
print "EDIFICE_DB is " ,EDIFICE_DB
if (EDIFICE_DB in db_names):
print("Deleting the MAIN edifice database in '%s'!" % EDIFICE_DB)
call_or_fail("dropdb",user=POSTGRES_SUPERUSER, interactive=True, argument=EDIFICE_DB)
# We should drop the edifice user (we will be recreating it and its roles).
#if (EDIFICE_USER != 'postgres'):
# # XXX: this could be potentially uncool if the client has a superuser not called 'postgres'?
# call_or_fail("dropuser", user=POSTGRES_SUPERUSER, interactive=True, argument=EDIFICE_USER)
if('base_postgis' in db_names):
# Make base_postgis deleteable
call_or_fail("psql", user=POSTGRES_SUPERUSER, database="postgres", sql_command="UPDATE pg_database SET datistemplate='false' WHERE datname='base_postgis';")
call_or_fail("dropdb", user=POSTGRES_SUPERUSER, interactive=True, argument="base_postgis")
# This could be template1, except I was having problems with my template1 being in ASCII explictly on a 9.0 install
call_or_fail("createdb", user=POSTGRES_SUPERUSER, template="template0", encoding="UTF8", argument= "base_postgis")
# Note: This doesn't seem necessary, as the templates in 9.0 and 9.2 seem to have this included.
# call_or_fail("createlang plpgsql base_postgis")
if (minor_version == 0):
# Get the sharedir from pg_config and verify the existence of postgis.sql and spatial_ref_sys.sql
share_dirname = get_postgres_sharedir()
print "share_dirname is " , share_dirname
postgis_fnames = []
postgis_basenames = ['postgis.sql', 'postgis_comments.sql', 'spatial_ref_sys.sql']
# Optional postgis scripts not currently included in base install:
# raster_comments.sql, rtpostgis.sql, topology.sql, topology_comments.sql
for postgis_basename in postgis_basenames:
fname = os.path.join(share_dirname, 'contrib/postgis-2.0/%s' % postgis_basename)
if (not os.path.isfile(fname)):
print "Can't find contrib/postgis-2.0/%s in %s: is PostGIS 2.0.x+ installed?" % (fname, share_dirname)
sys.exit(1)
else:
postgis_fnames.append(fname)
for fname in postgis_fnames:
call_or_fail('psql', user=POSTGRES_SUPERUSER, database="base_postgis", fname=fname)
call_or_fail("psql", user=POSTGRES_SUPERUSER, database="postgres", sql_command ="UPDATE pg_database SET datistemplate='true' WHERE datname='base_postgis';")
elif (minor_version >= 1):
# Instead of the above, just do 'CREATE EXTENSION postgis' if we are using 9.1 or later
call_or_fail("psql", user=POSTGRES_SUPERUSER,database="base_postgis", sql_command= "CREATE EXTENSION IF NOT EXISTS postgis WITH SCHEMA public;")
call_or_fail("psql", user=POSTGRES_SUPERUSER, database="postgres", sql_command="UPDATE pg_database SET datistemplate='true' WHERE datname='base_postgis';")
# Allow non-superusers to alter spatial tables
call_or_fail("psql", user=POSTGRES_SUPERUSER, database="base_postgis", sql_command="GRANT ALL ON geometry_columns TO PUBLIC;")
call_or_fail("psql", user=POSTGRES_SUPERUSER, database="base_postgis", sql_command="GRANT ALL ON geography_columns TO PUBLIC;")
call_or_fail("psql", user=POSTGRES_SUPERUSER, database="base_postgis", sql_command="GRANT ALL ON spatial_ref_sys TO PUBLIC;")
# Finally, give a user 'edifice' permission to alter the database with 'createdb' permission
# Note: This does not check to see if a 'edifice' user already exists.
call_or_fail("psql", user=POSTGRES_SUPERUSER, database="base_postgis", sql_command="CREATE USER %s;" % EDIFICE_USER)
call_or_fail("psql", user=POSTGRES_SUPERUSER, database="base_postgis", sql_command="ALTER USER %s createdb;" % EDIFICE_USER)
# Handle --create [reconstruction of edifice database using the edifice user account]
if args.create :
print 'Setting up edifice database from scratch.'
call_or_fail("dropdb", user=EDIFICE_USER, interactive=True, argument=EDIFICE_DB)
call_or_fail("createdb", user=EDIFICE_USER, template="base_postgis", argument=EDIFICE_DB)
# XXX: Is it not necessary to make edifice the owner of these tables inherited from base_postgis?
#call_or_fail("psql", user=POSTGRES_SUPERUSER, database=EDIFICE_DB, sql_command="ALTER TABLE geometry_columns OWNER TO edifice;")
#call_or_fail("psql", user=POSTGRES_SUPERUSER, database=EDIFICE_DB, sql_command="ALTER TABLE geography_columns OWNER TO edifice;")
#call_or_fail("psql", user=POSTGRES_SUPERUSER, database=EDIFICE_DB, sql_command="ALTER TABLE spatial_ref_sys OWNER TO edifice;")
call_or_fail("psql", user=EDIFICE_USER, database=EDIFICE_DB, fname="sql_init_scripts/pins_master.sql")
# Not sure why we were creating these specific tables from edifice_initialization_script in advance.
# call_or_fail("psql", user=EDIFICE_USER,database=EDIFICE_DB, fname="sql_init_scripts/assessed.sql")
call_or_fail("psql", user=EDIFICE_USER, database=EDIFICE_DB, fname="sql_init_scripts/edifice_initialization_script.sql")
call_or_fail("psql", user=EDIFICE_USER, database=EDIFICE_DB, sql_command="CREATE SCHEMA dataportal IF NOT EXISTS;")
if os.path.exists("downloads/pins.dump"):
print "Note: pins.dump already exists. Not fetching it."
else:
print 'Fetching pins.dump...'
call_args_or_fail("wget -O downloads/pins.dump http://dl.dropbox.com/u/14915791/pins.dump".split())
print "Loading property pins..."
call_raw_or_fail("pg_restore -U %s -h %s -O -c -d %s downloads/pins.dump" % (EDIFICE_USER, POSTGRES_HOST, EDIFICE_DB))
if (DELETE_DOWNLOADS):
os.remove('downloads/pins.dump')
if args.data:
# Connect to the db
print "Connecting to database '%s' with user '%s'" % (EDIFICE_DB, EDIFICE_USER)
try:
DB_CONN=psycopg2.connect(database=EDIFICE_DB, user=EDIFICE_USER)
DB_CONN.set_session(autocommit=True) # Crucial so that we don't have really long-running transaction sequences
except psycopg2.OperationalError as e:
print e
sys.exit(1)
print "Importing datasets from open data portals. this will take a while..."
for d in datasets:
process_data(d)
DB_CONN.close()
print '======= Done! Happy Edificing! ======='
# if no actionable args, print out help message!
if ((not args.create_template) and (not args.create) and (not args.data)):
parser.print_help()
if __name__ == "__main__":
main()