From 83ca59cb4ecc78818525f5ba0474900687bfb378 Mon Sep 17 00:00:00 2001
From: Anthony Li
Date: Sun, 8 Sep 2019 01:53:09 -0400
Subject: [PATCH] Dynamic sizing, begin work on Docker support

---
 .gitignore        |   1 +
 Dockerfile        |   8 ++++
 DownloadRunner.py | 112 +++++++++++++++++++++++++++++++++++-----------
 3 files changed, 95 insertions(+), 26 deletions(-)
 create mode 100644 Dockerfile

diff --git a/.gitignore b/.gitignore
index 1b60898..d9787eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+.vscode
 tohme_crops
 *.pyc
 old
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..3fb7994
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,8 @@
+FROM ubuntu:16.04
+COPY . /app
+WORKDIR /app
+RUN apt-get update
+RUN apt-get install -y python-pip libfreetype6-dev libxft-dev python-dev libjpeg8-dev libblas-dev liblapack-dev libatlas-base-dev gfortran python-tk
+RUN pip install -r requirements.txt
+ENTRYPOINT ["python", "DownloadRunner.py"]
+CMD []
\ No newline at end of file
diff --git a/DownloadRunner.py b/DownloadRunner.py
index 2dd998e..b51f809 100644
--- a/DownloadRunner.py
+++ b/DownloadRunner.py
@@ -1,11 +1,7 @@
-# !/usr/bin/python
-
-# ****************************************
-# Specify the file storage location below
-# ****************************************
-storage_location = "/home/anthony/Downloads/test/"
+# !/usr/bin/python2
 
 from SidewalkDB import *
+from sys import argv
 
 import os
 import httplib
@@ -20,6 +16,29 @@
 from random import shuffle
 import fnmatch
 
+from subprocess import PIPE,STDOUT
+
+try:
+    from xml.etree import cElementTree as ET
+except ImportError, e:
+    from xml.etree import ElementTree as ET
+
+delay = 30
+
+if len(argv) != 3:
+    print("Usage: python DownloadRunner.py sidewalk_server_domain storage_path")
+    print("    sidewalk_server_domain - FQDN of the SidewalkWebpage server to fetch the pano list from")
+    print("    storage_path - location to store the scraped panos")
+    print("    Example: python DownloadRunner.py sidewalk-sea.cs.washington.edu /destination/path")
+    exit(0)
+
+sidewalk_server_fqdn = argv[1]
+storage_location = argv[2]
+
+if not os.path.exists(storage_location):
+    os.mkdir(storage_location)
+
+print("Starting run with pano list fetched from %s and destination path %s" % (sidewalk_server_fqdn, storage_location))
 
 def check_download_failed_previously(panoId):
     if panoId in open('scrape.log').read():
@@ -27,10 +46,21 @@ def check_download_failed_previously(panoId):
     else:
         return False
 
+def extract_panowidthheight(path_to_metadata_xml):
+    pano = {}
+    pano_xml = open(path_to_metadata_xml, 'rb')
+    tree = ET.parse(pano_xml)
+    root = tree.getroot()
+    for child in root:
+        if child.tag == 'data_properties':
+            pano[child.tag] = child.attrib
+
+    return (int(pano['data_properties']['image_width']), int(pano['data_properties']['image_height']))
+
 def fetch_pano_ids_from_webserver():
     unique_ids = []
 
-    conn = httplib.HTTPSConnection("sidewalk.umiacs.umd.edu")
+    conn = httplib.HTTPSConnection(sidewalk_server_fqdn)
     conn.request("GET", "/adminapi/labels/panoid")
     r1 = conn.getresponse()
     data = r1.read()
@@ -39,7 +69,11 @@ def fetch_pano_ids_from_webserver():
 
     for value in jsondata["features"]:
         if value["properties"]["gsv_panorama_id"] not in unique_ids:
-            unique_ids.append(value["properties"]["gsv_panorama_id"])
+            # Check that the pano_id is not an empty string
+            if value["properties"]["gsv_panorama_id"]:
+                unique_ids.append(value["properties"]["gsv_panorama_id"])
+            else:
+                print "Pano ID is an empty string"
 
     return unique_ids
 
@@ -53,11 +87,18 @@ def download_panorama_images(storage_path, pano_list=None):
 
     counter = 0
     failed = 0
-    im_dimension = (512 * 26, 512 * 13)
-    blank_image = Image.new('RGBA', im_dimension, (0, 0, 0, 0))
+
     base_url = 'http://maps.google.com/cbk?'
     shuffle(unique_ids)
     for pano_id in unique_ids:
+
+        pano_xml_path = os.path.join(storage_path, pano_id[:2], pano_id + ".xml")
+        if not os.path.isfile(pano_xml_path):
+            continue
+        (image_width, image_height) = extract_panowidthheight(pano_xml_path)
+        im_dimension = (image_width, image_height)
+        blank_image = Image.new('RGBA', im_dimension, (0, 0, 0, 0))
+
         print '-- Extracting images for', pano_id,
 
         destination_dir = os.path.join(storage_path, pano_id[:2])
@@ -65,7 +106,6 @@ def download_panorama_images(storage_path, pano_list=None):
             os.makedirs(destination_dir)
 
         filename = pano_id + ".jpg"
-
         out_image_name = os.path.join(destination_dir, filename)
         if os.path.isfile(out_image_name):
             print 'File already exists.'
@@ -74,8 +114,8 @@ def download_panorama_images(storage_path, pano_list=None):
             print 'Completed ' + str(counter) + ' of ' + str(len(unique_ids))
             continue
 
-        for y in range(13):
-            for x in range(26):
+        for y in range(image_height / 512):
+            for x in range(image_width / 512):
                 url_param = 'output=tile&zoom=5&x=' + str(x) + '&y=' + str(
                     y) + '&cb_client=maps_sv&fover=2&onerr=3&renderer=spherical&v=4&panoid=' + pano_id
                 url = base_url + url_param
@@ -83,14 +123,21 @@ def download_panorama_images(storage_path, pano_list=None):
 
                 # Open an image, resize it to 512x512, and paste it into a canvas
                 req = urllib.urlopen(url)
                 file = cStringIO.StringIO(req.read())
-                im = Image.open(file)
-                im = im.resize((512, 512))
+                try:
+                    im = Image.open(file)
+                    im = im.resize((512, 512))
+                except Exception:
+                    print 'Error: Image.open failed on tile', x, y
+                    print 'pano_id:', pano_id
+                    print url
+                    print req
+                    continue  # skip this tile rather than pasting a stale image
 
                 blank_image.paste(im, (512 * x, 512 * y))
 
                 # Wait a little bit so you don't get blocked by Google
-                # sleep_in_milliseconds = float(delay) / 1000
-                # sleep(sleep_in_milliseconds)
+                sleep_in_milliseconds = float(delay) / 1000
+                sleep(sleep_in_milliseconds)
             print '.',
         print
@@ -119,8 +166,8 @@ def download_panorama_images(storage_path, pano_list=None):
                 temp_blank_image.paste(im, (512 * x, 512 * y))
 
                 # Wait a little bit so you don't get blocked by Google
-                # sleep_in_milliseconds = float(delay) / 1000
-                # sleep(sleep_in_milliseconds)
+                sleep_in_milliseconds = float(delay) / 1000
+                sleep(sleep_in_milliseconds)
             print '.',
         print
         temp_blank_image = temp_blank_image.resize(im_dimension, Image.ANTIALIAS)  # resize
@@ -161,10 +208,14 @@ def download_panorama_depthdata(storage_path, decode=True, pano_list=None):
             continue
 
         url = base_url + pano_id
 
-        with open(destination_file, 'wb') as f:
-            req = urllib2.urlopen(url)
-            for line in req:
-                f.write(line)
+        try:
+            with open(destination_file, 'wb') as f:
+                req = urllib2.urlopen(url)
+                for line in req:
+                    f.write(line)
+        except Exception:
+            print 'Unable to download depth data for pano', pano_id
+            continue
 
         print 'Done.'
@@ -184,11 +235,20 @@ def generate_depthmapfiles(path_to_scrapes):
 
             # Generate a .depth.txt file for the .xml file
             output_file = os.path.join(root, pano_id + ".depth.txt")
-            call(["./decode_depthmap", xml_location, output_file])
+            if os.path.isfile(output_file):
+                print 'Depth file already exists'
+                continue
+            output_code = call(["./decode_depthmap", xml_location, output_file])
+            if output_code == 0:
+                print 'Successfully converted', pano_id, 'to depth.txt'
+            else:
+                print 'Unsuccessful. Could not convert', pano_id, 'to depth.txt. Returned with error', output_code
 
 
+print "Fetching pano-ids"
 pano_list = fetch_pano_ids_from_webserver()
-
-download_panorama_images(storage_location, pano_list=pano_list) # Trailing slash required
+# pano_list = [pano_list[111], pano_list[112]]
+print "Fetching Panoramas"
 download_panorama_depthdata(storage_location, pano_list=pano_list)
+download_panorama_images(storage_location, pano_list=pano_list) # Trailing slash required
 generate_depthmapfiles(storage_location)
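
The core of the dynamic-sizing change above is that each panorama's true pixel dimensions are now read from its metadata XML (the image_width/image_height attributes of the data_properties element) and the tile loops cover a (width / 512) x (height / 512) grid instead of the old hardcoded 26x13 one. A minimal standalone sketch of that calculation, assuming the same XML layout the patch's extract_panowidthheight() parses (tile_grid is a hypothetical helper, not part of the patch, and the example dimensions are illustrative):

    from xml.etree import ElementTree as ET

    def tile_grid(path_to_metadata_xml):
        # <data_properties image_width="..." image_height="..."> carries the
        # stitched panorama's pixel dimensions.
        root = ET.parse(path_to_metadata_xml).getroot()
        props = root.find('data_properties').attrib
        width = int(props['image_width'])
        height = int(props['image_height'])
        # Each tile fetched at zoom=5 is 512x512, so the x/y download loops
        # run over a (width / 512) by (height / 512) grid.
        return (width // 512, height // 512)

    # A 13312x6656 panorama gives a 26x13 grid -- exactly the shape the old
    # hardcoded im_dimension = (512 * 26, 512 * 13) assumed for every pano.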
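
Because the new Dockerfile sets ENTRYPOINT to ["python", "DownloadRunner.py"] and leaves CMD empty, any arguments passed after the image name on the docker run command line become the script's two positional arguments. A build-and-run invocation might therefore look like the following sketch (the image tag and host-side storage path are placeholders; the server FQDN is the example one from the usage text above):

    docker build -t sidewalk-panorama .
    docker run -v /host/panos:/panos sidewalk-panorama sidewalk-sea.cs.washington.edu /panos

Mounting a host directory over the container-side storage path keeps the scraped panos after the container exits.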