Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Relative to absolute #32

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 44 additions & 28 deletions microdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,19 @@ def get_items(location, encoding=None):
Pass in a string or file-like object and get a list of Items present in the
HTML document.
"""
try:
from urllib.request import urlopen
except ImportError:
from urllib import urlopen

dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
parser = html5lib.HTMLParser(tree=dom_builder)
tree = parser.parse(location, encoding=encoding)
return _find_items(tree)
try:
tree = parser.parse(urlopen(location), encoding=encoding)
except ValueError:
# Try opening it as a local file
tree = parser.parse(open(location), encoding=encoding)
return _find_items(tree, URI.get_domain(location))


class Item(object):
Expand All @@ -29,15 +38,15 @@ class Item(object):
or another Item.
"""

def __init__(self, itemtype=None, itemid=None):
def __init__(self, itemtype=None, itemid=None, domain=""):
"""Create an Item, with an optional itemptype and/or itemid.
"""
# itemtype can be a space delimited list
if itemtype:
self.itemtype = [URI(i) for i in itemtype.split(" ")]
self.itemtype = [URI(i, domain=domain) for i in itemtype.split(" ")]

if itemid:
self.itemid = URI(itemid)
self.itemid = URI(itemid, domain=domain)

self.props = {}

Expand Down Expand Up @@ -104,8 +113,11 @@ def json_dict(self):

class URI(object):

def __init__(self, string):
self.string = string
def __init__(self, string, domain=""):
if string.startswith("http://") or string.startswith("https://"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels like it's duplicating the stdlib urljoin semantics:

>>> from urlparse import urljoin
>>> urljoin('https://example.com/foo/bar', '/baaz')
'https://example.com/baaz'
>>> urljoin('https://example.com/foo/bar', 'http://quux/baaz')
'http://quux/baaz'

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A few questions:

  1. Judging by the .travis.yml this project is officially on Python 3. As such I should use urllib.parse right?
  2. Sorry I'm not sure I agree with you that this bit is duplicating urljoin semantics. This conditional just ensures that the string starts with either "http" or "https". urljoin does not do that:
Python 3.5.2 (default, Nov 12 2018, 13:43:14) 
[GCC 5.4.0 20160609] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from urllib.parse import urljoin
>>> urljoin("github.com", "edsu")
'edsu'

Can you clarify if I'm missing something here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

urljoin expects its first parameter to be a URL:

>>> from urllib.parse import urljoin
>>> urljoin("https://github.com", "edsu")
'https://github.com/edsu'

self.string = string
else:
self.string = "/".join(("http:", "", domain, string))

def __eq__(self, other):
if isinstance(other, URI):
Expand All @@ -115,6 +127,15 @@ def __eq__(self, other):
def __repr__(self):
return self.string

@staticmethod
def get_domain(url_string):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps this should be replaced with something using urlparse, which returns a handy named tuple:

>>> urlparse('https://example.com/foo/bar')
ParseResult(scheme='https', netloc='example.com', path='/foo/bar', params='', query='', fragment='')

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used urlparse inside this method instead of manually splitting and joining but urlparse can't be a drop-in replacement for what I want to achieve here:

>>> urlparse("github.com/edsu/microdata/pull/32/files")
ParseResult(scheme='', netloc='', path='github.com/edsu/microdata/pull/32/files', params='', query='', fragment='')

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

urlparse expects a URL:

>>> urlparse("https://github.com/edsu/microdata/pull/32/files")
ParseResult(scheme='https', netloc='github.com', path='/edsu/microdata/pull/32/files', params='', query='', fragment='')

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not really knowledgeable about the field here but my point is, are we sure to never encounter malformed-URLs ? I get it that urllib expects well-formed URLs but shouldn't a microdata parser be more forgiving? Hence I manually check and adapt in case the string leaves out the protocol part.

"""
Get the domain _including_ the protocol specified, if any.
"""
if "://" in url_string:
return "/".join(url_string.split("/")[0:3])
else:
return url_string.split("/")[0]

# what follows are the guts of extracting the Items from a DOM

Expand All @@ -134,23 +155,23 @@ def __repr__(self):
}


def _find_items(e, domain=""):
    """Recursively collect microdata Items from DOM node e.

    domain is threaded through to URI so relative itemtype/itemid/href
    values can be absolutized.
    """
    items = []
    unlinked = []
    if _is_element(e) and _is_itemscope(e):
        item = _make_item(e, domain=domain)
        # _extract returns elements that looked like microdata but were
        # not directly related to this item; walk them separately
        unlinked = _extract(e, item, domain=domain)
        items.append(item)
        for unlinked_element in unlinked:
            items.extend(_find_items(unlinked_element, domain=domain))
    else:
        for child in e.childNodes:
            items.extend(_find_items(child, domain=domain))

    return items


def _extract(e, item):
def _extract(e, item, domain=""):
# looks in a DOM element for microdata to assign to an Item
# _extract returns a list of elements which appeared to have microdata
# but which were not directly related to the Item that was passed in
Expand All @@ -160,19 +181,19 @@ def _extract(e, item):
itemprop = _attr(child, "itemprop")
itemscope = _is_itemscope(child)
if itemprop and itemscope:
nested_item = _make_item(child)
unlinked.extend(_extract(child, nested_item))
nested_item = _make_item(child, domain=domain)
unlinked.extend(_extract(child, nested_item, domain=domain))
item.set(itemprop, nested_item)
elif itemprop:
value = _property_value(child)
value = _property_value(child, domain=domain)
# itemprops may also be in a space delimited list
for i in itemprop.split(" "):
item.set(i, value)
unlinked.extend(_extract(child, item))
unlinked.extend(_extract(child, item, domain=domain))
elif itemscope:
unlinked.append(child)
else:
unlinked.extend(_extract(child, item))
unlinked.extend(_extract(child, item, domain=domain))

return unlinked

Expand All @@ -193,11 +214,11 @@ def _is_itemscope(e):
return _attr(e, "itemscope") is not None


def _property_value(e):
def _property_value(e, domain=""):
value = None
attrib = property_values.get(e.tagName, None)
if attrib in ["href", "src"]:
value = URI(e.getAttribute(attrib))
value = URI(e.getAttribute(attrib), domain)
elif attrib:
value = e.getAttribute(attrib)
else:
Expand All @@ -216,20 +237,15 @@ def _text(e):
return ''.join(chunks)


def _make_item(e, domain=""):
    """Build an Item from an element carrying the itemscope attribute.

    Raises Exception when the element has no itemscope. domain is
    forwarded to Item so it can absolutize relative URIs.
    """
    if not _is_itemscope(e):
        raise Exception("element is not an Item")
    itemtype = _attr(e, "itemtype")
    itemid = _attr(e, "itemid")
    return Item(itemtype, itemid, domain=domain)


if __name__ == "__main__":
try:
from urllib.request import urlopen
except ImportError:
from urllib import urlopen

if len(sys.argv) < 2:
print("Usage: %s URL [...]" % sys.argv[0])
sys.exit(1)
Expand All @@ -240,7 +256,7 @@ def _make_item(e):
microdata = {}
microdata['items'] = items = []

for item in get_items(urlopen(url)):
for item in get_items(url):
items.append(item.json_dict())

print(json.dumps(microdata, indent=2))
26 changes: 19 additions & 7 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class MicrodataParserTest(unittest.TestCase):
def test_parse(self):

# parse the html for microdata
items = get_items(open("test-data/example.html"))
items = get_items("test-data/example.html")

# this html should have just one main item
self.assertTrue(len(items), 1)
Expand Down Expand Up @@ -55,7 +55,7 @@ def test_parse(self):
def test_parse_nested(self):

# parse the html for microdata
items = get_items(open("test-data/example-nested.html"))
items = get_items("test-data/example-nested.html")

# this html should have just one main item
self.assertTrue(len(items), 1)
Expand All @@ -71,7 +71,7 @@ def test_parse_nested(self):
# test case of a nested itemscope
self.assertTrue(isinstance(item.location, Item))
self.assertEqual(item.location.itemtype, [URI("http://schema.org/Place")])
self.assertEqual(item.location.url, URI("wells-fargo-center.html"))
self.assertEqual(item.location.url, URI("wells-fargo-center.html", domain="test-data"))

# address should be a nested item
self.assertTrue(isinstance(item.location.address, Item))
Expand All @@ -82,14 +82,14 @@ def test_parse_nested(self):
i = json.loads(item.json())
self.assertEqual(i["properties"]["name"][0].strip(), "Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1)")
self.assertEqual(i["type"], ["http://schema.org/Event"])
self.assertEqual(i["properties"]["url"], ["nba-miami-philidelphia-game3.html"])
self.assertEqual(i["properties"]["url"], ["http://test-data/nba-miami-philidelphia-game3.html"])
self.assertTrue(isinstance(i["properties"]["location"][0], dict))
self.assertEqual(i["properties"]["location"][0]["properties"]["url"][0], "wells-fargo-center.html")
self.assertEqual(i["properties"]["location"][0]["properties"]["url"][0], "http://test-data/wells-fargo-center.html")
self.assertTrue(isinstance(i["properties"]["location"][0]["properties"]["address"][0], dict))
self.assertEqual(i["properties"]["location"][0]["properties"]["address"][0]["properties"]["addressLocality"][0], "Philadelphia")

def test_parse_unlinked(self):
items = get_items(open("test-data/unlinked.html"))
items = get_items("test-data/unlinked.html")
self.assertEqual(len(items), 2)

i = items[0]
Expand All @@ -108,10 +108,22 @@ def test_parse_unlinked(self):
self.assertTrue('Whitworth' in i.streetAddress)

def test_skip_level(self):
    # get_items now takes a location string (URL or local path)
    # rather than an open file object
    items = get_items("test-data/skip-level.html")
    self.assertEqual(len(items), 1)
    self.assertEqual(items[0].name, "Jane Doe")


class URITest(unittest.TestCase):

    def test_get_domain(self):
        # a full URL keeps its scheme
        self.assertEqual(
            URI.get_domain("https://github.com/edsu/microdata"),
            "https://github.com",
        )
        # a scheme-less URL: domain is the leading path segment
        self.assertEqual(
            URI.get_domain("github.com/edsu/microdata"),
            "github.com",
        )
        # a bare domain comes back unchanged
        self.assertEqual(URI.get_domain("github.com"), "github.com")


if __name__ == "__main__":
Expand Down