Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Relative to absolute #32

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 44 additions & 28 deletions microdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,19 @@ def get_items(location, encoding=None):
Pass in a string or file-like object and get a list of Items present in the
HTML document.
"""
try:
from urllib.request import urlopen
except ImportError:
from urllib import urlopen

dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
parser = html5lib.HTMLParser(tree=dom_builder)
tree = parser.parse(location, encoding=encoding)
return _find_items(tree)
try:
tree = parser.parse(urlopen(location), encoding=encoding)
except ValueError:
# Try opening it as a local file
tree = parser.parse(open(location), encoding=encoding)
return _find_items(tree, URI.get_domain(location))


class Item(object):
Expand All @@ -29,15 +38,15 @@ class Item(object):
or another Item.
"""

def __init__(self, itemtype=None, itemid=None):
def __init__(self, itemtype=None, itemid=None, domain=""):
"""Create an Item, with an optional itemptype and/or itemid.
"""
# itemtype can be a space delimited list
if itemtype:
self.itemtype = [URI(i) for i in itemtype.split(" ")]
self.itemtype = [URI(i, domain=domain) for i in itemtype.split(" ")]

if itemid:
self.itemid = URI(itemid)
self.itemid = URI(itemid, domain=domain)

self.props = {}

Expand Down Expand Up @@ -104,8 +113,11 @@ def json_dict(self):

class URI(object):

def __init__(self, string):
self.string = string
def __init__(self, string, domain=""):
if string.startswith("http://") or string.startswith("https://"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels like it's duplicating the stdlib urljoin semantics:

>>> from urlparse import urljoin
>>> urljoin('https://example.com/foo/bar', '/baaz')
'https://example.com/baaz'
>>> urljoin('https://example.com/foo/bar', 'http://quux/baaz')
'http://quux/baaz'

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A few questions:

  1. Judging by the .travis.yml this project is officially on Python 3. As such I should use urllib.parse right?
  2. Sorry I'm not sure I agree with you that this bit is duplicating urljoin semantics. This conditional just ensures that the string starts with either "http" or "https". urljoin does not do that:
Python 3.5.2 (default, Nov 12 2018, 13:43:14) 
[GCC 5.4.0 20160609] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from urllib.parse import urljoin
>>> urljoin("github.com", "edsu")
'edsu'

Can you clarify if I'm missing something here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

urljoin expects its first parameter to be a URL:

>>> from urllib.parse import urljoin
>>> urljoin("https://github.com", "edsu")
'https://github.com/edsu'

self.string = string
else:
self.string = "/".join(("http:", "", domain, string))

def __eq__(self, other):
if isinstance(other, URI):
Expand All @@ -115,6 +127,15 @@ def __eq__(self, other):
def __repr__(self):
return self.string

@staticmethod
def get_domain(url_string):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps this should be replaced with something using urlparse, which returns a handy named tuple:

>>> urlparse('https://example.com/foo/bar')
ParseResult(scheme='https', netloc='example.com', path='/foo/bar', params='', query='', fragment='')

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used urlparse inside this method instead of manually splitting and joining but urlparse can't be a drop-in replacement for what I want to achieve here:

>>> urlparse("github.com/edsu/microdata/pull/32/files")
ParseResult(scheme='', netloc='', path='github.com/edsu/microdata/pull/32/files', params='', query='', fragment='')

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

urlparse expects a URL:

>>> urlparse("https://github.com/edsu/microdata/pull/32/files")
ParseResult(scheme='https', netloc='github.com', path='/edsu/microdata/pull/32/files', params='', query='', fragment='')

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not really knowledgeable about the field here but my point is, are we sure to never encounter malformed-URLs ? I get it that urllib expects well-formed URLs but shouldn't a microdata parser be more forgiving? Hence I manually check and adapt in case the string leaves out the protocol part.

"""
Get the domain _including_ the protocol specified, if any.
"""
if "://" in url_string:
return "/".join(url_string.split("/")[0:3])
else:
return url_string.split("/")[0]

# what follows are the guts of extracting the Items from a DOM

Expand All @@ -134,23 +155,23 @@ def __repr__(self):
}


def _find_items(e, domain=""):
    """Recursively collect microdata Items from DOM node e.

    domain is threaded through to URI so relative itemtype/itemid/href
    values can be absolutized.
    """
    items = []
    unlinked = []
    if _is_element(e) and _is_itemscope(e):
        item = _make_item(e, domain=domain)
        # _extract returns elements that looked like microdata but were
        # not directly related to this item; walk them separately
        unlinked = _extract(e, item, domain=domain)
        items.append(item)
        for unlinked_element in unlinked:
            items.extend(_find_items(unlinked_element, domain=domain))
    else:
        for child in e.childNodes:
            items.extend(_find_items(child, domain=domain))

    return items


def _extract(e, item):
def _extract(e, item, domain=""):
# looks in a DOM element for microdata to assign to an Item
# _extract returns a list of elements which appeared to have microdata
# but which were not directly related to the Item that was passed in
Expand All @@ -160,19 +181,19 @@ def _extract(e, item):
itemprop = _attr(child, "itemprop")
itemscope = _is_itemscope(child)
if itemprop and itemscope:
nested_item = _make_item(child)
unlinked.extend(_extract(child, nested_item))
nested_item = _make_item(child, domain=domain)
unlinked.extend(_extract(child, nested_item, domain=domain))
item.set(itemprop, nested_item)
elif itemprop:
value = _property_value(child)
value = _property_value(child, domain=domain)
# itemprops may also be in a space delimited list
for i in itemprop.split(" "):
item.set(i, value)
unlinked.extend(_extract(child, item))
unlinked.extend(_extract(child, item, domain=domain))
elif itemscope:
unlinked.append(child)
else:
unlinked.extend(_extract(child, item))
unlinked.extend(_extract(child, item, domain=domain))

return unlinked

Expand All @@ -193,11 +214,11 @@ def _is_itemscope(e):
return _attr(e, "itemscope") is not None


def _property_value(e):
def _property_value(e, domain=""):
value = None
attrib = property_values.get(e.tagName, None)
if attrib in ["href", "src"]:
value = URI(e.getAttribute(attrib))
value = URI(e.getAttribute(attrib), domain)
elif attrib:
value = e.getAttribute(attrib)
else:
Expand All @@ -216,20 +237,15 @@ def _text(e):
return ''.join(chunks)


def _make_item(e, domain=""):
    """Build an Item from an element carrying the itemscope attribute.

    Raises Exception when the element has no itemscope. domain is
    forwarded to Item so it can absolutize relative URIs.
    """
    if not _is_itemscope(e):
        raise Exception("element is not an Item")
    itemtype = _attr(e, "itemtype")
    itemid = _attr(e, "itemid")
    return Item(itemtype, itemid, domain=domain)


if __name__ == "__main__":
try:
from urllib.request import urlopen
except ImportError:
from urllib import urlopen

if len(sys.argv) < 2:
print("Usage: %s URL [...]" % sys.argv[0])
sys.exit(1)
Expand All @@ -240,7 +256,7 @@ def _make_item(e):
microdata = {}
microdata['items'] = items = []

for item in get_items(urlopen(url)):
for item in get_items(url):
items.append(item.json_dict())

print(json.dumps(microdata, indent=2))
26 changes: 19 additions & 7 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class MicrodataParserTest(unittest.TestCase):
def test_parse(self):

# parse the html for microdata
items = get_items(open("test-data/example.html"))
items = get_items("test-data/example.html")

# this html should have just one main item
self.assertTrue(len(items), 1)
Expand Down Expand Up @@ -55,7 +55,7 @@ def test_parse(self):
def test_parse_nested(self):

# parse the html for microdata
items = get_items(open("test-data/example-nested.html"))
items = get_items("test-data/example-nested.html")

# this html should have just one main item
self.assertTrue(len(items), 1)
Expand All @@ -71,7 +71,7 @@ def test_parse_nested(self):
# test case of a nested itemscope
self.assertTrue(isinstance(item.location, Item))
self.assertEqual(item.location.itemtype, [URI("http://schema.org/Place")])
self.assertEqual(item.location.url, URI("wells-fargo-center.html"))
self.assertEqual(item.location.url, URI("wells-fargo-center.html", domain="test-data"))

# address should be a nested item
self.assertTrue(isinstance(item.location.address, Item))
Expand All @@ -82,14 +82,14 @@ def test_parse_nested(self):
i = json.loads(item.json())
self.assertEqual(i["properties"]["name"][0].strip(), "Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1)")
self.assertEqual(i["type"], ["http://schema.org/Event"])
self.assertEqual(i["properties"]["url"], ["nba-miami-philidelphia-game3.html"])
self.assertEqual(i["properties"]["url"], ["http://test-data/nba-miami-philidelphia-game3.html"])
self.assertTrue(isinstance(i["properties"]["location"][0], dict))
self.assertEqual(i["properties"]["location"][0]["properties"]["url"][0], "wells-fargo-center.html")
self.assertEqual(i["properties"]["location"][0]["properties"]["url"][0], "http://test-data/wells-fargo-center.html")
self.assertTrue(isinstance(i["properties"]["location"][0]["properties"]["address"][0], dict))
self.assertEqual(i["properties"]["location"][0]["properties"]["address"][0]["properties"]["addressLocality"][0], "Philadelphia")

def test_parse_unlinked(self):
items = get_items(open("test-data/unlinked.html"))
items = get_items("test-data/unlinked.html")
self.assertEqual(len(items), 2)

i = items[0]
Expand All @@ -108,10 +108,22 @@ def test_parse_unlinked(self):
self.assertTrue('Whitworth' in i.streetAddress)

def test_skip_level(self):
    # get_items now takes a location string (URL or local path)
    # rather than an open file object
    items = get_items("test-data/skip-level.html")
    self.assertEqual(len(items), 1)
    self.assertEqual(items[0].name, "Jane Doe")


class URITest(unittest.TestCase):

    def test_get_domain(self):
        # a full URL keeps its scheme
        self.assertEqual(
            URI.get_domain("https://github.com/edsu/microdata"),
            "https://github.com",
        )
        # a scheme-less URL: domain is the leading path segment
        self.assertEqual(
            URI.get_domain("github.com/edsu/microdata"),
            "github.com",
        )
        # a bare domain comes back unchanged
        self.assertEqual(URI.get_domain("github.com"), "github.com")


if __name__ == "__main__":
Expand Down