-
Notifications
You must be signed in to change notification settings - Fork 37
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Relative to absolute #32
base: master
Are you sure you want to change the base?
Changes from 7 commits
500cf9c
b9e3ff7
b1a7342
d457566
b577bd5
7be3063
28977ec
744e466
3602f31
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,10 +16,19 @@ def get_items(location, encoding=None): | |
Pass in a string or file-like object and get a list of Items present in the | ||
HTML document. | ||
""" | ||
try: | ||
from urllib.request import urlopen | ||
except ImportError: | ||
from urllib import urlopen | ||
|
||
dom_builder = html5lib.treebuilders.getTreeBuilder("dom") | ||
parser = html5lib.HTMLParser(tree=dom_builder) | ||
tree = parser.parse(location, encoding=encoding) | ||
return _find_items(tree) | ||
try: | ||
tree = parser.parse(urlopen(location), encoding=encoding) | ||
except ValueError: | ||
# Try opening it as a local file | ||
tree = parser.parse(open(location), encoding=encoding) | ||
return _find_items(tree, URI.get_domain(location)) | ||
|
||
|
||
class Item(object): | ||
|
@@ -29,15 +38,15 @@ class Item(object): | |
or another Item. | ||
""" | ||
|
||
def __init__(self, itemtype=None, itemid=None): | ||
def __init__(self, itemtype=None, itemid=None, domain=""): | ||
"""Create an Item, with an optional itemptype and/or itemid. | ||
""" | ||
# itemtype can be a space delimited list | ||
if itemtype: | ||
self.itemtype = [URI(i) for i in itemtype.split(" ")] | ||
self.itemtype = [URI(i, domain=domain) for i in itemtype.split(" ")] | ||
|
||
if itemid: | ||
self.itemid = URI(itemid) | ||
self.itemid = URI(itemid, domain=domain) | ||
|
||
self.props = {} | ||
|
||
|
@@ -104,8 +113,11 @@ def json_dict(self): | |
|
||
class URI(object): | ||
|
||
def __init__(self, string): | ||
self.string = string | ||
def __init__(self, string, domain=""): | ||
if string.startswith("http://") or string.startswith("https://"): | ||
self.string = string | ||
else: | ||
self.string = "/".join(("http:", "", domain, string)) | ||
|
||
def __eq__(self, other): | ||
if isinstance(other, URI): | ||
|
@@ -115,6 +127,15 @@ def __eq__(self, other): | |
def __repr__(self): | ||
return self.string | ||
|
||
@staticmethod | ||
def get_domain(url_string): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Perhaps this should be replaced with something using `urlparse`:
>>> urlparse('https://example.com/foo/bar')
ParseResult(scheme='https', netloc='example.com', path='/foo/bar', params='', query='', fragment='')
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I used
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. urlparse expects a URL:
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not really knowledgeable about this field, but my point is: are we sure we will never encounter malformed URLs? I get that urllib expects well-formed URLs, but shouldn't a microdata parser be more forgiving? Hence I manually check and adapt in case the string leaves out the protocol part. |
||
""" | ||
Get the domain _including_ the protocol specified, if any. | ||
""" | ||
if "://" in url_string: | ||
return "/".join(url_string.split("/")[0:3]) | ||
else: | ||
return url_string.split("/")[0] | ||
|
||
# what follows are the guts of extracting the Items from a DOM | ||
|
||
|
@@ -134,23 +155,23 @@ def __repr__(self): | |
} | ||
|
||
|
||
def _find_items(e): | ||
def _find_items(e, domain=""): | ||
items = [] | ||
unlinked = [] | ||
if _is_element(e) and _is_itemscope(e): | ||
item = _make_item(e) | ||
unlinked = _extract(e, item) | ||
item = _make_item(e, domain=domain) | ||
unlinked = _extract(e, item, domain=domain) | ||
items.append(item) | ||
for unlinked_element in unlinked: | ||
items.extend(_find_items(unlinked_element)) | ||
items.extend(_find_items(unlinked_element, domain=domain)) | ||
else: | ||
for child in e.childNodes: | ||
items.extend(_find_items(child)) | ||
items.extend(_find_items(child, domain=domain)) | ||
|
||
return items | ||
|
||
|
||
def _extract(e, item): | ||
def _extract(e, item, domain=""): | ||
# looks in a DOM element for microdata to assign to an Item | ||
# _extract returns a list of elements which appeared to have microdata | ||
# but which were not directly related to the Item that was passed in | ||
|
@@ -160,19 +181,19 @@ def _extract(e, item): | |
itemprop = _attr(child, "itemprop") | ||
itemscope = _is_itemscope(child) | ||
if itemprop and itemscope: | ||
nested_item = _make_item(child) | ||
unlinked.extend(_extract(child, nested_item)) | ||
nested_item = _make_item(child, domain=domain) | ||
unlinked.extend(_extract(child, nested_item, domain=domain)) | ||
item.set(itemprop, nested_item) | ||
elif itemprop: | ||
value = _property_value(child) | ||
value = _property_value(child, domain=domain) | ||
# itemprops may also be in a space delimited list | ||
for i in itemprop.split(" "): | ||
item.set(i, value) | ||
unlinked.extend(_extract(child, item)) | ||
unlinked.extend(_extract(child, item, domain=domain)) | ||
elif itemscope: | ||
unlinked.append(child) | ||
else: | ||
unlinked.extend(_extract(child, item)) | ||
unlinked.extend(_extract(child, item, domain=domain)) | ||
|
||
return unlinked | ||
|
||
|
@@ -193,11 +214,11 @@ def _is_itemscope(e): | |
return _attr(e, "itemscope") is not None | ||
|
||
|
||
def _property_value(e): | ||
def _property_value(e, domain=""): | ||
value = None | ||
attrib = property_values.get(e.tagName, None) | ||
if attrib in ["href", "src"]: | ||
value = URI(e.getAttribute(attrib)) | ||
value = URI(e.getAttribute(attrib), domain) | ||
elif attrib: | ||
value = e.getAttribute(attrib) | ||
else: | ||
|
@@ -216,20 +237,15 @@ def _text(e): | |
return ''.join(chunks) | ||
|
||
|
||
def _make_item(e): | ||
def _make_item(e, domain=""): | ||
if not _is_itemscope(e): | ||
raise Exception("element is not an Item") | ||
itemtype = _attr(e, "itemtype") | ||
itemid = _attr(e, "itemid") | ||
return Item(itemtype, itemid) | ||
return Item(itemtype, itemid, domain=domain) | ||
|
||
|
||
if __name__ == "__main__": | ||
try: | ||
from urllib.request import urlopen | ||
except ImportError: | ||
from urllib import urlopen | ||
|
||
if len(sys.argv) < 2: | ||
print("Usage: %s URL [...]" % sys.argv[0]) | ||
sys.exit(1) | ||
|
@@ -240,7 +256,7 @@ def _make_item(e): | |
microdata = {} | ||
microdata['items'] = items = [] | ||
|
||
for item in get_items(urlopen(url)): | ||
for item in get_items(url): | ||
items.append(item.json_dict()) | ||
|
||
print(json.dumps(microdata, indent=2)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This feels like it's duplicating the stdlib `urljoin` semantics:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A few questions:
1. According to `.travis.yml`, this project is officially on Python 3. As such I should use `urllib.parse`, right?
2. I don't see how this duplicates `urljoin` semantics. This conditional just ensures that the string starts with either "http" or "https". `urljoin` does not do that:
Can you clarify if I'm missing something here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
urljoin expects its first parameter to be a URL: