From b61a10db24f635f4fae6f3c5b210c7ce8b54750a Mon Sep 17 00:00:00 2001
From: Josh Kuhn
Date: Mon, 31 Mar 2014 22:44:56 -0400
Subject: [PATCH] Initial work on a regex implementation.

---
 test/Makefile              |   5 +-
 test/regex_test.py         | 184 +++++++++++++++++++++++++++++++++++++
 uritemplate/__init__.py    |   2 +-
 uritemplate/uritemplate.py | 117 +++++++++++++++++++++++
 4 files changed, 306 insertions(+), 2 deletions(-)
 create mode 100644 test/regex_test.py

diff --git a/test/Makefile b/test/Makefile
index 85d5ee9..7b9813a 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,7 +1,7 @@
 
 LEVEL = 4
 
-test: spec-examples spec-examples-by-section extended variables
+test: spec-examples spec-examples-by-section extended variables regex
 
 spec-examples:
 	PYTHONPATH=..:$(PYTHONPATH) python uritemplate_test.py cases/spec-examples.json $(LEVEL)
@@ -17,3 +17,6 @@ negative:
 
 variables:
 	PYTHONPATH=..:$(PYTHONPATH) python variables_test.py
+
+regex:
+	PYTHONPATH=..:$(PYTHONPATH) python regex_test.py
diff --git a/test/regex_test.py b/test/regex_test.py
new file mode 100644
index 0000000..f93ad69
--- /dev/null
+++ b/test/regex_test.py
@@ -0,0 +1,184 @@
+'''
+Tests related to the as_regex function. Uses the same testcases as the
+expand function, but repurposes them to ensure the regex created
+produces the right output.
+'''
+import sys
+from os.path import join, dirname
+try:
+    import json
+except ImportError:
+    import simplejson as json
+import urllib
+import traceback
+import pdb
+
+import uritemplate
+
+TESTFILES = [
+    'spec-examples.json',
+    'spec-examples-by-section.json',
+    'extended-tests.json',
+]
+
+
+def correct_answers(var):
+    '''Take a variable and produce a list of possibly correct
+    answers'''
+    safe = ":/?#[]@!$&'()*+,;="
+    def quote(v):
+        v = '' if v is None else v
+        return urllib.quote(str(v), '')
+
+    def restrictquote(v):
+        v = '' if v is None else v
+        return urllib.quote(str(v), safe)
+
+    if isinstance(var, list):
+        return [','.join(map(restrictquote, var)),
+                ','.join(map(quote, var))]
+    elif isinstance(var, dict):
+        return [urllib.urlencode(var, safe),
+                ','.join(restrictquote(v)
+                         for item in var.iteritems() for v in item),
+                ','.join(quote(v) for item in var.iteritems() for v in item),
+                ]
+    else:
+        return [restrictquote(var), quote(var)]
+
+def _print_level(level, prefix):
+    '''Build a method that prints only when self.verbosity >= level'''
+    def _print_method(self, tpl, *args, **kwargs):
+        if self.verbosity >= level:
+            print prefix, tpl.format(*args, **kwargs)
+    return _print_method
+
+
+class TestRunner(object):
+    '''Runs the testcases in TESTFILES against uritemplate.as_regex'''
+    def __init__(self, verbosity=0, one_failure=False, fail_into_pdb=False):
+        self.verbosity = verbosity
+        self.one_failure = one_failure
+        self.fail_into_pdb = fail_into_pdb
+
+        self.failures = 0
+        self.successes = 0
+
+    print1 = _print_level(1, '||')
+    print2 = _print_level(2, ';;')
+    print3 = _print_level(3, ',,')
+    print4 = _print_level(4, '. ')
+
+    def main(self):
+        '''Run every file in TESTFILES, then exit through finish()'''
+        cases_dir = join(dirname(__file__), 'cases')
+        for testfile in TESTFILES:
+            self.print2('Running Testfile: {0}', testfile)
+            self.print2('=' * 80)
+            with open(join(cases_dir, testfile), 'r') as tf:
+                self.test_document(json.load(tf))
+        self.finish()
+
+    def test_document(self, test_doc):
+        '''Run every testcase in one parsed JSON document, counting results'''
+        for testname, testdef in sorted(test_doc.iteritems()):
+            fails, succeeds = 0, 0
+            self.print2('{0}:', testname)
+            variables = testdef['variables']
+            testcases = testdef['testcases']
+            for major_num, (template, inputs) in enumerate(testcases, 1):
+                if not isinstance(inputs, list):
+                    # Correct for multiple 'expected'
+                    inputs = [inputs]
+                for minor_num, to_match in enumerate(inputs, 1):
+                    self.print3(' Case # {0}.{1}', major_num, minor_num)
+                    if not self.test(variables, template, to_match):
+                        fails += 1
+                        self.failures += 1
+                        if self.one_failure:
+                            self.finish()
+                    else:
+                        self.successes += 1
+                        succeeds += 1
+            self.print2(" {0} Successes, {1} Failures", succeeds, fails)
+
+    def finish(self, final=False):
+        '''Print the totals and exit with the failure count as status'''
+        self.print1('{0} tests succeeded.', self.successes)
+        self.print1('{0} tests failed', self.failures)
+        # Exit statuses are taken modulo 256, so clamp to keep failure != 0
+        sys.exit(min(self.failures, 255))
+
+    def test(self, variables, template, to_match):
+        '''Return True if the regex built from template matches to_match
+        and every captured variable agrees with variables'''
+        self.print4("'{0}' matching '{1}'", template, to_match)
+        try:
+            testvars = uritemplate.variables(template)
+            regex = uritemplate.as_regex(template)
+        except Exception as e:
+            if self.fail_into_pdb:
+                pdb.post_mortem()
+            self.print4(traceback.format_exc())
+            self.print3(' Failed with: ' + repr(e))
+            return False
+        self.print4('Regex is: {0}', regex.pattern)
+
+        try:
+            matchvars = regex.match(to_match).groupdict()
+        except AttributeError:  # regex.match() returned None
+            if self.fail_into_pdb:
+                pdb.post_mortem()
+            self.print3(' Failed with: Regex did not match expected')
+            return False
+
+        for var in testvars:
+            match_var = matchvars.get(var)
+            if not self.matches(match_var, variables[var], var):
+                if self.fail_into_pdb:
+                    pdb.set_trace()
+                return False
+        return True
+
+    def matches(self, match_var, expect_var, varname):
+        '''True if match_var is a prefix of an accepted form of expect_var'''
+        possible_correct = correct_answers(expect_var)
+        for answer in possible_correct:
+            # match_var is None when groupdict() had no value for varname
+            if match_var is not None and answer.startswith(match_var):
+                result = True
+                break
+        else:
+            result = False
+            self.print3_expectation(match_var, possible_correct, varname)
+        return result
+
+    def print3_expectation(self, match_var, answers, var):
+        '''At verbosity 3, print what was expected for var and what matched'''
+        if len(set(answers)) == 1:
+            outstring = " For '{var}' expected '{answers[0]}',"\
+                        " got '{match_var}'"
+        else:
+            outstring = " For '{var}' expected one of {answers!r}"\
+                        ", got '{match_var}'"
+        self.print3(
+            outstring, var=var, match_var=match_var, answers=answers)
+
+
+if __name__ == "__main__":
+    verbosity, one_failure = 0, False
+    if '-1' in sys.argv:
+        verbosity = 1
+    if '-2' in sys.argv:
+        verbosity = 2
+    if '-3' in sys.argv:
+        verbosity = 3
+    if '-4' in sys.argv:
+        verbosity = 4
+    TR = TestRunner(
+        verbosity=verbosity,
+        one_failure='-x' in sys.argv,
+        fail_into_pdb='-pdb' in sys.argv,
+    )
+    TR.main()
+
diff --git a/uritemplate/__init__.py b/uritemplate/__init__.py
index dc646f9..1d02923 100644
--- a/uritemplate/__init__.py
+++ b/uritemplate/__init__.py
@@ -1,4 +1,4 @@
-from uritemplate import expand, variables
+from uritemplate import expand, variables, as_regex
 
 __version__ = "0.5.2"
 
diff --git a/uritemplate/uritemplate.py b/uritemplate/uritemplate.py
index d8c7f1a..5da947a 100644
--- a/uritemplate/uritemplate.py
+++ b/uritemplate/uritemplate.py
@@ -46,6 +46,123 @@ def variables(template):
     return vars
 
 
+class Regexifier(object):
+    '''Translates a URI template into a regex with named groups'''
+
+    # "%" followed by two hex digits (RFC 3986 pct-encoded)
+    PERCENT_ENCODED = r'%[0-9A-Fa-f][0-9A-Fa-f]'
+    # Add ',' to unreserved since a single variable can be expanded to
+    # multiple values if a composite value is passed in
+    UNRESERVED = r'''[a-zA-Z0-9_.~\-,]'''
+    RESERVED = r'''[:/?#\[\]@!$&'()*+,;=]'''
+
+    @classmethod
+    def capture(cls, varname, op=None, joiner=',', cap=None, explode=False):
+        '''Returns a string for a variable capture regex for the given
+        operation. cap (a string) bounds the length; explode is unused'''
+        if op in ('+', '#'):
+            allowed = '|'.join([cls.UNRESERVED.replace(joiner, ''),
+                                cls.RESERVED.replace(joiner, ''),
+                                cls.PERCENT_ENCODED])
+        else:
+            allowed = '|'.join([cls.UNRESERVED.replace(op or '', ''),
+                                cls.PERCENT_ENCODED])
+
+        return r'''{key}(?P<{varname}>(?:{allowed}){repeat})'''.format(
+            key=varname + '=?' if cls.is_keyval(op) else '',
+            varname=varname,
+            allowed=allowed,
+            repeat='{0,' + cap + '}' if cap else '*'
+        )
+
+    @classmethod
+    def is_keyval(cls, op):
+        '''True if op makes capture() prefix the group with "varname="'''
+        return op in (';', '?', '&')
+
+    @classmethod
+    def escape_or_substitute(cls, section):
+        '''If the section passed in is a variable expression, replace
+        it with a variable capture group. Otherwise, escape it so
+        characters don't interfere with the final regex'''
+        match = TEMPLATE.match(section)
+        if not match:
+            return re.escape(section)
+        else:
+            return cls.process_expression(match.group(1))
+
+    @classmethod
+    def process_expression(cls, expression):
+        '''Breaks a variable expression into its parts and creates the
+        proper regex for them'''
+        if expression[0] in OPERATOR:
+            op = expression[0]
+            expression = expression[1:]
+        else:
+            op = ''
+        expressions = cls.split_vars(expression)
+        joiner = cls.joiner_for(op)
+        prefix = cls.prefix_for(op)
+        pieces = (cls.capture(varname, op, joiner, cap, explode)
+                  for varname, cap, explode in expressions)
+        # Need to escape joiner since some of them have regex meaning
+        return prefix + ('\\' + joiner).join(pieces)
+
+    @classmethod
+    def split_vars(cls, expression):
+        '''Yields a (name, cap, explode) tuple per comma-separated varspec'''
+        vars = expression.split(',')
+        def cap_or_none(var):
+            explode = False
+            try:
+                var, cap = var.split(':')
+            except ValueError:  # no ':' prefix modifier (or more than one)
+                var, cap = var, None
+            if var.endswith('*'):
+                explode = True
+                var = var[:-1]
+            return var, cap, explode
+        return (cap_or_none(var) for var in vars)
+
+    @classmethod
+    def joiner_for(cls, op):
+        '''Returns the separator between the variables of an op expression'''
+        if op in ('?', '&'):
+            return '&'
+        elif op in ('.', '/', ';'):
+            return op
+        else:
+            return ','
+
+    @classmethod
+    def prefix_for(cls, op):
+        '''Returns the escaped leading character for op, or '' if none'''
+        if op in ('#', '?', '.', '/', ';', '&'):
+            return '\\' + op
+        else:
+            return ''
+
+    @classmethod
+    def explode(cls, string):
+        '''Explodes a string based on variable expressions.'''
+        var_expr = re.compile(r'({[^\\}]+})')
+        return var_expr.split(string)
+
+    @classmethod
+    def regexify(cls, uritemplate):
+        '''Converts the given uritemplate to a compiled regex
+        with named capture groups for each template variable'''
+        exploded = cls.explode(uritemplate)
+        return re.compile(''.join(
+            cls.escape_or_substitute(section) for section in exploded))
+
+
+def as_regex(uritemplate):
+    '''Returns a regex matching the given template, with capture
+    groups named after the template variable names'''
+    return Regexifier.regexify(uritemplate)
+
+
 def _quote(value, safe, prefix=None):
     if prefix is not None:
         return quote(str(value)[:prefix], safe)