diff --git a/CHANGELOG.md b/CHANGELOG.md index 0749d1e..0a26fff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Change Log +## [v1.3.1] + +**Release date: 2024-08-06** + +### Fixed + +* Double-quote characters are no longer parsed as roles or symbols ([#143]) + + ## [v1.3.0] **Release date: 2023-11-14** diff --git a/docs/notation.rst b/docs/notation.rst index 29e7a18..0c20c42 100644 --- a/docs/notation.rst +++ b/docs/notation.rst @@ -90,8 +90,10 @@ grammar to allow for surface alignments. Symbol <- NameChar+ Role <- ':' NameChar* Alignment <- '~' ([a-zA-Z] '.'?)? Digit+ (',' Digit+)* - String <- '"' (!'"' ('\\' . / .))* '"' - NameChar <- ![ \n\t\r\f\v()/:~] . + String <- '"' (!'"' (StrEscape / StrChar))* '"' + StrEscape <- '\\' StrChar + StrChar <- ![\n\r\f\v] . + NameChar <- ![ \n\t\r\f\v"()/:~] . Digit <- [0-9] This grammar has some seemingly unnecessary ambiguity in that both the diff --git a/penman/__about__.py b/penman/__about__.py index b9824d9..b0b99b3 100644 --- a/penman/__about__.py +++ b/penman/__about__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -__version__ = '1.3.0' +__version__ = '1.3.1' __version_info__ = tuple( int(x) if x.isdigit() else x for x in __version__.replace('.', ' ').replace('-', ' ').split() diff --git a/penman/_lexer.py b/penman/_lexer.py index 3b7fee0..65eddd2 100644 --- a/penman/_lexer.py +++ b/penman/_lexer.py @@ -22,8 +22,8 @@ 'ALIGNMENT': r'~(?:[a-z]\.?)?[0-9]+(?:,[0-9]+)*', # ROLE cannot be made up of COLON + SYMBOL because it then becomes # difficult to detect anonymous roles: (a : b) vs (a :b c) - 'ROLE': r':[^ \t\r\n\v\f()\/:~]*', - 'SYMBOL': r'[^ \t\r\n\v\f()\/:~]+', + 'ROLE': r':[^ \t\r\n\v\f"()\/:~]*', + 'SYMBOL': r'[^ \t\r\n\v\f"()\/:~]+', 'LPAREN': r'\(', 'RPAREN': r'\)', 'SLASH': r'\/', # concept (node label) role diff --git a/penman/codec.py b/penman/codec.py index 7e147ad..402c435 100644 --- a/penman/codec.py +++ b/penman/codec.py @@ -3,6 +3,7 @@ """ Serialization of PENMAN graphs. """ + from pathlib import Path from typing import IO, Iterable, Iterator, List, Optional, Union diff --git a/pyproject.toml b/pyproject.toml index 85d3a0e..ec85fc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,7 @@ dependencies = [ ] [tool.hatch.envs.dev.scripts] test = "pytest {args:--doctest-glob=*.rst --doctest-modules --ignore-glob=penman/interface.py}" -lint = "ruff {args} penman/" +lint = "ruff check {args} penman/" typecheck = "mypy penman/" format = "ruff format {args} penman/" @@ -75,6 +75,8 @@ clean = "make -C docs clean" [tool.ruff] target-version = "py38" line-length = 79 + +[tool.ruff.lint] select = [ "B", # flake8-bugbear "C90", # McCabe cyclomatic complexity @@ -84,10 +86,8 @@ select = [ "N", # PEP-8 naming "W", # pycodestyle warnings ] - -[tool.ruff.lint.isort] -combine-as-imports = true -force-wrap-aliases = true +isort.combine-as-imports = true +isort.force-wrap-aliases = true [tool.ruff.format] quote-style = "single" diff --git a/tests/test_codec.py b/tests/test_codec.py index 09e9e7b..b6cc9bc 100644 --- a/tests/test_codec.py +++ b/tests/test_codec.py @@ -229,6 +229,16 @@ def test_decode_recursion_limit(self): assert len(g.triples) == (n # n :instance triples + n - 1) # n - 1 :ARG0 triples + def test_decode_issue_143(self): + # https://github.com/goodmami/penman/issues/143 + with pytest.raises(penman.DecodeError): + decode('(a :op ")') + with pytest.raises(penman.DecodeError): + decode('(a :op1 " :op2 "foo")') + with pytest.raises(penman.DecodeError): + decode('(a :" foo)') + + def test_encode(self, x1): # empty graph g = penman.Graph([]) diff --git a/tests/test_lexer.py b/tests/test_lexer.py index a4bcb69..fab4e20 100644 --- a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -74,3 +74,18 @@ def test_nonbreaking_space_issue_99(): assert [tok.type for tok in lexer.lex('1\r2')] == ['SYMBOL', 'SYMBOL'] assert [tok.type for tok in lexer.lex('1\u00a02')] == ['SYMBOL'] assert [tok.type for tok in lexer.lex('あ い')] == ['SYMBOL'] + + +def test_unterminated_string_issue_143(): + # https://github.com/goodmami/penman/issues/143 + # unmatched quotes result in unexpected tokens + assert [tok.type for tok in lexer.lex('(a :op ")')] == [ + 'LPAREN', 'SYMBOL', 'ROLE', 'UNEXPECTED', 'RPAREN' + ] + assert [tok.type for tok in lexer.lex('(a :op1 " :op2 "foo")')] == [ + 'LPAREN', 'SYMBOL', 'ROLE', 'STRING', 'SYMBOL', 'UNEXPECTED', 'RPAREN' + ] + # also disallow quotes in role names + assert [tok.type for tok in lexer.lex('(a :" b)')] == [ + 'LPAREN', 'SYMBOL', 'ROLE', 'UNEXPECTED', 'SYMBOL', 'RPAREN' + ]