From a4ddb1e84f58e892282788a94197cd76bf7d99f9 Mon Sep 17 00:00:00 2001
From: MegaIng
Date: Wed, 18 Nov 2020 12:03:13 +0100
Subject: [PATCH 01/13] Add missing test case to `__main__`

---
 tests/__main__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/__main__.py b/tests/__main__.py
index 5ec89e3..1814564 100644
--- a/tests/__main__.py
+++ b/tests/__main__.py
@@ -9,6 +9,7 @@ from .test_tools import TestStandalone
 from .test_cache import TestCache
 from .test_grammar import TestGrammar
 from .test_reconstructor import TestReconstructor
+from .test_tree_forest_transformer import TestTreeForestTransformer
 
 try:
     from .test_nearley.test_nearley import TestNearley

From 11a0052eb5b0fef5fcba836f55394c668a90ac3a Mon Sep 17 00:00:00 2001
From: Greg Ward
Date: Wed, 18 Nov 2020 14:15:37 -0500
Subject: [PATCH 02/13] Fix incorrect type hint for 'lexer' argument to Lark
 constructor

The code is crystal clear:

    assert lexer in ('standard', ...) or issubclass(lexer, Lexer)

But the type hint said that lexer must be an _instance_ of Lexer, not a
subclass. This change fixes it to require a subclass of Lexer.
---
 lark-stubs/lark.pyi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi
index 8363a5d..7dc8626 100644
--- a/lark-stubs/lark.pyi
+++ b/lark-stubs/lark.pyi
@@ -63,7 +63,7 @@ class Lark:
         *,
         start: Union[None, str, List[str]] = "start",
         parser: Literal["earley", "lalr", "cyk"] = "auto",
-        lexer: Union[Literal["auto", "standard", "contextual", "dynamic", "dynamic_complete"], Lexer] = "auto",
+        lexer: Union[Literal["auto", "standard", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]] = "auto",
         transformer: Optional[Transformer] = None,
         postlex: Optional[PostLex] = None,
         ambiguity: Literal["explicit", "resolve"] = "resolve",

From 68e5e86b5ba84edd3e92f9ba16aca1bb8174e72c Mon Sep 17 00:00:00 2001
From: Greg Ward
Date: Wed, 18 Nov 2020 14:34:01 -0500
Subject: [PATCH 03/13] Add missing type hint for Token constructor

mypy thinks that Token's __init__ is inherited from str. That's not
wrong -- it's just irrelevant, because Token also implements __new__().
Token's _effective_ constructor signature is determined by its
__new__() method, so that's what I have used in the type hint.

Not clear if 'value' is supposed to be Any, but that's what I need in
my application. And it works just fine!
---
 lark-stubs/lexer.pyi | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lark-stubs/lexer.pyi b/lark-stubs/lexer.pyi
index 12d3dfe..a654b0f 100644
--- a/lark-stubs/lexer.pyi
+++ b/lark-stubs/lexer.pyi
@@ -85,6 +85,9 @@ class Token(str):
     end_column: int
     end_pos: int
 
+    def __init__(self, type_: str, value: Any, pos_in_stream: int = None, line: int = None, column: int = None, end_line: int = None, end_column: int = None, end_pos: int = None):
+        ...
+
     def update(self, type_: Optional[str] = None, value: Optional[str] = None) -> Token:
         ...
 

From 1fc08100860de65709c9e8f533f8a81087206e65 Mon Sep 17 00:00:00 2001
From: ThatXliner
Date: Fri, 20 Nov 2020 13:24:55 -0800
Subject: [PATCH 04/13] Fix broken link in README

See https://gitter.im/lark-parser/Lobby?at=5fb83369771c185e0eb8c0e2
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d9afc7f..425d4bb 100644
--- a/README.md
+++ b/README.md
@@ -106,7 +106,7 @@ Lark is great at handling ambiguity. Here is the result of parsing the phrase "f
 - MyPy support using type stubs
 - And much more!
 
-See the full list of [features here](https://lark-parser.readthedocs.io/en/latest/features/) +See the full list of [features here](https://lark-parser.readthedocs.io/en/latest/features.html) ### Comparison to other libraries From 2e06d4c000d108505944b3863fe4581b40f1e066 Mon Sep 17 00:00:00 2001 From: ThatXliner Date: Fri, 20 Nov 2020 13:28:47 -0800 Subject: [PATCH 05/13] Update README.md Fixed pyparsing link to point to https://github.com/pyparsing/pyparsing --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 425d4bb..2d69420 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail |:--------|:----------|:----|:--------|:------------|:------------|:----------|:---------- | **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | Yes! (LALR only) | | [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | No | -| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | No | +| [PyParsing](https://github.com/pyparsing/pyparsing) | PEG | Combinators | No | No | No\* | No | No | | [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | No | | [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | No | | [ANTLR](https://github.com/antlr/antlr4) | LL(*) | EBNF | Yes | No | Yes? | Yes | No | From 4e442bc0b8379fb764b26b46f99d2bf32eb580c4 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Sat, 21 Nov 2020 20:56:14 +0100 Subject: [PATCH 06/13] regression-fix for #760 --- lark-stubs/lexer.pyi | 2 +- lark/lexer.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lark-stubs/lexer.pyi b/lark-stubs/lexer.pyi index a654b0f..3f246fb 100644 --- a/lark-stubs/lexer.pyi +++ b/lark-stubs/lexer.pyi @@ -139,7 +139,7 @@ class TraditionalLexer(Lexer): def lex(self, stream: str) -> Iterator[Token]: ... - def next_token(self, lex_state: Any) -> Token: + def next_token(self, lex_state: Any, parser_state: Any = None) -> Token: ... 
class ContextualLexer(Lexer): diff --git a/lark/lexer.py b/lark/lexer.py index 4c420e7..6d69ec9 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -338,12 +338,12 @@ class TraditionalLexer(Lexer): if m: return m.group(0), type_from_index[m.lastindex] - def lex(self, state, _parser_state): + def lex(self, state, parser_state): with suppress(EOFError): while True: - yield self.next_token(state) + yield self.next_token(state, parser_state) - def next_token(self, lex_state): + def next_token(self, lex_state, parser_state=None): line_ctr = lex_state.line_ctr while line_ctr.char_pos < len(lex_state.text): res = self.match(lex_state.text, line_ctr.char_pos) @@ -352,7 +352,8 @@ class TraditionalLexer(Lexer): if not allowed: allowed = {""} raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, - allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token]) + allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], + state=(parser_state and parser_state.position)) value, type_ = res @@ -428,13 +429,13 @@ class ContextualLexer(Lexer): try: while True: lexer = self.lexers[parser_state.position] - yield lexer.next_token(lexer_state) + yield lexer.next_token(lexer_state, parser_state) except EOFError: pass except UnexpectedCharacters as e: # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. # This tests the input against the global context, to provide a nicer error. - token = self.root_lexer.next_token(lexer_state) + token = self.root_lexer.next_token(lexer_state, parser_state) raise UnexpectedToken(token, e.allowed, state=parser_state.position) From b3eb2a31201b3aba6626502ee771885a556438a6 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 21 Nov 2020 22:51:22 +0200 Subject: [PATCH 07/13] Add token_history to UnexpectedToken --- lark/exceptions.py | 6 +++++- lark/lexer.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 8444a65..ed7b9c7 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -147,7 +147,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): see: :ref:`ParserPuppet`. """ - def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None): self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') self.pos_in_stream = getattr(token, 'pos_in_stream', None) @@ -157,6 +157,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.expected = expected # XXX deprecate? `accepts` is better self.considered_rules = considered_rules self.puppet = puppet + self.token_history = token_history # TODO Only calculate `accepts()` when we need to display it to the user # This will improve performance when doing automatic error handling @@ -166,6 +167,9 @@ class UnexpectedToken(ParseError, UnexpectedInput): "Expected one of: \n\t* %s\n" % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) + if self.token_history: + message += "Previous tokens: %r\n" % token_history + super(UnexpectedToken, self).__init__(message) diff --git a/lark/lexer.py b/lark/lexer.py index 6d69ec9..8be8acd 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -436,7 +436,7 @@ class ContextualLexer(Lexer): # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. 
# This tests the input against the global context, to provide a nicer error. token = self.root_lexer.next_token(lexer_state, parser_state) - raise UnexpectedToken(token, e.allowed, state=parser_state.position) + raise UnexpectedToken(token, e.allowed, state=parser_state.position, token_history=[lexer_state.last_token]) class LexerThread: From f285cda4f25ae9f459bc772a682f2f384bacddd2 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 16 Nov 2020 17:19:26 +0200 Subject: [PATCH 08/13] Earley error reporting - initial (Issue #760) --- examples/advanced/error_reporting_earley.py | 79 +++++++++++++++++++++ examples/advanced/error_reporting_lalr.py | 2 +- lark/__init__.py | 2 +- lark/exceptions.py | 24 ++++--- lark/parsers/earley.py | 2 +- lark/parsers/xearley.py | 2 +- 6 files changed, 98 insertions(+), 13 deletions(-) create mode 100644 examples/advanced/error_reporting_earley.py diff --git a/examples/advanced/error_reporting_earley.py b/examples/advanced/error_reporting_earley.py new file mode 100644 index 0000000..f0bcc20 --- /dev/null +++ b/examples/advanced/error_reporting_earley.py @@ -0,0 +1,79 @@ +""" +Example-Driven Error Reporting +============================== + +A demonstration of example-driven error reporting with the Earley parser +(See also: error_reporting_lalr.py) +""" +from lark import Lark, UnexpectedInput + +from _json_parser import json_grammar # Using the grammar from the json_parser example + +json_parser = Lark(json_grammar) + +class JsonSyntaxError(SyntaxError): + def __str__(self): + context, line, column = self.args + return '%s at line %s, column %s.\n\n%s' % (self.label, line, column, context) + +class JsonMissingValue(JsonSyntaxError): + label = 'Missing Value' + +class JsonMissingOpening(JsonSyntaxError): + label = 'Missing Opening' + +class JsonMissingClosing(JsonSyntaxError): + label = 'Missing Closing' + +class JsonMissingComma(JsonSyntaxError): + label = 'Missing Comma' + +class JsonTrailingComma(JsonSyntaxError): + label = 'Trailing Comma' + + +def parse(json_text): + try: + j = json_parser.parse(json_text) + except UnexpectedInput as u: + exc_class = u.match_examples(json_parser.parse, { + JsonMissingOpening: ['{"foo": ]}', + '{"foor": }}', + '{"foo": }'], + JsonMissingClosing: ['{"foo": [}', + '{', + '{"a": 1', + '[1'], + JsonMissingComma: ['[1 2]', + '[false 1]', + '["b" 1]', + '{"a":true 1:4}', + '{"a":1 1:4}', + '{"a":"b" 1:4}'], + JsonTrailingComma: ['[,]', + '[1,]', + '[1,2,]', + '{"foo":1,}', + '{"foo":false,"bar":true,}'] + }, use_accepts=True) + if not exc_class: + raise + raise exc_class(u.get_context(json_text), u.line, u.column) + + +def test(): + try: + parse('{"example1": "value"') + except JsonMissingClosing as e: + print(e) + + try: + parse('{"example2": ] ') + except JsonMissingOpening as e: + print(e) + + +if __name__ == '__main__': + test() + + diff --git a/examples/advanced/error_reporting_lalr.py b/examples/advanced/error_reporting_lalr.py index 102f7b1..c2cb239 100644 --- a/examples/advanced/error_reporting_lalr.py +++ b/examples/advanced/error_reporting_lalr.py @@ -3,7 +3,7 @@ Example-Driven Error Reporting ============================== A demonstration of example-driven error reporting with the LALR parser - +(See also: error_reporting_earley.py) """ from lark import Lark, UnexpectedInput diff --git a/lark/__init__.py b/lark/__init__.py index 814fe66..168a969 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -3,7 +3,7 @@ from .tree import Tree from .visitors import Transformer, Visitor, v_args, Discard, Transformer_NonRecursive 
from .visitors import InlineTransformer, inline_args # XXX Deprecated from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, - UnexpectedInput, UnexpectedCharacters, LarkError) + UnexpectedInput, UnexpectedCharacters, UnexpectedEOF, LarkError) from .lexer import Token from .lark import Lark diff --git a/lark/exceptions.py b/lark/exceptions.py index ed7b9c7..ab4b139 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -19,14 +19,6 @@ class LexError(LarkError): pass -class UnexpectedEOF(ParseError): - def __init__(self, expected): - self.expected = expected - - message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected)) - super(UnexpectedEOF, self).__init__(message) - - class UnexpectedInput(LarkError): """UnexpectedInput Error. @@ -47,6 +39,7 @@ class UnexpectedInput(LarkError): The parser doesn't hold a copy of the text it has to parse, so you have to provide it again """ + assert self.pos_in_stream is not None, self pos = self.pos_in_stream start = max(pos - span, 0) end = pos + span @@ -91,7 +84,7 @@ class UnexpectedInput(LarkError): parse_fn(malformed) except UnexpectedInput as ut: if ut.state == self.state: - if use_accepts and ut.accepts != self.accepts: + if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts: logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % (self.state, self.accepts, ut.accepts, i, j)) continue @@ -114,6 +107,19 @@ class UnexpectedInput(LarkError): return candidate[0] +class UnexpectedEOF(ParseError, UnexpectedInput): + def __init__(self, expected, state=None): + self.expected = expected + self.state = state + from .lexer import Token + self.token = Token("", "") #, line=-1, column=-1, pos_in_stream=-1) + self.pos_in_stream = -1 + self.line = -1 + self.column = -1 + + message = ("Unexpected end-of-input. 
Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected)) + super(UnexpectedEOF, self).__init__(message) + class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index f0bb7f5..aa18371 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -299,7 +299,7 @@ class Parser: solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] if not solutions: expected_terminals = [t.expect for t in to_scan] - raise UnexpectedEOF(expected_terminals) + raise UnexpectedEOF(expected_terminals, state={i.s for i in to_scan}) if self.debug: from .earley_forest import ForestToPyDotVisitor diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 256fc2c..ae98f0f 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -113,7 +113,7 @@ class Parser(BaseParser): del delayed_matches[i+1] # No longer needed, so unburden memory if not next_set and not delayed_matches and not next_to_scan: - raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, set(to_scan)) + raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, set(to_scan), state={i.s for i in next_to_scan}) return next_to_scan From 1aff84391a416cd28cd086dd78ff4b08058b9884 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 21 Nov 2020 23:08:58 +0200 Subject: [PATCH 09/13] Added test for match_examples --- lark/exceptions.py | 2 +- tests/test_parser.py | 28 ++++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index ab4b139..92ac019 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -101,7 +101,7 @@ class UnexpectedInput(LarkError): except AttributeError: pass - if not candidate[0]: + if candidate[0] is None: logger.debug("Same State match at example [%s][%s]" % (i, j)) candidate = label, False diff --git a/tests/test_parser.py b/tests/test_parser.py index 39bd00c..edb4b26 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -323,7 +323,7 @@ class TestParsers(unittest.TestCase): def test_alias(self): Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """) - + def test_backwards_custom_lexer(self): class OldCustomLexer(Lexer): def __init__(self, lexer_conf): @@ -331,12 +331,12 @@ class TestParsers(unittest.TestCase): def lex(self, text): yield Token('A', 'A') - + p = Lark(""" start: A %declare A """, parser='lalr', lexer=OldCustomLexer) - + r = p.parse('') self.assertEqual(r, Tree('start', [Token('A', 'A')])) @@ -866,7 +866,7 @@ class CustomLexer(Lexer): self.lexer = TraditionalLexer(copy(lexer_conf)) def lex(self, *args, **kwargs): return self.lexer.lex(*args, **kwargs) - + __future_interface__ = True def _tree_structure_check(a, b): @@ -2342,6 +2342,26 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(a.line, 1) self.assertEqual(b.line, 2) + @unittest.skipIf(LEXER=='standard' and PARSER!='lalr', "Puppet error handling only works with LALR for now") + def test_match_examples(self): + p = _Lark(r""" + start: "a" "b" "c" + """) + + def match_error(s): + try: + _ = p.parse(s) + except UnexpectedInput as u: + return u.match_examples(p.parse, { + 0: ['abe'], + 1: ['ab'], + }) + assert False + + assert match_error("abe") == 0 + assert match_error("ab") == 1 + + 
@unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_class(self): "Tests that character classes from the `regex` module work correctly." From 7fa993320eebf8a27c28657f4c6d15b1bb210cd6 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 22 Nov 2020 10:04:48 +0200 Subject: [PATCH 10/13] match_examples() now works for Earley+Standard Note: This refactor opens the door for implementing a ContextualLexer for Earley. But unlike the existing one for LALR, it will have to be computed at runtime, rather than ahead of time. --- lark/exceptions.py | 2 ++ lark/lexer.py | 4 ++-- lark/parser_frontends.py | 3 --- lark/parsers/earley.py | 16 ++++++++++------ lark/parsers/lalr_parser.py | 10 ++++++++-- lark/parsers/xearley.py | 3 ++- lark/tree_matcher.py | 10 +++++++++- tests/test_parser.py | 6 +++++- 8 files changed, 38 insertions(+), 16 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 92ac019..1d63561 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -123,6 +123,7 @@ class UnexpectedEOF(ParseError, UnexpectedInput): class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): + # TODO considered_tokens and allowed can be figured out using state self.line = line self.column = column self.pos_in_stream = lex_pos @@ -154,6 +155,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): see: :ref:`ParserPuppet`. """ def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None): + # TODO considered_tokens and allowed can be figured out using state self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') self.pos_in_stream = getattr(token, 'pos_in_stream', None) diff --git a/lark/lexer.py b/lark/lexer.py index 8be8acd..bda8497 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -353,7 +353,7 @@ class TraditionalLexer(Lexer): allowed = {""} raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], - state=(parser_state and parser_state.position)) + state=parser_state) value, type_ = res @@ -436,7 +436,7 @@ class ContextualLexer(Lexer): # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. # This tests the input against the global context, to provide a nicer error. 
token = self.root_lexer.next_token(lexer_state, parser_state) - raise UnexpectedToken(token, e.allowed, state=parser_state.position, token_history=[lexer_state.last_token]) + raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token]) class LexerThread: diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 337ddeb..abc0fba 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -173,9 +173,6 @@ class Earley(WithLexer): tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class) - def make_lexer(self, text): - return WithLexer.make_lexer(self, text).lex(None) - def match(self, term, token): return term.name == token.type diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index aa18371..e4a220a 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -146,7 +146,7 @@ class Parser: column.add(new_item) items.append(new_item) - def _parse(self, stream, columns, to_scan, start_symbol=None): + def _parse(self, lexer, columns, to_scan, start_symbol=None): def is_quasi_complete(item): if item.is_complete: return True @@ -245,7 +245,7 @@ class Parser: if not next_set and not next_to_scan: expect = {i.expect.name for i in to_scan} - raise UnexpectedToken(token, expect, considered_rules = set(to_scan)) + raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.expect for i in to_scan)) return next_to_scan @@ -261,20 +261,24 @@ class Parser: # Completions will be added to the SPPF tree, and predictions will be recursively # processed down to terminals/empty nodes to be added to the scanner for the next # step. + expects = {i.expect for i in to_scan} i = 0 - for token in stream: + for token in lexer.lex(expects): self.predict_and_complete(i, to_scan, columns, transitives) to_scan = scan(i, token, to_scan) i += 1 + expects.clear() + expects |= {i.expect for i in to_scan} + self.predict_and_complete(i, to_scan, columns, transitives) ## Column is now the final column in the parse. 
assert i == len(columns)-1 return to_scan - def parse(self, stream, start): + def parse(self, lexer, start): assert start, start start_symbol = NonTerminal(start) @@ -291,7 +295,7 @@ class Parser: else: columns[0].add(item) - to_scan = self._parse(stream, columns, to_scan, start_symbol) + to_scan = self._parse(lexer, columns, to_scan, start_symbol) # If the parse was successful, the start # symbol should have been completed in the last step of the Earley cycle, and will be in @@ -299,7 +303,7 @@ class Parser: solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] if not solutions: expected_terminals = [t.expect for t in to_scan] - raise UnexpectedEOF(expected_terminals, state={i.s for i in to_scan}) + raise UnexpectedEOF(expected_terminals, state=frozenset(i.expect for i in to_scan)) if self.debug: from .earley_forest import ForestToPyDotVisitor diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index e8c4432..3d006e7 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -3,7 +3,7 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com from copy import deepcopy, copy -from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken +from ..exceptions import UnexpectedInput, UnexpectedToken from ..lexer import Token from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable @@ -62,6 +62,12 @@ class ParserState: def position(self): return self.state_stack[-1] + # Necessary for match_examples() to work + def __eq__(self, other): + if not isinstance(other, ParserState): + return False + return self.position == other.position + def __copy__(self): return type(self)( self.parse_conf, @@ -86,7 +92,7 @@ class ParserState: action, arg = states[state][token.type] except KeyError: expected = {s for s in states[state].keys() if s.isupper()} - raise UnexpectedToken(token, expected, state=state, puppet=None) + raise UnexpectedToken(token, expected, state=self, puppet=None) assert arg != end_state diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index ae98f0f..cf9b6ec 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -113,7 +113,8 @@ class Parser(BaseParser): del delayed_matches[i+1] # No longer needed, so unburden memory if not next_set and not delayed_matches and not next_to_scan: - raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, set(to_scan), state={i.s for i in next_to_scan}) + raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, + set(to_scan), state=frozenset(i.expect for i in to_scan)) return next_to_scan diff --git a/lark/tree_matcher.py b/lark/tree_matcher.py index 8c1f17a..c9d9fde 100644 --- a/lark/tree_matcher.py +++ b/lark/tree_matcher.py @@ -69,6 +69,14 @@ def parse_rulename(s): return name, args + +class ChildrenLexer: + def __init__(self, children): + self.children = children + + def lex(self, parser_state): + return self.children + class TreeMatcher: """Match the elements of a tree node, based on an ontology provided by a Lark grammar. 
@@ -173,6 +181,6 @@ class TreeMatcher: self._parser_cache[rulename] = parser # find a full derivation - unreduced_tree = parser.parse(tree.children, rulename) + unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename) assert unreduced_tree.data == rulename return unreduced_tree diff --git a/tests/test_parser.py b/tests/test_parser.py index edb4b26..863bf5d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2342,7 +2342,7 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(a.line, 1) self.assertEqual(b.line, 2) - @unittest.skipIf(LEXER=='standard' and PARSER!='lalr', "Puppet error handling only works with LALR for now") + @unittest.skipIf(PARSER=='cyk', "match_examples() not supported for CYK") def test_match_examples(self): p = _Lark(r""" start: "a" "b" "c" @@ -2355,11 +2355,15 @@ def _make_parser_test(LEXER, PARSER): return u.match_examples(p.parse, { 0: ['abe'], 1: ['ab'], + 2: ['cbc'], }) assert False assert match_error("abe") == 0 assert match_error("ab") == 1 + assert match_error("bbc") == 2 + assert match_error("cbc") == 2 + self.assertEqual( match_error("dbc"), 2 ) @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') From e6bbfd16c0e50a7f20a51ff8edb3bf0797a68594 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 23 Nov 2020 10:24:44 +0200 Subject: [PATCH 11/13] Fixed comment --- lark/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 1d63561..44f8cbb 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -155,7 +155,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): see: :ref:`ParserPuppet`. """ def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None): - # TODO considered_tokens and allowed can be figured out using state + # TODO considered_rules and expected can be figured out using state self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') self.pos_in_stream = getattr(token, 'pos_in_stream', None) From 70c233e3010ffbcb9eaeeec65944006558afcd43 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 23 Nov 2020 10:48:18 +0200 Subject: [PATCH 12/13] Update links in pypi (Issue #714) --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 382943e..b3897c5 100644 --- a/setup.py +++ b/setup.py @@ -29,8 +29,8 @@ setup( description = "a modern parsing library", license = "MIT", keywords = "Earley LALR parser parsing ast", - url = "https://github.com/erezsh/lark", - download_url = "https://github.com/erezsh/lark/tarball/master", + url = "https://github.com/lark-parser/lark", + download_url = "https://github.com/lark-parser/lark/tarball/master", long_description=''' Lark is a modern general-purpose parsing library for Python. 
From e6dcc434786a1f3a4e53581673e05c6663fcef16 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 23 Nov 2020 22:44:49 +0200 Subject: [PATCH 13/13] Improve match_examples() for Earley (Issue #760) --- lark/parsers/earley.py | 4 ++-- lark/parsers/xearley.py | 2 +- tests/test_parser.py | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index e4a220a..320b59a 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -245,7 +245,7 @@ class Parser: if not next_set and not next_to_scan: expect = {i.expect.name for i in to_scan} - raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.expect for i in to_scan)) + raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan)) return next_to_scan @@ -303,7 +303,7 @@ class Parser: solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] if not solutions: expected_terminals = [t.expect for t in to_scan] - raise UnexpectedEOF(expected_terminals, state=frozenset(i.expect for i in to_scan)) + raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan)) if self.debug: from .earley_forest import ForestToPyDotVisitor diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index cf9b6ec..d965421 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -114,7 +114,7 @@ class Parser(BaseParser): if not next_set and not delayed_matches and not next_to_scan: raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, - set(to_scan), state=frozenset(i.expect for i in to_scan)) + set(to_scan), state=frozenset(i.s for i in to_scan)) return next_to_scan diff --git a/tests/test_parser.py b/tests/test_parser.py index 863bf5d..bb807d4 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2355,7 +2355,7 @@ def _make_parser_test(LEXER, PARSER): return u.match_examples(p.parse, { 0: ['abe'], 1: ['ab'], - 2: ['cbc'], + 2: ['cbc', 'dbc'], }) assert False @@ -2364,6 +2364,7 @@ def _make_parser_test(LEXER, PARSER): assert match_error("bbc") == 2 assert match_error("cbc") == 2 self.assertEqual( match_error("dbc"), 2 ) + self.assertEqual( match_error("ebc"), 2 ) @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
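A minimal sketch of what the corrected `Type[Lexer]` hint in PATCH 02 permits, modeled on the `test_backwards_custom_lexer` case above (the grammar, class name, and token type are illustrative, not part of the patches):

    from lark import Lark
    from lark.lexer import Lexer, Token

    class MyLexer(Lexer):
        # Toy lexer: turns every input character into an 'A' token.
        def __init__(self, lexer_conf):
            pass

        def lex(self, text):
            for ch in text:
                yield Token('A', ch)

    # Per the fixed hint, 'lexer' takes the Lexer *subclass* itself, not an instance.
    parser = Lark("""
    start: A+
    %declare A
    """, parser='lalr', lexer=MyLexer)

    print(parser.parse('ab'))   # -> Tree('start', [Token('A', 'a'), Token('A', 'b')])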
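The `token_history` attached in PATCH 07 is populated by the contextual lexer's error path (the `raise UnexpectedToken(..., token_history=[lexer_state.last_token])` above). A rough sketch of observing it; the anonymous terminal names shown in the comments are auto-generated by lark and approximate:

    from lark import Lark
    from lark.exceptions import UnexpectedToken

    parser = Lark('start: "a" "b" "c"', parser='lalr')  # contextual lexer by default

    try:
        # 'c' is a valid terminal globally, but not expected right after 'a',
        # so the contextual lexer re-lexes it and raises UnexpectedToken.
        parser.parse('acb')
    except UnexpectedToken as e:
        print(e.token)           # the offending token, e.g. Token('C', 'c')
        print(e.token_history)   # the preceding token, e.g. [Token('A', 'a')]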
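PATCH 08 re-parents `UnexpectedEOF` under `UnexpectedInput` and exports it from the `lark` package. A quick sketch of catching it with the default Earley parser; the exact contents of `expected` depend on the grammar:

    from lark import Lark, UnexpectedEOF

    parser = Lark('start: "a" "b"')   # Earley is the default parser

    try:
        parser.parse('a')             # input ends before "b" is seen
    except UnexpectedEOF as e:
        # Terminals that could have continued the parse, e.g. ['B']
        print([t.name for t in e.expected])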