diff --git a/examples/advanced/error_reporting_earley.py b/examples/advanced/error_reporting_earley.py new file mode 100644 index 0000000..f0bcc20 --- /dev/null +++ b/examples/advanced/error_reporting_earley.py @@ -0,0 +1,79 @@ +""" +Example-Driven Error Reporting +============================== + +A demonstration of example-driven error reporting with the Earley parser +(See also: error_reporting_lalr.py) +""" +from lark import Lark, UnexpectedInput + +from _json_parser import json_grammar # Using the grammar from the json_parser example + +json_parser = Lark(json_grammar) + +class JsonSyntaxError(SyntaxError): + def __str__(self): + context, line, column = self.args + return '%s at line %s, column %s.\n\n%s' % (self.label, line, column, context) + +class JsonMissingValue(JsonSyntaxError): + label = 'Missing Value' + +class JsonMissingOpening(JsonSyntaxError): + label = 'Missing Opening' + +class JsonMissingClosing(JsonSyntaxError): + label = 'Missing Closing' + +class JsonMissingComma(JsonSyntaxError): + label = 'Missing Comma' + +class JsonTrailingComma(JsonSyntaxError): + label = 'Trailing Comma' + + +def parse(json_text): + try: + j = json_parser.parse(json_text) + except UnexpectedInput as u: + exc_class = u.match_examples(json_parser.parse, { + JsonMissingOpening: ['{"foo": ]}', + '{"foor": }}', + '{"foo": }'], + JsonMissingClosing: ['{"foo": [}', + '{', + '{"a": 1', + '[1'], + JsonMissingComma: ['[1 2]', + '[false 1]', + '["b" 1]', + '{"a":true 1:4}', + '{"a":1 1:4}', + '{"a":"b" 1:4}'], + JsonTrailingComma: ['[,]', + '[1,]', + '[1,2,]', + '{"foo":1,}', + '{"foo":false,"bar":true,}'] + }, use_accepts=True) + if not exc_class: + raise + raise exc_class(u.get_context(json_text), u.line, u.column) + + +def test(): + try: + parse('{"example1": "value"') + except JsonMissingClosing as e: + print(e) + + try: + parse('{"example2": ] ') + except JsonMissingOpening as e: + print(e) + + +if __name__ == '__main__': + test() + + diff --git a/examples/advanced/error_reporting_lalr.py b/examples/advanced/error_reporting_lalr.py index 102f7b1..c2cb239 100644 --- a/examples/advanced/error_reporting_lalr.py +++ b/examples/advanced/error_reporting_lalr.py @@ -3,7 +3,7 @@ Example-Driven Error Reporting ============================== A demonstration of example-driven error reporting with the LALR parser - +(See also: error_reporting_earley.py) """ from lark import Lark, UnexpectedInput diff --git a/lark/__init__.py b/lark/__init__.py index 814fe66..168a969 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -3,7 +3,7 @@ from .tree import Tree from .visitors import Transformer, Visitor, v_args, Discard, Transformer_NonRecursive from .visitors import InlineTransformer, inline_args # XXX Deprecated from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, - UnexpectedInput, UnexpectedCharacters, LarkError) + UnexpectedInput, UnexpectedCharacters, UnexpectedEOF, LarkError) from .lexer import Token from .lark import Lark diff --git a/lark/exceptions.py b/lark/exceptions.py index 8444a65..44f8cbb 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -19,14 +19,6 @@ class LexError(LarkError): pass -class UnexpectedEOF(ParseError): - def __init__(self, expected): - self.expected = expected - - message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected)) - super(UnexpectedEOF, self).__init__(message) - - class UnexpectedInput(LarkError): """UnexpectedInput Error. 
@@ -47,6 +39,7 @@ class UnexpectedInput(LarkError):
         The parser doesn't hold a copy of the text it has to parse,
         so you have to provide it again
         """
+        assert self.pos_in_stream is not None, self
         pos = self.pos_in_stream
         start = max(pos - span, 0)
         end = pos + span
@@ -91,7 +84,7 @@ class UnexpectedInput(LarkError):
                 parse_fn(malformed)
             except UnexpectedInput as ut:
                 if ut.state == self.state:
-                    if use_accepts and ut.accepts != self.accepts:
-                        logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
+                    if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts:
+                        logger.debug("Different accepts with same state[%s]: %s != %s at example [%s][%s]" %
                                      (self.state, self.accepts, ut.accepts, i, j))
                         continue
@@ -108,15 +101,29 @@ class UnexpectedInput(LarkError):
                     except AttributeError:
                         pass
 
-            if not candidate[0]:
+            if candidate[0] is None:
                 logger.debug("Same State match at example [%s][%s]" % (i, j))
                 candidate = label, False
 
         return candidate[0]
 
 
+class UnexpectedEOF(ParseError, UnexpectedInput):
+    def __init__(self, expected, state=None):
+        self.expected = expected
+        self.state = state
+        from .lexer import Token
+        self.token = Token("<EOF>", "") #, line=-1, column=-1, pos_in_stream=-1)
+        self.pos_in_stream = -1
+        self.line = -1
+        self.column = -1
+
+        message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
+        super(UnexpectedEOF, self).__init__(message)
+
 class UnexpectedCharacters(LexError, UnexpectedInput):
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
+        # TODO considered_tokens and allowed can be figured out using state
         self.line = line
         self.column = column
         self.pos_in_stream = lex_pos
@@ -147,7 +154,8 @@ class UnexpectedToken(ParseError, UnexpectedInput):
 
     see: :ref:`ParserPuppet`.
     """
-    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
+    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None):
+        # TODO considered_rules and expected can be figured out using state
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
         self.pos_in_stream = getattr(token, 'pos_in_stream', None)
@@ -157,6 +165,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):
         self.expected = expected # XXX deprecate? `accepts` is better
         self.considered_rules = considered_rules
         self.puppet = puppet
+        self.token_history = token_history
 
         # TODO Only calculate `accepts()` when we need to display it to the user
         # This will improve performance when doing automatic error handling
@@ -166,6 +175,9 @@ class UnexpectedToken(ParseError, UnexpectedInput):
                    "Expected one of: \n\t* %s\n"
                    % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))
 
+        if self.token_history:
+            message += "Previous tokens: %r\n" % token_history
+
         super(UnexpectedToken, self).__init__(message)
diff --git a/lark/lexer.py b/lark/lexer.py
index 6d69ec9..bda8497 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -353,7 +353,7 @@ class TraditionalLexer(Lexer):
                 allowed = {"<END-OF-FILE>"}
             raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                        allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
-                                       state=(parser_state and parser_state.position))
+                                       state=parser_state)
 
             value, type_ = res
 
@@ -436,7 +436,7 @@ class ContextualLexer(Lexer):
             # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
# This tests the input against the global context, to provide a nicer error. token = self.root_lexer.next_token(lexer_state, parser_state) - raise UnexpectedToken(token, e.allowed, state=parser_state.position) + raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token]) class LexerThread: diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 337ddeb..abc0fba 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -173,9 +173,6 @@ class Earley(WithLexer): tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class) - def make_lexer(self, text): - return WithLexer.make_lexer(self, text).lex(None) - def match(self, term, token): return term.name == token.type diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index f0bb7f5..e4a220a 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -146,7 +146,7 @@ class Parser: column.add(new_item) items.append(new_item) - def _parse(self, stream, columns, to_scan, start_symbol=None): + def _parse(self, lexer, columns, to_scan, start_symbol=None): def is_quasi_complete(item): if item.is_complete: return True @@ -245,7 +245,7 @@ class Parser: if not next_set and not next_to_scan: expect = {i.expect.name for i in to_scan} - raise UnexpectedToken(token, expect, considered_rules = set(to_scan)) + raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.expect for i in to_scan)) return next_to_scan @@ -261,20 +261,24 @@ class Parser: # Completions will be added to the SPPF tree, and predictions will be recursively # processed down to terminals/empty nodes to be added to the scanner for the next # step. + expects = {i.expect for i in to_scan} i = 0 - for token in stream: + for token in lexer.lex(expects): self.predict_and_complete(i, to_scan, columns, transitives) to_scan = scan(i, token, to_scan) i += 1 + expects.clear() + expects |= {i.expect for i in to_scan} + self.predict_and_complete(i, to_scan, columns, transitives) ## Column is now the final column in the parse. 
assert i == len(columns)-1 return to_scan - def parse(self, stream, start): + def parse(self, lexer, start): assert start, start start_symbol = NonTerminal(start) @@ -291,7 +295,7 @@ class Parser: else: columns[0].add(item) - to_scan = self._parse(stream, columns, to_scan, start_symbol) + to_scan = self._parse(lexer, columns, to_scan, start_symbol) # If the parse was successful, the start # symbol should have been completed in the last step of the Earley cycle, and will be in @@ -299,7 +303,7 @@ class Parser: solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] if not solutions: expected_terminals = [t.expect for t in to_scan] - raise UnexpectedEOF(expected_terminals) + raise UnexpectedEOF(expected_terminals, state=frozenset(i.expect for i in to_scan)) if self.debug: from .earley_forest import ForestToPyDotVisitor diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index e8c4432..3d006e7 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -3,7 +3,7 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com from copy import deepcopy, copy -from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken +from ..exceptions import UnexpectedInput, UnexpectedToken from ..lexer import Token from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable @@ -62,6 +62,12 @@ class ParserState: def position(self): return self.state_stack[-1] + # Necessary for match_examples() to work + def __eq__(self, other): + if not isinstance(other, ParserState): + return False + return self.position == other.position + def __copy__(self): return type(self)( self.parse_conf, @@ -86,7 +92,7 @@ class ParserState: action, arg = states[state][token.type] except KeyError: expected = {s for s in states[state].keys() if s.isupper()} - raise UnexpectedToken(token, expected, state=state, puppet=None) + raise UnexpectedToken(token, expected, state=self, puppet=None) assert arg != end_state diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 256fc2c..cf9b6ec 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -113,7 +113,8 @@ class Parser(BaseParser): del delayed_matches[i+1] # No longer needed, so unburden memory if not next_set and not delayed_matches and not next_to_scan: - raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, set(to_scan)) + raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, + set(to_scan), state=frozenset(i.expect for i in to_scan)) return next_to_scan diff --git a/lark/tree_matcher.py b/lark/tree_matcher.py index 8c1f17a..c9d9fde 100644 --- a/lark/tree_matcher.py +++ b/lark/tree_matcher.py @@ -69,6 +69,14 @@ def parse_rulename(s): return name, args + +class ChildrenLexer: + def __init__(self, children): + self.children = children + + def lex(self, parser_state): + return self.children + class TreeMatcher: """Match the elements of a tree node, based on an ontology provided by a Lark grammar. 
@@ -173,6 +181,6 @@ class TreeMatcher:
         self._parser_cache[rulename] = parser
 
         # find a full derivation
-        unreduced_tree = parser.parse(tree.children, rulename)
+        unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename)
         assert unreduced_tree.data == rulename
         return unreduced_tree
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 39bd00c..863bf5d 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -323,7 +323,7 @@ class TestParsers(unittest.TestCase):
     def test_alias(self):
         Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)
-        
+
     def test_backwards_custom_lexer(self):
         class OldCustomLexer(Lexer):
             def __init__(self, lexer_conf):
                 pass
 
             def lex(self, text):
                 yield Token('A', 'A')
-        
+
         p = Lark("""
 start: A
 %declare A
             """, parser='lalr', lexer=OldCustomLexer)
-        
+
         r = p.parse('')
         self.assertEqual(r, Tree('start', [Token('A', 'A')]))
 
@@ -866,7 +866,7 @@ class CustomLexer(Lexer):
         self.lexer = TraditionalLexer(copy(lexer_conf))
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)
-    
+
     __future_interface__ = True
 
 def _tree_structure_check(a, b):
@@ -2342,6 +2342,30 @@ def _make_parser_test(LEXER, PARSER):
         self.assertEqual(a.line, 1)
         self.assertEqual(b.line, 2)
 
+        @unittest.skipIf(PARSER=='cyk', "match_examples() not supported for CYK")
+        def test_match_examples(self):
+            p = _Lark(r"""
+                start: "a" "b" "c"
+                """)
+
+            def match_error(s):
+                try:
+                    _ = p.parse(s)
+                except UnexpectedInput as u:
+                    return u.match_examples(p.parse, {
+                        0: ['abe'],
+                        1: ['ab'],
+                        2: ['cbc'],
+                    })
+                assert False
+
+            assert match_error("abe") == 0
+            assert match_error("ab") == 1
+            assert match_error("bbc") == 2
+            assert match_error("cbc") == 2
+            self.assertEqual(match_error("dbc"), 2)
+
+
         @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
         def test_unicode_class(self):
             "Tests that character classes from the `regex` module work correctly."
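
Usage sketch: with UnexpectedEOF now exported from `lark` and inheriting from
UnexpectedInput, end-of-input errors can be caught and inspected like any other
parse error. A minimal example against the patched API, using a made-up toy
grammar; the Earley parser raises UnexpectedEOF when the input ends while the
start symbol is still incomplete:

    from lark import Lark, UnexpectedEOF

    parser = Lark('start: "a" "b"', parser='earley')
    try:
        parser.parse("a")  # input ends before the required "b"
    except UnexpectedEOF as e:
        # e.expected holds the terminals that could have continued the input
        print([t.name for t in e.expected])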
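
The same changes make match_examples() usable beyond LALR: Earley errors now
carry a comparable `state` (a frozenset of expected terminals), which is what
match_examples() checks against each example's error. A sketch of the pattern,
with made-up labels and grammar:

    from lark import Lark, UnexpectedInput

    p = Lark('start: "a" "b" "c"')  # Earley is the default parser

    try:
        p.parse("ab")
    except UnexpectedInput as u:
        label = u.match_examples(p.parse, {
            'input ended too early': ['ab', 'a'],
            'wrong opening token': ['cbc'],
        })
        print(label)  # expected: 'input ended too early'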
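
Relatedly, the internal earley.Parser.parse() now receives a lexer-like object
instead of a raw token iterable, so the live expected-terminal set can be fed
back to the lexer at every step (the `lexer.lex(expects)` call above).
ChildrenLexer in tree_matcher.py is the in-tree adapter; a hypothetical
equivalent for a plain list of tokens would look like:

    class TokenListLexer:
        "Minimal token source matching the interface Earley now expects."
        def __init__(self, tokens):
            self.tokens = tokens

        def lex(self, expects):
            # `expects` is the set of terminals the parser can scan next;
            # a trivial adapter ignores it, exactly as ChildrenLexer does.
            return iter(self.tokens)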