Earley error reporting - initial (Issue #760)

5 years ago · f285cda4f2
--- a/examples/advanced/error_reporting_earley.py
+++ b/examples/advanced/error_reporting_earley.py
@@ -0,0 +1,79 @@
 """
 Example-Driven Error Reporting
 ==============================

 A demonstration of example-driven error reporting with the Earley parser
 (See also: error_reporting_lalr.py)
 """
 from lark import Lark, UnexpectedInput

 from _json_parser import json_grammar   # Using the grammar from the json_parser example

 # Build the parser once at import time; parse() below reuses it both for the
 # real input and for every malformed example it replays.
 # No ``parser=`` option is passed, so Lark's default algorithm is used
 # (NOTE(review): presumably Earley, as the module docstring states -- confirm).
 json_parser = Lark(json_grammar)

 class JsonSyntaxError(SyntaxError):
    """Base class for the labelled JSON syntax errors used in this example.

    Instances are constructed as ``cls(context, line, column)``; the three
    values are read back from ``self.args`` when the error is formatted.
    Subclasses override ``label`` with a short description of the problem.
    """

    # Fallback label so the base class itself can be formatted without an
    # AttributeError; every concrete subclass overrides it.
    label = 'Syntax Error'

    def __str__(self):
        """Return '<label> at line <line>, column <column>.', a blank line, then the context."""
        context, line, column = self.args
        return '%s at line %s, column %s.\n\n%s' % (self.label, line, column, context)

 # Error labelled 'Missing Value'.
 # NOTE(review): parse() does not map any example to this class, so nothing
 # in this module raises it.
 class JsonMissingValue(JsonSyntaxError):
    label = 'Missing Value'

 # Raised by parse() when the failure matches one of its "missing opening"
 # examples, e.g. '{"foo": ]}'.
 class JsonMissingOpening(JsonSyntaxError):
    label = 'Missing Opening'

 # Raised by parse() when the failure matches one of its "missing closing"
 # examples, e.g. '{"a": 1' or '[1'.
 class JsonMissingClosing(JsonSyntaxError):
    label = 'Missing Closing'

 # Raised by parse() when the failure matches one of its "missing comma"
 # examples, e.g. '[1 2]'.
 class JsonMissingComma(JsonSyntaxError):
    label = 'Missing Comma'

 # Raised by parse() when the failure matches one of its "trailing comma"
 # examples, e.g. '[1,]'.
 class JsonTrailingComma(JsonSyntaxError):
    label = 'Trailing Comma'


 def parse(json_text):
    """Parse ``json_text`` and return the result of ``json_parser.parse``.

    If parsing fails with ``UnexpectedInput``, the failure is compared (via
    ``UnexpectedInput.match_examples``) against the failures produced by the
    known-bad example strings below. Each group of examples is keyed by the
    exception class that should be reported for that kind of mistake.

    Raises:
        JsonSyntaxError subclass: when the failure matches an example; it is
            constructed from the error context, line and column.
        UnexpectedInput: re-raised unchanged when no example matches.
    """
    try:
        # Return the result instead of binding it to an unused local, so the
        # helper is usable as a parser and not only as an error reporter.
        return json_parser.parse(json_text)
    except UnexpectedInput as u:
        exc_class = u.match_examples(json_parser.parse, {
            JsonMissingOpening: ['{"foo": ]}',
                                 '{"foor": }}',
                                 '{"foo": }'],
            JsonMissingClosing: ['{"foo": [}',
                                 '{',
                                 '{"a": 1',
                                 '[1'],
            JsonMissingComma: ['[1 2]',
                               '[false 1]',
                               '["b" 1]',
                               '{"a":true 1:4}',
                               '{"a":1 1:4}',
                               '{"a":"b" 1:4}'],
            JsonTrailingComma: ['[,]',
                                '[1,]',
                                '[1,2,]',
                                '{"foo":1,}',
                                '{"foo":false,"bar":true,}']
        }, use_accepts=True)
        if not exc_class:
            # No example matched: surface the original parser error untouched.
            raise
        raise exc_class(u.get_context(json_text), u.line, u.column)


 def test():
    """Feed two malformed documents to parse() and print the labelled errors.

    Each input is paired with the single error class it is expected to
    raise; any other exception propagates to the caller.
    """
    expectations = [
        ('{"example1": "value"', JsonMissingClosing),
        ('{"example2": ] ', JsonMissingOpening),
    ]
    for malformed_text, expected_error in expectations:
        try:
            parse(malformed_text)
        except expected_error as e:
            print(e)


 # Run the demonstration only when executed as a script, not when imported.
 if __name__ == '__main__':
    test()


--- a/examples/advanced/error_reporting_lalr.py
+++ b/examples/advanced/error_reporting_lalr.py
@@ -3,7 +3,7 @@ Example-Driven Error Reporting
 ==============================

 A demonstration of example-driven error reporting with the LALR parser

 (See also: error_reporting_earley.py)
 """
 from lark import Lark, UnexpectedInput

--- a/lark/__init__.py
+++ b/lark/__init__.py
@@ -3,7 +3,7 @@ from .tree import Tree
 from .visitors import Transformer, Visitor, v_args, Discard, Transformer_NonRecursive
 from .visitors import InlineTransformer, inline_args   # XXX Deprecated
 from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken,
                         UnexpectedInput, UnexpectedCharacters, LarkError)
                         UnexpectedInput, UnexpectedCharacters, UnexpectedEOF, LarkError)
 from .lexer import Token
 from .lark import Lark

--- a/lark/exceptions.py
+++ b/lark/exceptions.py
@@ -19,14 +19,6 @@ class LexError(LarkError):
    pass


 class UnexpectedEOF(ParseError):
    """Parse error reporting that the input ended while more was expected.

    ``expected`` is an iterable of objects exposing a ``name`` attribute;
    it is stored on the instance and its names are listed in the message.
    """

    def __init__(self, expected):
        self.expected = expected

        # One bullet per expected name, each on its own tab-indented line.
        bullet_list = '\n\t* '.join([entry.name for entry in self.expected])
        super(UnexpectedEOF, self).__init__(
            "Unexpected end-of-input. Expected one of: \n\t* %s\n" % bullet_list)


 class UnexpectedInput(LarkError):
    """UnexpectedInput Error.

@@ -47,6 +39,7 @@ class UnexpectedInput(LarkError):
            The parser doesn't hold a copy of the text it has to parse,
            so you have to provide it again
        """
        assert self.pos_in_stream is not None, self
        pos = self.pos_in_stream
        start = max(pos - span, 0)
        end = pos + span
@@ -91,7 +84,7 @@ class UnexpectedInput(LarkError):
                    parse_fn(malformed)
                except UnexpectedInput as ut:
                    if ut.state == self.state:
                        if use_accepts and ut.accepts != self.accepts:
                        if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts:
                            logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
                                        (self.state, self.accepts, ut.accepts, i, j))
                            continue
@@ -114,6 +107,19 @@ class UnexpectedInput(LarkError):

        return candidate[0]

 class UnexpectedEOF(ParseError, UnexpectedInput):
    """Error reporting that the input ended while more was expected.

    Also derives from ``UnexpectedInput``, so it carries the ``state``,
    ``token``, ``pos_in_stream``, ``line`` and ``column`` attributes set below.

    ``expected`` is an iterable of objects exposing a ``name`` attribute;
    ``state`` is stored as given (defaults to ``None``).
    """

    def __init__(self, expected, state=None):
        # Imported inside the method, as in the original code
        # (presumably to avoid a circular import -- TODO confirm).
        from .lexer import Token

        self.expected = expected
        self.state = state

        # End-of-input has no real token or location: store a placeholder
        # token and -1 sentinels for the position fields.
        self.token = Token("<EOF>", "")
        self.pos_in_stream = self.line = self.column = -1

        expected_names = [entry.name for entry in self.expected]
        message = "Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(expected_names)
        super(UnexpectedEOF, self).__init__(message)


 class UnexpectedCharacters(LexError, UnexpectedInput):
    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -299,7 +299,7 @@ class Parser:
        solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
        if not solutions:
            expected_terminals = [t.expect for t in to_scan]
            raise UnexpectedEOF(expected_terminals)
            raise UnexpectedEOF(expected_terminals, state={i.s for i in to_scan})

        if self.debug:
            from .earley_forest import ForestToPyDotVisitor
--- a/lark/parsers/xearley.py
+++ b/lark/parsers/xearley.py
@@ -113,7 +113,7 @@ class Parser(BaseParser):
            del delayed_matches[i+1]    # No longer needed, so unburden memory

            if not next_set and not delayed_matches and not next_to_scan:
                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, set(to_scan))
                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, set(to_scan), state={i.s for i in next_to_scan})

            return next_to_scan