@@ -52,7 +52,7 @@ def parse(json_text): | |||
'[1,2,]', | |||
'{"foo":1,}', | |||
'{"foo":false,"bar":true,}'] | |||
}) | |||
}, use_accepts=True) | |||
if not exc_class: | |||
raise | |||
raise exc_class(u.get_context(json_text), u.line, u.column) | |||
@@ -1,9 +1,9 @@ | |||
# -*- coding: utf-8 -*- | |||
from typing import Dict, Iterable, Callable, Union | |||
from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set | |||
from .tree import Tree | |||
from .lexer import Token | |||
from .parsers.lalr_puppet import ParserPuppet | |||
class LarkError(Exception): | |||
pass | |||
@@ -21,27 +21,37 @@ class LexError(LarkError): | |||
pass | |||
T = TypeVar('T') | |||
class UnexpectedInput(LarkError): | |||
line: int | |||
column: int | |||
pos_in_stream: int | |||
state: Any | |||
def get_context(self, text: str, span: int = ...): | |||
... | |||
def match_examples( | |||
self, | |||
parse_fn: Callable[[str], Tree], | |||
examples: Dict[str, Iterable[str]] | |||
): | |||
self, | |||
parse_fn: Callable[[str], Tree], | |||
examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], | |||
token_type_match_fallback: bool = False, | |||
use_accepts: bool = False, | |||
) -> T: | |||
... | |||
class UnexpectedToken(ParseError, UnexpectedInput): | |||
pass | |||
expected: Set[str] | |||
considered_rules: Set[str] | |||
puppet: ParserPuppet | |||
accepts: Set[str] | |||
class UnexpectedCharacters(LexError, UnexpectedInput): | |||
line: int | |||
column: int | |||
allowed: Set[str] | |||
considered_tokens: Set[Any] | |||
class VisitError(LarkError): | |||
@@ -0,0 +1,22 @@ | |||
from typing import Set, Dict, Any | |||
from lark import Token, Tree | |||
class ParserPuppet(object): | |||
""" | |||
Provides an interface to interactively step through the parser (LALR(1) only for now) | |||
Accessible via `UnexpectedToken.puppet` (raised by the parser on token error) | |||
""" | |||
def feed_token(self, token: Token): ... | |||
def copy(self) -> ParserPuppet: ... | |||
def pretty(self) -> str: ... | |||
def choices(self) -> Dict[str, Any]: ... | |||
def accepts(self) -> Set[str]: ... | |||
def resume_parse(self) -> Tree: ... |
@@ -1,3 +1,5 @@ | |||
import logging | |||
from .utils import STRING_TYPE | |||
###{standalone | |||
@@ -37,34 +39,46 @@ class UnexpectedInput(LarkError): | |||
after = text[pos:end].split(b'\n', 1)[0] | |||
return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") | |||
def match_examples(self, parse_fn, examples, token_type_match_fallback=False): | |||
def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): | |||
""" Given a parser instance and a dictionary mapping some label with | |||
some malformed syntax examples, it'll return the label for the | |||
example that bests matches the current error. | |||
It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility. | |||
""" | |||
assert self.state is not None, "Not supported for this exception" | |||
if isinstance(examples, dict): | |||
examples = examples.items() | |||
candidate = (None, False) | |||
for label, example in examples.items(): | |||
for i, (label, example) in enumerate(examples): | |||
assert not isinstance(example, STRING_TYPE) | |||
for malformed in example: | |||
for j, malformed in enumerate(example): | |||
try: | |||
parse_fn(malformed) | |||
except UnexpectedInput as ut: | |||
if ut.state == self.state: | |||
if use_accepts and ut.accepts != self.accepts: | |||
logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % | |||
(self.state, self.accepts, ut.accepts, i, j)) | |||
continue | |||
try: | |||
if ut.token == self.token: # Try exact match first | |||
logging.debug("Exact Match at example [%s][%s]" % (i, j)) | |||
return label | |||
if token_type_match_fallback: | |||
# Fallback to token types match | |||
if (ut.token.type == self.token.type) and not candidate[-1]: | |||
logging.debug("Token Type Fallback at example [%s][%s]" % (i, j)) | |||
candidate = label, True | |||
except AttributeError: | |||
pass | |||
if not candidate[0]: | |||
logging.debug("Same State match at example [%s][%s]" % (i, j)) | |||
candidate = label, False | |||
return candidate[0] | |||
@@ -72,19 +86,20 @@ class UnexpectedInput(LarkError): | |||
class UnexpectedCharacters(LexError, UnexpectedInput): | |||
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): | |||
if isinstance(seq, bytes): | |||
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column) | |||
else: | |||
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) | |||
self.line = line | |||
self.column = column | |||
self.allowed = allowed | |||
self.considered_tokens = considered_tokens | |||
self.pos_in_stream = lex_pos | |||
self.state = state | |||
self.allowed = allowed | |||
self.considered_tokens = considered_tokens | |||
if isinstance(seq, bytes): | |||
_s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") | |||
else: | |||
_s = seq[lex_pos] | |||
message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column) | |||
message += '\n\n' + self.get_context(seq) | |||
if allowed: | |||
message += '\nExpecting: %s\n' % allowed | |||
@@ -97,18 +112,23 @@ class UnexpectedCharacters(LexError, UnexpectedInput): | |||
class UnexpectedToken(ParseError, UnexpectedInput): | |||
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): | |||
self.token = token | |||
self.expected = expected # XXX str shouldn't necessary | |||
self.line = getattr(token, 'line', '?') | |||
self.column = getattr(token, 'column', '?') | |||
self.considered_rules = considered_rules | |||
self.state = state | |||
self.pos_in_stream = getattr(token, 'pos_in_stream', None) | |||
self.state = state | |||
self.token = token | |||
self.expected = expected # XXX deprecate? `accepts` is better | |||
self.considered_rules = considered_rules | |||
self.puppet = puppet | |||
# TODO Only calculate `accepts()` when we need to display it to the user | |||
# This will improve performance when doing automatic error handling | |||
self.accepts = puppet and puppet.accepts() | |||
message = ("Unexpected token %r at line %s, column %s.\n" | |||
"Expected one of: \n\t* %s\n" | |||
% (token, self.line, self.column, '\n\t* '.join(self.expected))) | |||
% (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) | |||
super(UnexpectedToken, self).__init__(message) | |||
@@ -85,7 +85,7 @@ TERMINALS = { | |||
'RULE': '!?[_?]?[a-z][_a-z0-9]*', | |||
'TERMINAL': '_?[A-Z][_A-Z0-9]*', | |||
'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', | |||
'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, | |||
'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, | |||
'_NL': r'(\r?\n)+\s*', | |||
'WS': r'[ \t]+', | |||
'COMMENT': r'\s*//[^\n]*', | |||
@@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace): | |||
term_name = None | |||
elif isinstance(p, PatternRE): | |||
            if p in self.term_reverse: # Kind of a wierd placement
            if p in self.term_reverse: # Kind of a weird placement
term_name = self.term_reverse[p].name | |||
else: | |||
assert False, p | |||
@@ -409,6 +409,13 @@ def _literal_to_pattern(literal): | |||
flags = v[flag_start:] | |||
assert all(f in _RE_FLAGS for f in flags), flags | |||
if literal.type == 'STRING' and '\n' in v: | |||
raise GrammarError('You cannot put newlines in string literals') | |||
if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags: | |||
raise GrammarError('You can only use newlines in regular expressions ' | |||
'with the `x` (verbose) flag') | |||
v = v[:flag_start] | |||
assert v[0] == v[-1] and v[0] in '"/' | |||
x = v[1:-1] | |||
@@ -417,9 +424,11 @@ def _literal_to_pattern(literal): | |||
if literal.type == 'STRING': | |||
s = s.replace('\\\\', '\\') | |||
return { 'STRING': PatternStr, | |||
'REGEXP': PatternRE }[literal.type](s, flags) | |||
return PatternStr(s, flags) | |||
elif literal.type == 'REGEXP': | |||
return PatternRE(s, flags) | |||
else: | |||
assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' | |||
@inline_args | |||
@@ -841,7 +850,7 @@ class GrammarLoader: | |||
if len(stmt.children) > 1: | |||
path_node, arg1 = stmt.children | |||
else: | |||
path_node, = stmt.children | |||
path_node ,= stmt.children | |||
arg1 = None | |||
if isinstance(arg1, Tree): # Multi import | |||
@@ -59,10 +59,10 @@ class _Parser: | |||
try: | |||
return states[state][token.type] | |||
except KeyError: | |||
expected = [s for s in states[state].keys() if s.isupper()] | |||
expected = {s for s in states[state].keys() if s.isupper()} | |||
try: | |||
puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) | |||
except NameError: | |||
except NameError: # For standalone parser | |||
puppet = None | |||
raise UnexpectedToken(token, expected, state=state, puppet=puppet) | |||
@@ -3,8 +3,10 @@ | |||
from copy import deepcopy | |||
from .lalr_analysis import Shift, Reduce | |||
from .. import Token | |||
class ParserPuppet: | |||
class ParserPuppet(object): | |||
def __init__(self, parser, state_stack, value_stack, start, stream, set_state): | |||
self.parser = parser | |||
self._state_stack = state_stack | |||
@@ -16,7 +18,7 @@ class ParserPuppet: | |||
self.result = None | |||
def feed_token(self, token): | |||
"""Advance the parser state, as if it just recieved `token` from the lexer | |||
"""Advance the parser state, as if it just received `token` from the lexer | |||
""" | |||
end_state = self.parser.parse_table.end_states[self._start] | |||
@@ -66,14 +68,27 @@ class ParserPuppet: | |||
self._set_state, | |||
) | |||
def pretty(): | |||
print("Puppet choices:") | |||
for k, v in self.choices.items(): | |||
print('\t-', k, '->', v) | |||
print('stack size:', len(self._state_stack)) | |||
def pretty(self): | |||
out = ["Puppet choices:"] | |||
for k, v in self.choices().items(): | |||
out.append('\t- %s -> %s' % (k, v)) | |||
out.append('stack size: %s' % len(self._state_stack)) | |||
return '\n'.join(out) | |||
def choices(self): | |||
return self.parser.parse_table.states[self._state_stack[-1]] | |||
def accepts(self): | |||
accepts = set() | |||
for t in self.choices(): | |||
new_puppet = self.copy() | |||
try: | |||
new_puppet.feed_token(Token(t, '')) | |||
except KeyError: | |||
pass | |||
else: | |||
accepts.add(t) | |||
return accepts | |||
def resume_parse(self): | |||
return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack) |
@@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER): | |||
tree = l.parse('aA') | |||
self.assertEqual(tree.children, ['a', 'A']) | |||
def test_token_flags_verbose(self): | |||
g = _Lark(r"""start: NL | ABC | |||
ABC: / [a-z] /x | |||
NL: /\n/ | |||
""") | |||
x = g.parse('a') | |||
self.assertEqual(x.children, ['a']) | |||
def test_token_flags_verbose_multiline(self): | |||
g = _Lark(r"""start: ABC | |||
ABC: / a b c | |||
d | |||
e f | |||
/x | |||
""") | |||
x = g.parse('abcdef') | |||
self.assertEqual(x.children, ['abcdef']) | |||
def test_token_multiline_only_works_with_x_flag(self): | |||
g = r"""start: ABC | |||
ABC: / a b c | |||
d | |||
e f | |||
/i | |||
""" | |||
self.assertRaises( GrammarError, _Lark, g) | |||
@unittest.skipIf(PARSER == 'cyk', "No empty rules") | |||
def test_twice_empty(self): | |||