Browse Source

Merge branch 'error-handling' of https://github.com/MegaIng/lark into MegaIng-error-handling

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.10.0
Erez Sh 4 years ago
parent
commit
c19c803340
8 changed files with 126 additions and 33 deletions
  1. +20
    -10
      lark-stubs/exceptions.pyi
  2. +0
    -0
      lark-stubs/parsers/__init__.pyi
  3. +21
    -0
      lark-stubs/parsers/lalr_puppet.pyi
  4. +18
    -7
      lark/exceptions.py
  5. +15
    -6
      lark/load_grammar.py
  6. +4
    -3
      lark/parsers/lalr_parser.py
  7. +22
    -7
      lark/parsers/lalr_puppet.py
  8. +26
    -0
      tests/test_parser.py

+ 20
- 10
lark-stubs/exceptions.pyi View File

@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-

from typing import Dict, Iterable, Callable, Union
from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set
from .tree import Tree
from .lexer import Token
from .parsers.lalr_puppet import ParserPuppet

class LarkError(Exception):
pass
@@ -21,27 +21,37 @@ class LexError(LarkError):
pass


T = TypeVar('T')


class UnexpectedInput(LarkError):
line: int
column: int
pos_in_stream: int
state: Any

def get_context(self, text: str, span: int = ...):
...

def match_examples(
self,
parse_fn: Callable[[str], Tree],
examples: Dict[str, Iterable[str]]
):
self,
parse_fn: Callable[[str], Tree],
examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]],
token_type_match_fallback: bool = False,
use_accepts: bool = False,
) -> T:
...


class UnexpectedToken(ParseError, UnexpectedInput):
pass

expected: Set[str]
considered_rules: Set[str]
puppet: ParserPuppet
accepts: Set[str]

class UnexpectedCharacters(LexError, UnexpectedInput):
line: int
column: int
allowed: Set[str]
considered_tokens: Set[Any]


class VisitError(LarkError):


+ 0
- 0
lark-stubs/parsers/__init__.pyi View File


+ 21
- 0
lark-stubs/parsers/lalr_puppet.pyi View File

@@ -0,0 +1,21 @@
from typing import Set, Dict, Any

from lark import Token, Tree


class ParserPuppet(object):
"""
Represents a LalrParser that can be stepped through.
Shouldn't be instantiated by hand, but is accessible as `UnexpectedToken.puppet`
"""
def feed_token(self, token: Token): ...

def copy(self) -> ParserPuppet: ...

def pretty(self) -> str: ...

def choices(self) -> Dict[str, Any]: ...

def accepts(self) -> Set[str]: ...

def resume_parse(self) -> Tree: ...

+ 18
- 7
lark/exceptions.py View File

@@ -1,3 +1,5 @@
import logging

from .utils import STRING_TYPE

###{standalone
@@ -37,36 +39,44 @@ class UnexpectedInput(LarkError):
after = text[pos:end].split(b'\n', 1)[0]
return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")

def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False):
""" Given a parser instance and a dictionary mapping some label to
some malformed syntax examples, it'll return the label for the
example that best matches the current error.
"""
assert self.state is not None, "Not supported for this exception"
if isinstance(examples, dict):
examples = examples.items()

candidate = (None, False)
for label, example in examples.items():
for i, (label, example) in enumerate(examples):
assert not isinstance(example, STRING_TYPE)

for malformed in example:
for j, malformed in enumerate(example):
try:
parse_fn(malformed)
except UnexpectedInput as ut:
if ut.state == self.state:
if ut.state == self.state and (not use_accepts or ut.accepts == self.accepts):
try:
if ut.token == self.token: # Try exact match first
logging.debug("Exact Match at example [%s][%s]" % (i, j))
return label

if token_type_match_fallback:
# Fallback to token types match
if (ut.token.type == self.token.type) and not candidate[-1]:
logging.debug("Token Type Fallback at example [%s][%s]" % (i, j))
candidate = label, True

except AttributeError:
pass
if not candidate[0]:
logging.debug("Same State match at example [%s][%s]" % (i, j))
candidate = label, False

elif ut.state == self.state:
logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
(self.state, self.accepts, ut.accepts, i, j))
return candidate[0]


@@ -96,7 +106,7 @@ class UnexpectedCharacters(LexError, UnexpectedInput):


class UnexpectedToken(ParseError, UnexpectedInput):
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, accepts=None):
self.token = token
self.expected = expected # XXX str shouldn't necessary
self.line = getattr(token, 'line', '?')
@@ -105,10 +115,11 @@ class UnexpectedToken(ParseError, UnexpectedInput):
self.state = state
self.pos_in_stream = getattr(token, 'pos_in_stream', None)
self.puppet = puppet
self.accepts = accepts

message = ("Unexpected token %r at line %s, column %s.\n"
"Expected one of: \n\t* %s\n"
% (token, self.line, self.column, '\n\t* '.join(self.expected)))
% (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))

super(UnexpectedToken, self).__init__(message)



+ 15
- 6
lark/load_grammar.py View File

@@ -85,7 +85,7 @@ TERMINALS = {
'RULE': '!?[_?]?[a-z][_a-z0-9]*',
'TERMINAL': '_?[A-Z][_A-Z0-9]*',
'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS,
'_NL': r'(\r?\n)+\s*',
'WS': r'[ \t]+',
'COMMENT': r'\s*//[^\n]*',
@@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
term_name = None

elif isinstance(p, PatternRE):
if p in self.term_reverse: # Kind of a wierd placement.name
if p in self.term_reverse: # Kind of a weird placement.name
term_name = self.term_reverse[p].name
else:
assert False, p
@@ -409,6 +409,13 @@ def _literal_to_pattern(literal):
flags = v[flag_start:]
assert all(f in _RE_FLAGS for f in flags), flags

if literal.type == 'STRING' and '\n' in v:
raise GrammarError('You cannot put newlines in string literals')

if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags:
raise GrammarError('You can only use newlines in regular expressions '
'with the `x` (verbose) flag')

v = v[:flag_start]
assert v[0] == v[-1] and v[0] in '"/'
x = v[1:-1]
@@ -417,9 +424,11 @@ def _literal_to_pattern(literal):

if literal.type == 'STRING':
s = s.replace('\\\\', '\\')

return { 'STRING': PatternStr,
'REGEXP': PatternRE }[literal.type](s, flags)
return PatternStr(s, flags)
elif literal.type == 'REGEXP':
return PatternRE(s, flags)
else:
assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'


@inline_args
@@ -841,7 +850,7 @@ class GrammarLoader:
if len(stmt.children) > 1:
path_node, arg1 = stmt.children
else:
path_node, = stmt.children
path_node ,= stmt.children
arg1 = None

if isinstance(arg1, Tree): # Multi import


+ 4
- 3
lark/parsers/lalr_parser.py View File

@@ -59,12 +59,13 @@ class _Parser:
try:
return states[state][token.type]
except KeyError:
expected = [s for s in states[state].keys() if s.isupper()]
expected = {s for s in states[state].keys() if s.isupper()}
try:
puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
accepts = puppet.accepts()
except NameError:
puppet = None
raise UnexpectedToken(token, expected, state=state, puppet=puppet)
puppet = accepts = None
raise UnexpectedToken(token, expected, state=state, puppet=puppet, accepts=accepts)

def reduce(rule):
size = len(rule.expansion)


+ 22
- 7
lark/parsers/lalr_puppet.py View File

@@ -3,8 +3,10 @@
from copy import deepcopy

from .lalr_analysis import Shift, Reduce
from .. import Token

class ParserPuppet:

class ParserPuppet(object):
def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
self.parser = parser
self._state_stack = state_stack
@@ -16,7 +18,7 @@ class ParserPuppet:
self.result = None

def feed_token(self, token):
"""Advance the parser state, as if it just recieved `token` from the lexer
"""Advance the parser state, as if it just received `token` from the lexer

"""
end_state = self.parser.parse_table.end_states[self._start]
@@ -66,14 +68,27 @@ class ParserPuppet:
self._set_state,
)

def pretty():
print("Puppet choices:")
for k, v in self.choices.items():
print('\t-', k, '->', v)
print('stack size:', len(self._state_stack))
def pretty(self):
out = ["Puppet choices:"]
for k, v in self.choices().items():
out.append('\t- %s -> %s' % (k, v))
out.append('stack size: %s' % len(self._state_stack))
return '\n'.join(out)

def choices(self):
return self.parser.parse_table.states[self._state_stack[-1]]

def accepts(self):
accepts = set()
for t in self.choices():
new_puppet = self.copy()
try:
new_puppet.feed_token(Token(t, ''))
except KeyError:
pass
else:
accepts.add(t)
return accepts

def resume_parse(self):
return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack)

+ 26
- 0
tests/test_parser.py View File

@@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER):
tree = l.parse('aA')
self.assertEqual(tree.children, ['a', 'A'])

def test_token_flags_verbose(self):
g = _Lark(r"""start: NL | ABC
ABC: / [a-z] /x
NL: /\n/
""")
x = g.parse('a')
self.assertEqual(x.children, ['a'])

def test_token_flags_verbose_multiline(self):
g = _Lark(r"""start: ABC
ABC: / a b c
d
e f
/x
""")
x = g.parse('abcdef')
self.assertEqual(x.children, ['abcdef'])

def test_token_multiline_only_works_with_x_flag(self):
g = r"""start: ABC
ABC: / a b c
d
e f
/i
"""
self.assertRaises( GrammarError, _Lark, g)

@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_twice_empty(self):


Loading…
Cancel
Save