Browse Source

Merge branch 'master' of https://github.com/lark-parser/lark into earley_custom

 Conflicts:
	tests/test_parser.py
MegaIng1 3 years ago
parent
commit
26e03b9ff8
16 changed files with 179 additions and 42 deletions
  1. +2 -2    README.md
  2. +79 -0   examples/advanced/error_reporting_earley.py
  3. +1 -1    examples/advanced/error_reporting_lalr.py
  4. +1 -1    lark-stubs/lark.pyi
  5. +4 -1    lark-stubs/lexer.pyi
  6. +1 -1    lark/__init__.py
  7. +23 -11  lark/exceptions.py
  8. +8 -7    lark/lexer.py
  9. +0 -3    lark/parser_frontends.py
  10. +10 -6  lark/parsers/earley.py
  11. +8 -2   lark/parsers/lalr_parser.py
  12. +2 -1   lark/parsers/xearley.py
  13. +9 -1   lark/tree_matcher.py
  14. +2 -2   setup.py
  15. +1 -0   tests/__main__.py
  16. +28 -3  tests/test_parser.py

+2 -2   README.md   View File

@@ -106,7 +106,7 @@ Lark is great at handling ambiguity. Here is the result of parsing the phrase "f
- MyPy support using type stubs
- And much more!

-See the full list of [features here](https://lark-parser.readthedocs.io/en/latest/features/)
+See the full list of [features here](https://lark-parser.readthedocs.io/en/latest/features.html)


### Comparison to other libraries
@@ -132,7 +132,7 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail
|:--------|:----------|:----|:--------|:------------|:------------|:----------|:----------
| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | Yes! (LALR only) |
| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | No |
-| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | No |
+| [PyParsing](https://github.com/pyparsing/pyparsing) | PEG | Combinators | No | No | No\* | No | No |
| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | No |
| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | No |
| [ANTLR](https://github.com/antlr/antlr4) | LL(*) | EBNF | Yes | No | Yes? | Yes | No |


+79 -0   examples/advanced/error_reporting_earley.py   View File

@@ -0,0 +1,79 @@
"""
Example-Driven Error Reporting
==============================

A demonstration of example-driven error reporting with the Earley parser
(See also: error_reporting_lalr.py)
"""
from lark import Lark, UnexpectedInput

from _json_parser import json_grammar # Using the grammar from the json_parser example

json_parser = Lark(json_grammar)

class JsonSyntaxError(SyntaxError):
def __str__(self):
context, line, column = self.args
return '%s at line %s, column %s.\n\n%s' % (self.label, line, column, context)

class JsonMissingValue(JsonSyntaxError):
label = 'Missing Value'

class JsonMissingOpening(JsonSyntaxError):
label = 'Missing Opening'

class JsonMissingClosing(JsonSyntaxError):
label = 'Missing Closing'

class JsonMissingComma(JsonSyntaxError):
label = 'Missing Comma'

class JsonTrailingComma(JsonSyntaxError):
label = 'Trailing Comma'


def parse(json_text):
try:
j = json_parser.parse(json_text)
except UnexpectedInput as u:
exc_class = u.match_examples(json_parser.parse, {
JsonMissingOpening: ['{"foo": ]}',
'{"foor": }}',
'{"foo": }'],
JsonMissingClosing: ['{"foo": [}',
'{',
'{"a": 1',
'[1'],
JsonMissingComma: ['[1 2]',
'[false 1]',
'["b" 1]',
'{"a":true 1:4}',
'{"a":1 1:4}',
'{"a":"b" 1:4}'],
JsonTrailingComma: ['[,]',
'[1,]',
'[1,2,]',
'{"foo":1,}',
'{"foo":false,"bar":true,}']
}, use_accepts=True)
if not exc_class:
raise
raise exc_class(u.get_context(json_text), u.line, u.column)


def test():
try:
parse('{"example1": "value"')
except JsonMissingClosing as e:
print(e)

try:
parse('{"example2": ] ')
except JsonMissingOpening as e:
print(e)


if __name__ == '__main__':
test()
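As a quick illustration of how match_examples() decides (a sketch reusing json_parser from the file above): an input always matches an example identical to itself, because both parses fail in the same parser state.

try:
    json_parser.parse('{"foo": }')
except UnexpectedInput as u:
    label = u.match_examples(json_parser.parse, {'missing-value': ['{"foo": }']},
                             use_accepts=True)
    print(label)  # 'missing-value' -- identical input, identical failure state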



+1 -1   examples/advanced/error_reporting_lalr.py   View File

@@ -3,7 +3,7 @@ Example-Driven Error Reporting
==============================

A demonstration of example-driven error reporting with the LALR parser
(See also: error_reporting_earley.py)
"""
from lark import Lark, UnexpectedInput



+1 -1   lark-stubs/lark.pyi   View File

@@ -63,7 +63,7 @@ class Lark:
        *,
        start: Union[None, str, List[str]] = "start",
        parser: Literal["earley", "lalr", "cyk"] = "auto",
-        lexer: Union[Literal["auto", "standard", "contextual", "dynamic", "dynamic_complete"], Lexer] = "auto",
+        lexer: Union[Literal["auto", "standard", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]] = "auto",
        transformer: Optional[Transformer] = None,
        postlex: Optional[PostLex] = None,
        ambiguity: Literal["explicit", "resolve"] = "resolve",
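In other words, lexer= accepts a Lexer subclass rather than an instance; Lark instantiates it with the lexer configuration itself. A minimal sketch of what the corrected stub permits (the UpperLexer class is made up for illustration):

from lark import Lark, Token
from lark.lexer import Lexer

class UpperLexer(Lexer):  # hypothetical custom lexer, passed as a class
    def __init__(self, lexer_conf):
        pass

    def lex(self, data):
        for ch in data:
            yield Token('CHAR', ch.upper())

parser = Lark('start: CHAR+\n%declare CHAR', parser='lalr', lexer=UpperLexer)
print(parser.parse('ab'))  # Tree('start', [Token('CHAR', 'A'), Token('CHAR', 'B')])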


+4 -1   lark-stubs/lexer.pyi   View File

@@ -85,6 +85,9 @@ class Token(str):
    end_column: int
    end_pos: int

+    def __init__(self, type_: str, value: Any, pos_in_stream: int = None, line: int = None, column: int = None, end_line: int = None, end_column: int = None, end_pos: int = None):
+        ...
+
    def update(self, type_: Optional[str] = None, value: Optional[str] = None) -> Token:
        ...

@@ -136,7 +139,7 @@ class TraditionalLexer(Lexer):
    def lex(self, stream: str) -> Iterator[Token]:
        ...

-    def next_token(self, lex_state: Any) -> Token:
+    def next_token(self, lex_state: Any, parser_state: Any = None) -> Token:
        ...

class ContextualLexer(Lexer):
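The added __init__ stub documents the constructor Token (a str subclass) already accepts at runtime; for instance:

from lark import Token

t = Token('NUMBER', '42', pos_in_stream=0, line=1, column=1)
assert t == '42'           # Token compares like the underlying string
assert t.type == 'NUMBER'  # while carrying lexer metadata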


+1 -1   lark/__init__.py   View File

@@ -3,7 +3,7 @@ from .tree import Tree
from .visitors import Transformer, Visitor, v_args, Discard, Transformer_NonRecursive
from .visitors import InlineTransformer, inline_args  # XXX Deprecated
from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken,
-                         UnexpectedInput, UnexpectedCharacters, LarkError)
+                         UnexpectedInput, UnexpectedCharacters, UnexpectedEOF, LarkError)
from .lexer import Token
from .lark import Lark



+23 -11   lark/exceptions.py   View File

@@ -19,14 +19,6 @@ class LexError(LarkError):
    pass


-class UnexpectedEOF(ParseError):
-    def __init__(self, expected):
-        self.expected = expected
-
-        message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
-        super(UnexpectedEOF, self).__init__(message)
-
-
class UnexpectedInput(LarkError):
    """UnexpectedInput Error.

@@ -47,6 +39,7 @@ class UnexpectedInput(LarkError):
        The parser doesn't hold a copy of the text it has to parse,
        so you have to provide it again
        """
+        assert self.pos_in_stream is not None, self
        pos = self.pos_in_stream
        start = max(pos - span, 0)
        end = pos + span
@@ -91,7 +84,7 @@ class UnexpectedInput(LarkError):
                    parse_fn(malformed)
                except UnexpectedInput as ut:
                    if ut.state == self.state:
-                        if use_accepts and ut.accepts != self.accepts:
+                        if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts:
                            logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
                                         (self.state, self.accepts, ut.accepts, i, j))
                            continue
@@ -108,15 +101,29 @@ class UnexpectedInput(LarkError):

                        except AttributeError:
                            pass
-                        if not candidate[0]:
+                        if candidate[0] is None:
                            logger.debug("Same State match at example [%s][%s]" % (i, j))
                            candidate = label, False

        return candidate[0]

+class UnexpectedEOF(ParseError, UnexpectedInput):
+    def __init__(self, expected, state=None):
+        self.expected = expected
+        self.state = state
+        from .lexer import Token
+        self.token = Token("<EOF>", "")  # , line=-1, column=-1, pos_in_stream=-1)
+        self.pos_in_stream = -1
+        self.line = -1
+        self.column = -1
+
+        message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
+        super(UnexpectedEOF, self).__init__(message)
+

class UnexpectedCharacters(LexError, UnexpectedInput):
    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
+        # TODO considered_tokens and allowed can be figured out using state
        self.line = line
        self.column = column
        self.pos_in_stream = lex_pos
@@ -147,7 +154,8 @@ class UnexpectedToken(ParseError, UnexpectedInput):

    see: :ref:`ParserPuppet`.
    """
-    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
+    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None):
+        # TODO considered_rules and expected can be figured out using state
        self.line = getattr(token, 'line', '?')
        self.column = getattr(token, 'column', '?')
        self.pos_in_stream = getattr(token, 'pos_in_stream', None)
@@ -157,6 +165,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):
        self.expected = expected  # XXX deprecate? `accepts` is better
        self.considered_rules = considered_rules
        self.puppet = puppet
+        self.token_history = token_history

        # TODO Only calculate `accepts()` when we need to display it to the user
        # This will improve performance when doing automatic error handling
@@ -166,6 +175,9 @@ class UnexpectedToken(ParseError, UnexpectedInput):
                   "Expected one of: \n\t* %s\n"
                   % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))

+        if self.token_history:
+            message += "Previous tokens: %r\n" % token_history
+
        super(UnexpectedToken, self).__init__(message)
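The practical effect of reparenting UnexpectedEOF under UnexpectedInput: it now carries a synthetic <EOF> token and placeholder positions, so generic UnexpectedInput handlers (and match_examples()) see it too. A small sketch, assuming the default Earley parser:

from lark import Lark, UnexpectedInput

parser = Lark('start: "a" "b"')      # Earley by default
try:
    parser.parse('a')                # input ends before "b" arrives
except UnexpectedInput as u:         # UnexpectedEOF is caught here as well
    print(type(u).__name__, u.line, u.column)   # e.g. UnexpectedEOF -1 -1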




+8 -7   lark/lexer.py   View File

@@ -338,12 +338,12 @@ class TraditionalLexer(Lexer):
        if m:
            return m.group(0), type_from_index[m.lastindex]

-    def lex(self, state, _parser_state):
+    def lex(self, state, parser_state):
        with suppress(EOFError):
            while True:
-                yield self.next_token(state)
+                yield self.next_token(state, parser_state)

-    def next_token(self, lex_state):
+    def next_token(self, lex_state, parser_state=None):
        line_ctr = lex_state.line_ctr
        while line_ctr.char_pos < len(lex_state.text):
            res = self.match(lex_state.text, line_ctr.char_pos)
@@ -352,7 +352,8 @@ class TraditionalLexer(Lexer):
                if not allowed:
                    allowed = {"<END-OF-FILE>"}
                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
-                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
+                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
+                                           state=parser_state)

            value, type_ = res
@@ -428,14 +429,14 @@ class ContextualLexer(Lexer):
        try:
            while True:
                lexer = self.lexers[parser_state.position]
-                yield lexer.next_token(lexer_state)
+                yield lexer.next_token(lexer_state, parser_state)
        except EOFError:
            pass
        except UnexpectedCharacters as e:
            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
            # This tests the input against the global context, to provide a nicer error.
-            token = self.root_lexer.next_token(lexer_state)
-            raise UnexpectedToken(token, e.allowed, state=parser_state.position)
+            token = self.root_lexer.next_token(lexer_state, parser_state)
+            raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token])


class LexerThread:
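Threading parser_state into the lexer means a lexing failure now records where the parse stood and which token came before it. A rough sketch of what this surfaces (grammar and terminal names here are illustrative):

from lark import Lark, UnexpectedInput

parser = Lark('start: "a"+', parser='lalr', lexer='contextual')
try:
    parser.parse('a!')               # '!' matches no terminal
except UnexpectedInput as u:
    print(getattr(u, 'token_history', None))  # e.g. [Token('A', 'a')]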


+0 -3   lark/parser_frontends.py   View File

@@ -179,9 +179,6 @@ class Earley_WithLexer(WithLexer):
        tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)

-    def make_lexer(self, text):
-        return WithLexer.make_lexer(self, text).lex(None)
-
    def match(self, term, token):
        return term.name == token.type



+10 -6   lark/parsers/earley.py   View File

@@ -146,7 +146,7 @@ class Parser:
                        column.add(new_item)
                        items.append(new_item)

-    def _parse(self, stream, columns, to_scan, start_symbol=None):
+    def _parse(self, lexer, columns, to_scan, start_symbol=None):
        def is_quasi_complete(item):
            if item.is_complete:
                return True
@@ -245,7 +245,7 @@
            if not next_set and not next_to_scan:
                expect = {i.expect.name for i in to_scan}
-                raise UnexpectedToken(token, expect, considered_rules = set(to_scan))
+                raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))

            return next_to_scan

@@ -261,20 +261,24 @@
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
+        expects = {i.expect for i in to_scan}
        i = 0
-        for token in stream:
+        for token in lexer.lex(expects):
            self.predict_and_complete(i, to_scan, columns, transitives)

            to_scan = scan(i, token, to_scan)
            i += 1

+            expects.clear()
+            expects |= {i.expect for i in to_scan}
+
        self.predict_and_complete(i, to_scan, columns, transitives)

        ## Column is now the final column in the parse.
        assert i == len(columns)-1
        return to_scan

-    def parse(self, stream, start):
+    def parse(self, lexer, start):
        assert start, start
        start_symbol = NonTerminal(start)

@@ -291,7 +295,7 @@
            else:
                columns[0].add(item)

-        to_scan = self._parse(stream, columns, to_scan, start_symbol)
+        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

        # If the parse was successful, the start
        # symbol should have been completed in the last step of the Earley cycle, and will be in
@@ -299,7 +303,7 @@
        solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
        if not solutions:
            expected_terminals = [t.expect for t in to_scan]
-            raise UnexpectedEOF(expected_terminals)
+            raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))

        if self.debug:
            from .earley_forest import ForestToPyDotVisitor
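Note the new contract: the Earley parser now pulls tokens from a lexer object rather than a plain token stream, and it hands lex() the set of terminals it can currently scan, mutating that set in place after every token. A minimal conforming lexer might look like this (a sketch; the class name is not part of lark's API):

class PrecomputedLexer:
    def __init__(self, tokens):
        self.tokens = tokens

    def lex(self, expects):
        for tok in self.tokens:
            # `expects` is refreshed by the parser between yields, so a
            # smarter lexer could choose which terminals to match next.
            yield tok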


+8 -2   lark/parsers/lalr_parser.py   View File

@@ -3,7 +3,7 @@
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
from copy import deepcopy, copy
-from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
+from ..exceptions import UnexpectedInput, UnexpectedToken
from ..lexer import Token

from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
@@ -62,6 +62,12 @@
    def position(self):
        return self.state_stack[-1]

+    # Necessary for match_examples() to work
+    def __eq__(self, other):
+        if not isinstance(other, ParserState):
+            return False
+        return self.position == other.position
+
    def __copy__(self):
        return type(self)(
            self.parse_conf,
@@ -86,7 +92,7 @@
            action, arg = states[state][token.type]
        except KeyError:
            expected = {s for s in states[state].keys() if s.isupper()}
-            raise UnexpectedToken(token, expected, state=state, puppet=None)
+            raise UnexpectedToken(token, expected, state=self, puppet=None)

        assert arg != end_state
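Since errors now carry the whole ParserState (the state=self change above), match_examples() needs states from independent parse runs to compare equal; __eq__ provides that by comparing the tops of the state stacks. A hedged sketch:

from lark import Lark, UnexpectedInput

p = Lark('start: "a" "b" "c"', parser='lalr')

def failing_state(text):
    try:
        p.parse(text)
    except UnexpectedInput as u:
        return u.state               # a ParserState object in this version

# Two runs fail at the same spot: distinct objects, but equal states.
assert failing_state('ab') == failing_state('ab')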



+2 -1   lark/parsers/xearley.py   View File

@@ -113,7 +113,8 @@ class Parser(BaseParser):
                del delayed_matches[i+1]  # No longer needed, so unburden memory

            if not next_set and not delayed_matches and not next_to_scan:
-                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, set(to_scan))
+                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan},
+                                           set(to_scan), state=frozenset(i.s for i in to_scan))

            return next_to_scan



+9 -1   lark/tree_matcher.py   View File

@@ -69,6 +69,14 @@ def parse_rulename(s):
    return name, args


+class ChildrenLexer:
+    def __init__(self, children):
+        self.children = children
+
+    def lex(self, parser_state):
+        return self.children
+
+
class TreeMatcher:
    """Match the elements of a tree node, based on an ontology
    provided by a Lark grammar.
@@ -173,6 +181,6 @@
            self._parser_cache[rulename] = parser

        # find a full derivation
-        unreduced_tree = parser.parse(tree.children, rulename)
+        unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename)
        assert unreduced_tree.data == rulename
        return unreduced_tree

+2 -2   setup.py   View File

@@ -29,8 +29,8 @@ setup(
    description = "a modern parsing library",
    license = "MIT",
    keywords = "Earley LALR parser parsing ast",
-    url = "https://github.com/erezsh/lark",
-    download_url = "https://github.com/erezsh/lark/tarball/master",
+    url = "https://github.com/lark-parser/lark",
+    download_url = "https://github.com/lark-parser/lark/tarball/master",
    long_description='''
Lark is a modern general-purpose parsing library for Python.



+1 -0   tests/__main__.py   View File

@@ -9,6 +9,7 @@ from .test_tools import TestStandalone
from .test_cache import TestCache
from .test_grammar import TestGrammar
from .test_reconstructor import TestReconstructor
+from .test_tree_forest_transformer import TestTreeForestTransformer

try:
    from .test_nearley.test_nearley import TestNearley


+28 -3   tests/test_parser.py   View File

@@ -322,7 +322,7 @@ class TestParsers(unittest.TestCase):

    def test_alias(self):
        Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)

    def test_backwards_custom_lexer(self):
        class OldCustomLexer(Lexer):
            def __init__(self, lexer_conf):
@@ -330,12 +330,12 @@ class TestParsers(unittest.TestCase):

            def lex(self, text):
                yield Token('A', 'A')

        p = Lark("""
        start: A
        %declare A
        """, parser='lalr', lexer=OldCustomLexer)

        r = p.parse('')
        self.assertEqual(r, Tree('start', [Token('A', 'A')]))

@@ -2361,6 +2361,31 @@ def _make_parser_test(LEXER, PARSER):
            self.assertEqual(a.line, 1)
            self.assertEqual(b.line, 2)

+        @unittest.skipIf(PARSER=='cyk', "match_examples() not supported for CYK")
+        def test_match_examples(self):
+            p = _Lark(r"""
+                start: "a" "b" "c"
+            """)
+
+            def match_error(s):
+                try:
+                    _ = p.parse(s)
+                except UnexpectedInput as u:
+                    return u.match_examples(p.parse, {
+                        0: ['abe'],
+                        1: ['ab'],
+                        2: ['cbc', 'dbc'],
+                    })
+                assert False
+
+            assert match_error("abe") == 0
+            assert match_error("ab") == 1
+            assert match_error("bbc") == 2
+            assert match_error("cbc") == 2
+            self.assertEqual( match_error("dbc"), 2 )
+            self.assertEqual( match_error("ebc"), 2 )


        @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
        def test_unicode_class(self):
            "Tests that character classes from the `regex` module work correctly."

