Merge branch 'end_symbol3'

5 years ago · d4588ae538
--- a/lark/grammar.py
+++ b/lark/grammar.py
@@ -1,6 +1,7 @@
 from .utils import Serialize

 ###{standalone
 END = '_END$'

 class Symbol(Serialize):
    __slots__ = ('name',)
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -309,7 +309,7 @@ class TraditionalLexer(Lexer):
            if t.pattern.min_width == 0:
                raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))

        assert set(ignore) <= {t.name for t in terminals}
        assert set(ignore) <= {t.name for t in terminals}, (ignore, terminals)

        # Init
        self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -11,7 +11,7 @@ from .lexer import Token, TerminalDef, PatternStr, PatternRE
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR_TraditionalLexer
 from .common import LexerConf, ParserConf
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, END
 from .utils import classify, suppress, dedup_list, Str
 from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken

@@ -94,6 +94,7 @@ TERMINALS = {
    '_DECLARE': r'%declare',
    '_IMPORT': r'%import',
    'NUMBER': r'[+-]?\d+',
    '_END': r'\$',
 }

 RULES = {
@@ -130,7 +131,8 @@ RULES = {
              'nonterminal',
              'literal',
              'range',
              'template_usage'],
              'template_usage',
              'end'],

    'terminal': ['TERMINAL'],
    'nonterminal': ['RULE'],
@@ -139,6 +141,7 @@ RULES = {

    'maybe': ['_LBRA expansions _RBRA'],
    'range': ['STRING _DOTDOT STRING'],
    'end': ['_END'],

    'template_usage': ['RULE _LBRACE _template_args _RBRACE'],
    '_template_args': ['value',
@@ -299,6 +302,9 @@ class CanonizeTree(Transformer_InPlace):
        tokenmods, value = args
        return tokenmods + [value]

    def end(self):
        """Return a 'TERMINAL' token whose value is the END symbol name.

        NOTE(review): this callback takes no child arguments -- presumably
        because the `end` rule's only child, `_END`, is filtered out before
        the callback runs; confirm against how CanonizeTree callbacks are
        invoked.
        """
        return Token('TERMINAL', END)

 class PrepareAnonTerminals(Transformer_InPlace):
    "Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them"

@@ -807,6 +813,7 @@ class GrammarLoader:

        term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs]
        term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs]
        term_defs.append((END, (None, 0)))
        rule_defs = [options_from_rule(*x) for x in rule_defs]

        # Execute statements
@@ -899,7 +906,7 @@ class GrammarLoader:
                raise GrammarError("Terminal '%s' defined more than once" % name)
            terminal_names.add(name)

        if set(ignore_names) > terminal_names:
        if set(ignore_names) - terminal_names:
            raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(ignore_names) - terminal_names))

        resolve_term_references(term_defs)
--- a/lark/parsers/grammar_analysis.py
+++ b/lark/parsers/grammar_analysis.py
@@ -2,7 +2,7 @@ from collections import Counter, defaultdict

 from ..utils import bfs, fzset, classify
 from ..exceptions import GrammarError
 from ..grammar import Rule, Terminal, NonTerminal
 from ..grammar import Rule, Terminal, NonTerminal, END


 class RulePtr(object):
@@ -125,7 +125,7 @@ class GrammarAnalyzer(object):
    def __init__(self, parser_conf, debug=False):
        self.debug = debug

        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal(END)])
                      for start in parser_conf.start}

        rules = parser_conf.rules + list(root_rules.values())
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -13,7 +13,7 @@ from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
 from ..exceptions import GrammarError

 from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet
 from ..grammar import Rule
 from ..grammar import Rule, END

 ###{standalone

@@ -178,7 +178,7 @@ class LALR_Analyzer(GrammarAnalyzer):
            assert(len(root.kernel) == 1)
            for rp in root.kernel:
                assert(rp.index == 0)
                self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])
                self.directly_reads[(root, rp.next)] = set([ Terminal(END) ])

        for state in self.lr0_states:
            seen = set()
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -5,6 +5,7 @@
 from ..exceptions import UnexpectedToken
 from ..lexer import Token
 from ..utils import Enumerator, Serialize
 from ..grammar import END

 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable

@@ -105,12 +106,16 @@ class _Parser:

            raise

        token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
        token = Token.new_borrow_pos(END, None, token) if token else Token(END, None, 0, 1, 1)
        while True:
            _action, arg = get_action(token)
            assert(_action is Reduce)
            reduce(arg)
            if state_stack[-1] == end_state:
                return value_stack[-1]
            if _action is Shift:
                state_stack.append(arg)
                value_stack.append(token)
            else:
                assert(_action is Reduce)
                reduce(arg)
                if state_stack[-1] == end_state:
                    return value_stack[-1]

 ###}
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1737,6 +1737,42 @@ def _make_parser_test(LEXER, PARSER):
            """
            parser = _Lark(grammar)

        @unittest.skipIf(PARSER != 'lalr', "Using the end symbol currently works for LALR only")
        def test_end_symbol(self):
            # Rule `a` is pinned to the end symbol `$`; the checks below expect
            # the input 'a' to parse and the input 'ab' to be rejected.
            end_grammar = """
                start: a b?
                a: "a" $
                b: "b"
            """
            end_parser = _Lark(end_grammar)

            parsed = end_parser.parse('a')
            self.assertEqual(parsed, Tree('start', [Tree('a', [])]))

            with self.assertRaises(UnexpectedInput):
                end_parser.parse('ab')

        @unittest.skipIf(PARSER != 'lalr', "Using the end symbol currently works for LALR only")
        def test_end_symbol2(self):
            # Here `$` is one alternative inside a group: `a` is "a" followed
            # by either "x" or the end symbol. The checks expect 'axa' to
            # yield two `a` subtrees and 'ab' to be rejected.
            alt_grammar = """
                start: (a|b)+
                a: "a" ("x"|$)
                b: "b"
            """
            alt_parser = _Lark(alt_grammar)

            parsed = alt_parser.parse('axa')
            two_a_nodes = [Tree('a', []), Tree('a', [])]
            self.assertEqual(parsed, Tree('start', two_a_nodes))

            with self.assertRaises(UnexpectedInput):
                alt_parser.parse('ab')

        @unittest.skipIf(PARSER != 'lalr', "Using the end symbol currently works for LALR only")
        def test_end_symbol3(self):
            # The end symbol is wrapped in its own rule `e`, which `a` uses as
            # an alternative to "x". The checks expect 'axa' to produce an `e`
            # subtree under the final `a`, and 'ab' to be rejected.
            nested_grammar = """
                start: (a|b)+
                a: "a" (e|"x")
                b: "b"
                e: $
            """
            nested_parser = _Lark(nested_grammar)

            parsed = nested_parser.parse('axa')
            first_a = Tree('a', [])
            last_a = Tree('a', [Tree('e', [])])
            self.assertEqual(parsed, Tree('start', [first_a, last_a]))

            with self.assertRaises(UnexpectedInput):
                nested_parser.parse('ab')

        @unittest.skipIf(PARSER!='lalr' or LEXER=='custom', "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)")
        def test_serialize(self):