@@ -1,6 +1,7 @@
 from .utils import Serialize

 ###{standalone

+END = '__$END$__'

 class Symbol(Serialize):
     __slots__ = ('name',)
@@ -14,7 +14,7 @@ from .lexer import Token, TerminalDef, PatternStr, PatternRE
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import ParsingFrontend
 from .common import LexerConf, ParserConf
-from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
+from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, END
 from .utils import classify, suppress, dedup_list, Str
 from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError
@@ -99,6 +99,7 @@ TERMINALS = {
     '_EXTEND': r'%extend',
     '_IMPORT': r'%import',
     'NUMBER': r'[+-]?\d+',
+    '_END': r'\$',
 }

 RULES = {
@@ -135,6 +136,7 @@ RULES = {
              'nonterminal',
              'literal',
              'range',
+             'end',
              'template_usage'],

     'terminal': ['TERMINAL'],
| @@ -144,6 +146,7 @@ RULES = { | |||
| 'maybe': ['_LBRA expansions _RBRA'], | |||
| 'range': ['STRING _DOTDOT STRING'], | |||
| 'end': ['_END'], | |||
| 'template_usage': ['RULE _LBRACE _template_args _RBRACE'], | |||
| '_template_args': ['value', | |||
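
Note (illustration, not part of the diff): with the `_END` terminal and the `end` rule wired into the grammar of grammars, end-of-input can be referenced as `$` inside any expansion. A minimal sketch, assuming a lark build that includes this patch:

    from lark import Lark

    # `$` matches only at end of input, so `a` must be the last thing parsed.
    grammar = """
    start: a b?
    a: "a" $
    b: "b"
    """

    parser = Lark(grammar, parser='lalr')
    print(parser.parse('a'))   # Tree('start', [Tree('a', [])])
    # parser.parse('ab') raises UnexpectedInput: `a` already claimed
    # end-of-input, so nothing may follow it.

This mirrors `test_end_symbol` in the test hunk at the bottom of the diff.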
@@ -791,6 +794,9 @@ class PrepareGrammar(Transformer_InPlace):
     def nonterminal(self, name):
         return name

+    def end(self):
+        return Token('TERMINAL', END)
+

 def _find_used_symbols(tree):
     assert tree.data == 'expansions'
@@ -938,6 +944,8 @@ class GrammarBuilder:
         self._definitions = {}
         self._ignore_names = []

+        self._definitions[END] = ((), Tree('expansions', []), self._check_options(END, None))
+
     def _is_term(self, name):
         # Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME`
         # Only the last part is the actual name, and the rest might contain mixed case
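
Note (inference, not stated in the diff): pre-registering the END sentinel in `_definitions` with an empty `expansions` tree lets the later undefined-symbol validation accept grammars that mention `$` without the user ever declaring it. Roughly, assuming `GrammarBuilder`'s default constructor:

    from lark.load_grammar import GrammarBuilder
    from lark.grammar import END

    builder = GrammarBuilder()
    # END ('__$END$__') is now always a known definition, behaving like a
    # built-in terminal with no expansions of its own.
    assert END in builder._definitions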
@@ -2,7 +2,7 @@ from collections import Counter, defaultdict

 from ..utils import bfs, fzset, classify
 from ..exceptions import GrammarError
-from ..grammar import Rule, Terminal, NonTerminal
+from ..grammar import Rule, Terminal, NonTerminal, END


 class RulePtr(object):
@@ -125,7 +125,7 @@ class GrammarAnalyzer(object):
     def __init__(self, parser_conf, debug=False):
         self.debug = debug

-        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
+        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal(END)])
                       for start in parser_conf.start}

         rules = parser_conf.rules + list(root_rules.values())
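
Note (context, not part of the diff): this is the standard LR augmentation; every start symbol gets a synthetic root rule of the form `$root_start: start END`. The change is only that the end terminal now uses the shared END sentinel, so an explicit `$` in a user rule and the implicit end marker in the root rule resolve to the very same `Terminal`. A sketch using the names imported above:

    from lark.grammar import Rule, Terminal, NonTerminal, END

    start = 'start'   # hypothetical start-symbol name
    root = Rule(NonTerminal('$root_' + start),
                [NonTerminal(start), Terminal(END)])
    # Terminal(END) == Terminal('__$END$__') -- identical to what a `$`
    # in grammar source produces, which is what lets user rules take part
    # in end-of-input decisions inside the LALR automaton.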
@@ -12,7 +12,7 @@ from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger
 from ..exceptions import GrammarError

 from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet
-from ..grammar import Rule
+from ..grammar import Rule, END

 ###{standalone
@@ -177,7 +177,7 @@ class LALR_Analyzer(GrammarAnalyzer):
             assert(len(root.kernel) == 1)
             for rp in root.kernel:
                 assert(rp.index == 0)
-                self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])
+                self.directly_reads[(root, rp.next)] = set([ Terminal(END) ])

         for state in self.lr0_states:
             seen = set()
@@ -4,6 +4,7 @@ from copy import copy

 from .. import Token
 from ..exceptions import UnexpectedToken
+from ..grammar import END


 class InteractiveParser(object):
@@ -21,18 +22,18 @@ class InteractiveParser(object):
         Note that ``token`` has to be an instance of ``Token``.
         """
-        return self.parser_state.feed_token(token, token.type == '$END')
+        return self.parser_state.feed_token(token, token.type == END)

     def exhaust_lexer(self):
         """Try to feed the rest of the lexer state into the interactive parser.

-        Note that this modifies the instance in place and does not feed an '$END' Token"""
+        Note that this modifies the instance in place and does not feed an END Token"""
         for token in self.lexer_state.lex(self.parser_state):
             self.parser_state.feed_token(token)

     def feed_eof(self, last_token=None):
-        """Feed a '$END' Token. Borrows from 'last_token' if given."""
-        eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else Token('$END', '', 0, 1, 1)
+        """Feed an END Token. Borrows from 'last_token' if given."""
+        eof = Token.new_borrow_pos(END, '', last_token) if last_token is not None else Token(END, '', 0, 1, 1)
         return self.feed_token(eof)
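
Note (usage sketch, not part of the diff; assumes the standard interactive-parser entry point):

    from lark import Lark

    parser = Lark('start: "a"+', parser='lalr')
    ip = parser.parse_interactive('aaa')
    ip.exhaust_lexer()      # feeds every lexed token, but never END
    tree = ip.feed_eof()    # synthesizes the END token and completes the parse
    print(tree)             # Tree('start', [])  (anonymous tokens are filtered)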
@@ -116,7 +117,7 @@ class ImmutableInteractiveParser(InteractiveParser):
     def exhaust_lexer(self):
         """Try to feed the rest of the lexer state into the parser.

-        Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token"""
+        Note that this returns a new ImmutableInteractiveParser and does not feed an END Token"""
         cursor = self.as_mutable()
         cursor.exhaust_lexer()
         return cursor.as_immutable()
@@ -10,6 +10,7 @@ from ..utils import Serialize
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 from .lalr_interactive_parser import InteractiveParser
 from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
+from ..grammar import END

 ###{standalone
@@ -60,7 +61,7 @@ class LALR_Parser(Serialize):
                 return e.interactive_parser.resume_parse()
             except UnexpectedToken as e2:
                 if (isinstance(e, UnexpectedToken)
-                        and e.token.type == e2.token.type == '$END'
+                        and e.token.type == e2.token.type == END
                         and e.interactive_parser == e2.interactive_parser):
                     # Prevent infinite loop
                     raise e2
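
Note (illustration of the guard, not part of the diff): without this check, an `on_error` callback that always resumes would loop forever when the parse keeps failing on end of input at the same parser state. A small sketch, assuming a patched lark:

    from lark import Lark
    from lark.exceptions import UnexpectedInput

    parser = Lark('start: "a" "b"', parser='lalr')

    def keep_going(e):
        return True   # always claim the error was handled and resume

    # Input ends before "b": every resume fails again with an END token
    # from the same interactive-parser state, so the guard re-raises
    # instead of retrying forever.
    try:
        parser.parse('a', on_error=keep_going)
    except UnexpectedInput:
        print('gave up cleanly instead of looping')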
@@ -132,7 +133,7 @@ class ParserState(object):
             if action is Shift:
                 # shift once and return
-                assert not is_end
+                # assert not is_end
                 state_stack.append(arg)
                 value_stack.append(token if token.type not in callbacks else callbacks[token.type](token))
                 return
@@ -178,8 +179,11 @@ class _Parser(object):
             for token in state.lexer.lex(state):
                 state.feed_token(token)

-            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
-            return state.feed_token(token, True)
+            token = Token.new_borrow_pos(END, '', token) if token else Token(END, '', 0, 1, 1)
+            while True:
+                x = state.feed_token(token, True)
+                if x is not None:
+                    return x
         except UnexpectedInput as e:
             try:
                 e.interactive_parser = InteractiveParser(self, state, state.lexer)
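
Note (reasoning inferred from the Shift-branch change above): now that END can be shifted like an ordinary terminal, `feed_token(token, True)` may consume the synthesized END token for a user-level `$` and return None instead of the finished tree. The loop therefore feeds the same END token again until the accepting reduction at the root returns a value. For example, under this patch:

    from lark import Lark

    # `$` is consumed by `start` itself, yet the synthetic root rule
    # (`$root_start: start <END>`) still needs END afterwards, so the
    # END token is fed more than once before the parse accepts.
    parser = Lark('start: "a" $', parser='lalr')
    print(parser.parse('a'))   # Tree('start', [])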
@@ -2467,6 +2467,43 @@ def _make_parser_test(LEXER, PARSER):
             s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
             tree = g.parse(s, on_error=ignore_errors)

+        @unittest.skipIf(PARSER != 'lalr', "Using the end symbol currently works for LALR only")
+        def test_end_symbol(self):
+            grammar = """
+                start: a b?
+                a: "a" $
+                b: "b"
+            """
+            parser = _Lark(grammar)
+
+            self.assertEqual(parser.parse('a'), Tree('start', [Tree('a', [])]))
+            self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
+        @unittest.skipIf(PARSER != 'lalr', "Using the end symbol currently works for LALR only")
+        def test_end_symbol2(self):
+            grammar = """
+                start: (a|b)+
+                a: "a" ("x"|$)
+                b: "b"
+            """
+            parser = _Lark(grammar)
+
+            self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []), Tree('a', [])]))
+            self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
+        @unittest.skipIf(PARSER != 'lalr', "Using the end symbol currently works for LALR only")
+        def test_end_symbol3(self):
+            grammar = """
+                start: (a|b)+
+                a: "a" (e|"x")
+                b: "b"
+                e: $
+            """
+            parser = _Lark(grammar)
+
+            self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []), Tree('a', [Tree('e', [])])]))
+            self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
     _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
     _TestParser.__name__ = _NAME