diff --git a/lark/grammar.py b/lark/grammar.py
index 14893fb..730b912 100644
--- a/lark/grammar.py
+++ b/lark/grammar.py
@@ -1,6 +1,7 @@
 from .utils import Serialize
 
 ###{standalone
+END = '_END$'
 
 class Symbol(Serialize):
     is_term = NotImplemented
diff --git a/lark/lexer.py b/lark/lexer.py
index 3e881f8..e12195f 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -292,7 +292,7 @@ class TraditionalLexer(Lexer):
             if t.pattern.min_width == 0:
                 raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
 
-        assert set(ignore) <= {t.name for t in terminals}
+        assert set(ignore) <= {t.name for t in terminals}, (ignore, terminals)
 
         # Init
         self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index f7b1011..8d50e0a 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -11,7 +11,7 @@
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR_TraditionalLexer
 from .common import LexerConf, ParserConf
-from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
+from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, END
 from .utils import classify, suppress, dedup_list
 from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
@@ -91,6 +91,7 @@ TERMINALS = {
     '_DECLARE': r'%declare',
     '_IMPORT': r'%import',
     'NUMBER': r'\d+',
+    '_END': r'\$',
 }
 
 RULES = {
@@ -122,7 +123,8 @@ RULES = {
     'value': ['terminal',
               'nonterminal',
               'literal',
-              'range'],
+              'range',
+              'end'],
 
     'terminal': ['TERMINAL'],
     'nonterminal': ['RULE'],
@@ -131,6 +133,7 @@ RULES = {
     'maybe': ['_LBRA expansions _RBRA'],
     'range': ['STRING _DOT _DOT STRING'],
+    'end': ['_END'],
 
     'term': ['TERMINAL _COLON expansions _NL',
              'TERMINAL _DOT NUMBER _COLON expansions _NL'],
 
@@ -285,6 +288,9 @@ class CanonizeTree(Transformer_InPlace):
         tokenmods, value = args
         return tokenmods + [value]
 
+    def end(self):
+        return Token('TERMINAL', END)
+
 class PrepareAnonTerminals(Transformer_InPlace):
     "Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them"
 
@@ -735,6 +741,7 @@ class GrammarLoader:
 
         term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs]
         term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs]
+        term_defs.append((END, (None, 0)))
         rule_defs = [options_from_rule(*x) for x in rule_defs]
 
         # Execute statements
@@ -827,7 +834,7 @@ class GrammarLoader:
                 raise GrammarError("Terminal '%s' defined more than once" % name)
             terminal_names.add(name)
 
-        if set(ignore_names) > terminal_names:
+        if set(ignore_names) - terminal_names:
             raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(ignore_names) - terminal_names))
 
         resolve_term_references(term_defs)
diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py
index 086349c..803b935 100644
--- a/lark/parsers/grammar_analysis.py
+++ b/lark/parsers/grammar_analysis.py
@@ -2,7 +2,7 @@ from collections import Counter
 
 from ..utils import bfs, fzset, classify
 from ..exceptions import GrammarError
-from ..grammar import Rule, Terminal, NonTerminal
+from ..grammar import Rule, Terminal, NonTerminal, END
 
 
 class RulePtr(object):
@@ -109,7 +109,7 @@ class GrammarAnalyzer(object):
     def __init__(self, parser_conf, debug=False):
         self.debug = debug
 
-        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
+        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal(END)])
                       for start in parser_conf.start}
 
         rules = parser_conf.rules + list(root_rules.values())
diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py
index eef1f9b..9c02ca2 100644
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -13,7 +13,7 @@
 from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
 from ..exceptions import GrammarError
 
 from .grammar_analysis import GrammarAnalyzer, Terminal
-from ..grammar import Rule
+from ..grammar import Rule, END
 
 ###{standalone
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 39dd5f3..7444a74 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -5,6 +5,7 @@
 from ..exceptions import UnexpectedToken
 from ..lexer import Token
 from ..utils import Enumerator, Serialize
+from ..grammar import END
 
 from .lalr_analysis import LALR_Analyzer, Shift, IntParseTable
 
@@ -94,13 +95,14 @@ class _Parser:
             else:
                 reduce(arg)
 
-        token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+        token = Token.new_borrow_pos(END, None, token) if token else Token(END, None, 0, 1, 1)
         while True:
            _action, arg = get_action(token)
            if _action is Shift:
-                assert arg == end_state
-                val ,= value_stack
-                return val
+                if arg == end_state:
+                    val ,= value_stack
+                    return val
+                state_stack.append(arg)
            else:
                reduce(arg)
 
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 3238ead..9a902b8 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1505,6 +1505,18 @@ def _make_parser_test(LEXER, PARSER):
         """
         parser = _Lark(grammar)
 
+        @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+        def test_end_symbol(self):
+            grammar = """
+            start: a b?
+            a: "a" $
+            b: "b"
+            """
+            parser = _Lark(grammar)
+
+            self.assertEqual(parser.parse('a'), Tree('start', [Tree('a', [])]))
+            self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
         @unittest.skipIf(PARSER!='lalr', "Serialize currently only works for LALR parsers (though it should be easy to extend)")
         def test_serialize(self):
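
Usage sketch, mirroring the added test_end_symbol: the new `$` symbol marks explicit end-of-input inside a rule. This assumes the public `Lark` constructor with `parser='lalr'`, the only backend this patch wires `$` through:

    from lark import Lark, Tree, UnexpectedInput

    # `$` after "a" demands end-of-input, so rule `b` can never follow rule `a`.
    parser = Lark("""
        start: a b?
        a: "a" $
        b: "b"
    """, parser='lalr')

    assert parser.parse('a') == Tree('start', [Tree('a', [])])

    try:
        parser.parse('ab')   # rule `a` already consumed the end symbol
    except UnexpectedInput:
        pass                 # expected failure

This works because the parser loop no longer asserts that shifting the END token lands in the accept state: when `$` appears in a rule, END is shifted like an ordinary terminal (`state_stack.append(arg)`), and the parse only returns once the accept state is actually reached.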