Browse Source

Reimplementation of end symbol (Issue #237)

remotes/origin/gm/2021-09-23T00Z/github.com--lark-parser-lark/end_symbol_2021
Erez Sh 4 years ago
parent
commit
51cde70929
7 changed files with 65 additions and 14 deletions
  1. +1
    -0
      lark/grammar.py
  2. +9
    -1
      lark/load_grammar.py
  3. +2
    -2
      lark/parsers/grammar_analysis.py
  4. +2
    -2
      lark/parsers/lalr_analysis.py
  5. +6
    -5
      lark/parsers/lalr_interactive_parser.py
  6. +8
    -4
      lark/parsers/lalr_parser.py
  7. +37
    -0
      tests/test_parser.py

+ 1
- 0
lark/grammar.py View File

@@ -1,6 +1,7 @@
from .utils import Serialize from .utils import Serialize


###{standalone ###{standalone
END = '__$END$__'


class Symbol(Serialize): class Symbol(Serialize):
__slots__ = ('name',) __slots__ = ('name',)


+ 9
- 1
lark/load_grammar.py View File

@@ -14,7 +14,7 @@ from .lexer import Token, TerminalDef, PatternStr, PatternRE
from .parse_tree_builder import ParseTreeBuilder from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import ParsingFrontend from .parser_frontends import ParsingFrontend
from .common import LexerConf, ParserConf from .common import LexerConf, ParserConf
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, END
from .utils import classify, suppress, dedup_list, Str from .utils import classify, suppress, dedup_list, Str
from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError


@@ -99,6 +99,7 @@ TERMINALS = {
'_EXTEND': r'%extend', '_EXTEND': r'%extend',
'_IMPORT': r'%import', '_IMPORT': r'%import',
'NUMBER': r'[+-]?\d+', 'NUMBER': r'[+-]?\d+',
'_END': r'\$',
} }


RULES = { RULES = {
@@ -135,6 +136,7 @@ RULES = {
'nonterminal', 'nonterminal',
'literal', 'literal',
'range', 'range',
'end',
'template_usage'], 'template_usage'],


'terminal': ['TERMINAL'], 'terminal': ['TERMINAL'],
@@ -144,6 +146,7 @@ RULES = {


'maybe': ['_LBRA expansions _RBRA'], 'maybe': ['_LBRA expansions _RBRA'],
'range': ['STRING _DOTDOT STRING'], 'range': ['STRING _DOTDOT STRING'],
'end': ['_END'],


'template_usage': ['RULE _LBRACE _template_args _RBRACE'], 'template_usage': ['RULE _LBRACE _template_args _RBRACE'],
'_template_args': ['value', '_template_args': ['value',
@@ -791,6 +794,9 @@ class PrepareGrammar(Transformer_InPlace):
def nonterminal(self, name): def nonterminal(self, name):
return name return name


def end(self):
    # Callback for the childless `end` rule (the `$` symbol, matched by the
    # `_END` terminal): substitute a TERMINAL token whose value is the shared
    # END marker imported from the grammar module.
    end_token = Token('TERMINAL', END)
    return end_token



def _find_used_symbols(tree): def _find_used_symbols(tree):
assert tree.data == 'expansions' assert tree.data == 'expansions'
@@ -938,6 +944,8 @@ class GrammarBuilder:
self._definitions = {} self._definitions = {}
self._ignore_names = [] self._ignore_names = []


self._definitions[END] = ((), Tree('expansions', []), self._check_options(END, None))

def _is_term(self, name): def _is_term(self, name):
# Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME` # Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME`
# Only the last part is the actual name, and the rest might contain mixed case # Only the last part is the actual name, and the rest might contain mixed case


+ 2
- 2
lark/parsers/grammar_analysis.py View File

@@ -2,7 +2,7 @@ from collections import Counter, defaultdict


from ..utils import bfs, fzset, classify from ..utils import bfs, fzset, classify
from ..exceptions import GrammarError from ..exceptions import GrammarError
from ..grammar import Rule, Terminal, NonTerminal
from ..grammar import Rule, Terminal, NonTerminal, END




class RulePtr(object): class RulePtr(object):
@@ -125,7 +125,7 @@ class GrammarAnalyzer(object):
def __init__(self, parser_conf, debug=False): def __init__(self, parser_conf, debug=False):
self.debug = debug self.debug = debug


root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal(END)])
for start in parser_conf.start} for start in parser_conf.start}


rules = parser_conf.rules + list(root_rules.values()) rules = parser_conf.rules + list(root_rules.values())


+ 2
- 2
lark/parsers/lalr_analysis.py View File

@@ -12,7 +12,7 @@ from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger
from ..exceptions import GrammarError from ..exceptions import GrammarError


from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet
from ..grammar import Rule
from ..grammar import Rule, END


###{standalone ###{standalone


@@ -177,7 +177,7 @@ class LALR_Analyzer(GrammarAnalyzer):
assert(len(root.kernel) == 1) assert(len(root.kernel) == 1)
for rp in root.kernel: for rp in root.kernel:
assert(rp.index == 0) assert(rp.index == 0)
self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])
self.directly_reads[(root, rp.next)] = set([ Terminal(END) ])


for state in self.lr0_states: for state in self.lr0_states:
seen = set() seen = set()


+ 6
- 5
lark/parsers/lalr_interactive_parser.py View File

@@ -4,6 +4,7 @@ from copy import copy


from .. import Token from .. import Token
from ..exceptions import UnexpectedToken from ..exceptions import UnexpectedToken
from ..grammar import END




class InteractiveParser(object): class InteractiveParser(object):
@@ -21,18 +22,18 @@ class InteractiveParser(object):


Note that ``token`` has to be an instance of ``Token``. Note that ``token`` has to be an instance of ``Token``.
""" """
return self.parser_state.feed_token(token, token.type == '$END')
return self.parser_state.feed_token(token, token.type == END)
def exhaust_lexer(self): def exhaust_lexer(self):
"""Try to feed the rest of the lexer state into the interactive parser. """Try to feed the rest of the lexer state into the interactive parser.
Note that this modifies the instance in place and does not feed an '$END' Token"""
Note that this modifies the instance in place and does not feed an END Token"""
for token in self.lexer_state.lex(self.parser_state): for token in self.lexer_state.lex(self.parser_state):
self.parser_state.feed_token(token) self.parser_state.feed_token(token)
def feed_eof(self, last_token=None): def feed_eof(self, last_token=None):
"""Feed a '$END' Token. Borrows from 'last_token' if given."""
eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else Token('$END', '', 0, 1, 1)
"""Feed a END Token. Borrows from 'last_token' if given."""
eof = Token.new_borrow_pos(END, '', last_token) if last_token is not None else Token(END, '', 0, 1, 1)
return self.feed_token(eof) return self.feed_token(eof)




@@ -116,7 +117,7 @@ class ImmutableInteractiveParser(InteractiveParser):
def exhaust_lexer(self): def exhaust_lexer(self):
"""Try to feed the rest of the lexer state into the parser. """Try to feed the rest of the lexer state into the parser.


Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token"""
Note that this returns a new ImmutableInteractiveParser and does not feed an END Token"""
cursor = self.as_mutable() cursor = self.as_mutable()
cursor.exhaust_lexer() cursor.exhaust_lexer()
return cursor.as_immutable() return cursor.as_immutable()


+ 8
- 4
lark/parsers/lalr_parser.py View File

@@ -10,6 +10,7 @@ from ..utils import Serialize
from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
from .lalr_interactive_parser import InteractiveParser from .lalr_interactive_parser import InteractiveParser
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
from ..grammar import END


###{standalone ###{standalone


@@ -60,7 +61,7 @@ class LALR_Parser(Serialize):
return e.interactive_parser.resume_parse() return e.interactive_parser.resume_parse()
except UnexpectedToken as e2: except UnexpectedToken as e2:
if (isinstance(e, UnexpectedToken) if (isinstance(e, UnexpectedToken)
and e.token.type == e2.token.type == '$END'
and e.token.type == e2.token.type == END
and e.interactive_parser == e2.interactive_parser): and e.interactive_parser == e2.interactive_parser):
# Prevent infinite loop # Prevent infinite loop
raise e2 raise e2
@@ -132,7 +133,7 @@ class ParserState(object):


if action is Shift: if action is Shift:
# shift once and return # shift once and return
assert not is_end
# assert not is_end
state_stack.append(arg) state_stack.append(arg)
value_stack.append(token if token.type not in callbacks else callbacks[token.type](token)) value_stack.append(token if token.type not in callbacks else callbacks[token.type](token))
return return
@@ -178,8 +179,11 @@ class _Parser(object):
for token in state.lexer.lex(state): for token in state.lexer.lex(state):
state.feed_token(token) state.feed_token(token)


token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
return state.feed_token(token, True)
token = Token.new_borrow_pos(END, '', token) if token else Token(END, '', 0, 1, 1)
while True:
x = state.feed_token(token, True)
if x is not None:
return x
except UnexpectedInput as e: except UnexpectedInput as e:
try: try:
e.interactive_parser = InteractiveParser(self, state, state.lexer) e.interactive_parser = InteractiveParser(self, state, state.lexer)


+ 37
- 0
tests/test_parser.py View File

@@ -2467,6 +2467,43 @@ def _make_parser_test(LEXER, PARSER):
s = "[0 1, 2,@, 3,,, 4, 5 6 ]$" s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
tree = g.parse(s, on_error=ignore_errors) tree = g.parse(s, on_error=ignore_errors)


@unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
def test_end_symbol(self):
    # Rule `a` ends with the `$` end symbol; `start` allows an optional `b`
    # after it.
    grammar_text = (
        '\n'
        'start: a b?\n'
        'a: "a" $\n'
        'b: "b"\n'
    )
    end_parser = _Lark(grammar_text)

    # A lone "a" parses to an empty `a` subtree.
    parsed = end_parser.parse('a')
    self.assertEqual(parsed, Tree('start', [Tree('a', [])]))

    # Anything following the "a" must be rejected.
    with self.assertRaises(UnexpectedInput):
        end_parser.parse('ab')

@unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
def test_end_symbol2(self):
    # Here `$` is one alternative inside a group: "a" must be followed
    # either by "x" or by the end symbol.
    grammar_text = (
        '\n'
        'start: (a|b)+\n'
        'a: "a" ("x"|$)\n'
        'b: "b"\n'
    )
    end_parser = _Lark(grammar_text)

    # First `a` takes the "x" branch, second `a` takes the `$` branch.
    parsed = end_parser.parse('axa')
    self.assertEqual(parsed, Tree('start', [Tree('a', []), Tree('a', [])]))

    # "a" followed by "b" satisfies neither branch.
    with self.assertRaises(UnexpectedInput):
        end_parser.parse('ab')

@unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
def test_end_symbol3(self):
    # The end symbol is wrapped in its own rule `e`, which `a` uses as an
    # alternative to "x".
    grammar_text = (
        '\n'
        'start: (a|b)+\n'
        'a: "a" (e|"x")\n'
        'b: "b"\n'
        'e: $\n'
    )
    end_parser = _Lark(grammar_text)

    # The second `a` is expected to contain an empty `e` subtree.
    parsed = end_parser.parse('axa')
    expected_children = [Tree('a', []), Tree('a', [Tree('e', [])])]
    self.assertEqual(parsed, Tree('start', expected_children))

    # "a" followed by "b" must be rejected.
    with self.assertRaises(UnexpectedInput):
        end_parser.parse('ab')



_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
_TestParser.__name__ = _NAME _TestParser.__name__ = _NAME


Loading…
Cancel
Save