
Reimplementation of end symbol (Issue #237)

remotes/origin/gm/2021-09-23T00Z/github.com--lark-parser-lark/end_symbol_2021
Erez Sh, 3 years ago
parent
commit 51cde70929
7 files changed, 65 insertions(+), 14 deletions(-)
  1. lark/grammar.py  +1 -0
  2. lark/load_grammar.py  +9 -1
  3. lark/parsers/grammar_analysis.py  +2 -2
  4. lark/parsers/lalr_analysis.py  +2 -2
  5. lark/parsers/lalr_interactive_parser.py  +6 -5
  6. lark/parsers/lalr_parser.py  +8 -4
  7. tests/test_parser.py  +37 -0
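In effect, this commit adds an explicit end-of-input symbol, `$`, to the grammar language. A minimal usage sketch, adapted from the tests added below (LALR parser only, and only on a build that includes this change):

    from lark import Lark

    grammar = """
    start: a b?
    a: "a" $
    b: "b"
    """

    parser = Lark(grammar, parser='lalr')
    parser.parse('a')     # ok: the 'a' sits at the end of the input
    # parser.parse('ab')  # raises UnexpectedInput: rule `a` demands end of input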

lark/grammar.py  +1 -0

@@ -1,6 +1,7 @@
 from .utils import Serialize

 ###{standalone
+END = '__$END$__'

 class Symbol(Serialize):
     __slots__ = ('name',)

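The sentinel is a plain string, so callers can compare token types against it directly instead of hard-coding '$END'. A trivial sketch; `is_eof` is a hypothetical helper, not part of lark:

    from lark.grammar import END

    def is_eof(token):
        # A Token's .type is a plain string, so it compares directly to the sentinel.
        return token.type == END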

lark/load_grammar.py  +9 -1

@@ -14,7 +14,7 @@ from .lexer import Token, TerminalDef, PatternStr, PatternRE
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import ParsingFrontend
 from .common import LexerConf, ParserConf
-from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
+from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, END
 from .utils import classify, suppress, dedup_list, Str
 from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError

@@ -99,6 +99,7 @@ TERMINALS = {
     '_EXTEND': r'%extend',
     '_IMPORT': r'%import',
     'NUMBER': r'[+-]?\d+',
+    '_END': r'\$',
 }

@@ -135,6 +136,7 @@ RULES = {
              'nonterminal',
              'literal',
              'range',
+             'end',
              'template_usage'],

     'terminal': ['TERMINAL'],

@@ -144,6 +146,7 @@ RULES = {

     'maybe': ['_LBRA expansions _RBRA'],
     'range': ['STRING _DOTDOT STRING'],
+    'end': ['_END'],

     'template_usage': ['RULE _LBRACE _template_args _RBRACE'],
     '_template_args': ['value',

@@ -791,6 +794,9 @@ class PrepareGrammar(Transformer_InPlace):
     def nonterminal(self, name):
         return name

+    def end(self):
+        return Token('TERMINAL', END)
+

 def _find_used_symbols(tree):
     assert tree.data == 'expansions'

@@ -938,6 +944,8 @@ class GrammarBuilder:
         self._definitions = {}
         self._ignore_names = []

+        self._definitions[END] = ((), Tree('expansions', []), self._check_options(END, None))
+
     def _is_term(self, name):
         # Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME`
         # Only the last part is the actual name, and the rest might contain mixed case

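Taken together, these hunks teach the grammar-of-grammars about `$`: the lexer matches it as _END, the new `end` rule accepts it, PrepareGrammar rewrites it into a TERMINAL token carrying the sentinel, and GrammarBuilder pre-registers END with an empty definition so no pattern is ever required for it. A rough illustration of the rewrite step (lark's Token subclasses str):

    from lark.grammar import END
    from lark.lexer import Token

    tok = Token('TERMINAL', END)   # what PrepareGrammar.end() returns for a `$`
    assert tok == '__$END$__'      # Token subclasses str, so it equals its value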

lark/parsers/grammar_analysis.py  +2 -2

@@ -2,7 +2,7 @@ from collections import Counter, defaultdict

 from ..utils import bfs, fzset, classify
 from ..exceptions import GrammarError
-from ..grammar import Rule, Terminal, NonTerminal
+from ..grammar import Rule, Terminal, NonTerminal, END


 class RulePtr(object):

@@ -125,7 +125,7 @@ class GrammarAnalyzer(object):
     def __init__(self, parser_conf, debug=False):
         self.debug = debug

-        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
+        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal(END)])
                       for start in parser_conf.start}

         rules = parser_conf.rules + list(root_rules.values())

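For each start symbol, the analyzer already augments the grammar with a synthetic root rule whose last item is the end terminal; the change is only that the terminal is now built from the shared sentinel. A sketch of the augmented rule for a start symbol named `start`:

    from lark.grammar import Rule, NonTerminal, Terminal, END

    # Roughly:  $root_start: start <END>
    root = Rule(NonTerminal('$root_start'), [NonTerminal('start'), Terminal(END)])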

lark/parsers/lalr_analysis.py  +2 -2

@@ -12,7 +12,7 @@ from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger
 from ..exceptions import GrammarError

 from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet
-from ..grammar import Rule
+from ..grammar import Rule, END

 ###{standalone

@@ -177,7 +177,7 @@ class LALR_Analyzer(GrammarAnalyzer):
         assert(len(root.kernel) == 1)
         for rp in root.kernel:
             assert(rp.index == 0)
-            self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])
+            self.directly_reads[(root, rp.next)] = set([ Terminal(END) ])

         for state in self.lr0_states:
             seen = set()


lark/parsers/lalr_interactive_parser.py  +6 -5

@@ -4,6 +4,7 @@ from copy import copy

 from .. import Token
 from ..exceptions import UnexpectedToken
+from ..grammar import END


 class InteractiveParser(object):

@@ -21,18 +22,18 @@ class InteractiveParser(object):

         Note that ``token`` has to be an instance of ``Token``.
         """
-        return self.parser_state.feed_token(token, token.type == '$END')
+        return self.parser_state.feed_token(token, token.type == END)

     def exhaust_lexer(self):
         """Try to feed the rest of the lexer state into the interactive parser.

-        Note that this modifies the instance in place and does not feed an '$END' Token"""
+        Note that this modifies the instance in place and does not feed an END Token"""
         for token in self.lexer_state.lex(self.parser_state):
             self.parser_state.feed_token(token)

     def feed_eof(self, last_token=None):
-        """Feed a '$END' Token. Borrows from 'last_token' if given."""
-        eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else Token('$END', '', 0, 1, 1)
+        """Feed an END Token. Borrows from 'last_token' if given."""
+        eof = Token.new_borrow_pos(END, '', last_token) if last_token is not None else Token(END, '', 0, 1, 1)
         return self.feed_token(eof)


@@ -116,7 +117,7 @@ class ImmutableInteractiveParser(InteractiveParser):
     def exhaust_lexer(self):
         """Try to feed the rest of the lexer state into the parser.

-        Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token"""
+        Note that this returns a new ImmutableInteractiveParser and does not feed an END Token"""
         cursor = self.as_mutable()
         cursor.exhaust_lexer()
         return cursor.as_immutable()

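A usage sketch for the interactive API after this change, assuming `parse_interactive` (present in lark releases of this period):

    from lark import Lark

    parser = Lark('start: "a"+', parser='lalr')
    ip = parser.parse_interactive('aaa')
    ip.exhaust_lexer()    # feeds every lexed token, but never the end token
    tree = ip.feed_eof()  # builds the END token itself and completes the parse
    print(tree)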

lark/parsers/lalr_parser.py  +8 -4

@@ -10,6 +10,7 @@ from ..utils import Serialize
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 from .lalr_interactive_parser import InteractiveParser
 from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
+from ..grammar import END

 ###{standalone

@@ -60,7 +61,7 @@ class LALR_Parser(Serialize):
                 return e.interactive_parser.resume_parse()
             except UnexpectedToken as e2:
                 if (isinstance(e, UnexpectedToken)
-                    and e.token.type == e2.token.type == '$END'
+                    and e.token.type == e2.token.type == END
                     and e.interactive_parser == e2.interactive_parser):
                     # Prevent infinite loop
                     raise e2

@@ -132,7 +133,7 @@ class ParserState(object):

             if action is Shift:
                 # shift once and return
-                assert not is_end
+                # assert not is_end
                 state_stack.append(arg)
                 value_stack.append(token if token.type not in callbacks else callbacks[token.type](token))
                 return

@@ -178,8 +179,11 @@ class _Parser(object):
             for token in state.lexer.lex(state):
                 state.feed_token(token)

-            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
-            return state.feed_token(token, True)
+            token = Token.new_borrow_pos(END, '', token) if token else Token(END, '', 0, 1, 1)
+            while True:
+                x = state.feed_token(token, True)
+                if x is not None:
+                    return x
         except UnexpectedInput as e:
             try:
                 e.interactive_parser = InteractiveParser(self, state, state.lexer)

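The interesting change is the `while True` loop: now that `$` can appear inside rules, END is an ordinary terminal the parser may Shift (hence the commented-out assert), so a single feed of the end token can return None mid-parse; the driver keeps feeding the same END token until the root rule finally reduces. The second test below exercises exactly this case, sketched here standalone (again, only on a build with this change):

    from lark import Lark

    # `$` appears inside rule `a`, so the END token is shifted there first,
    # and only a later feed of END completes the parse.
    parser = Lark("""
    start: (a|b)+
    a: "a" ("x"|$)
    b: "b"
    """, parser='lalr')
    print(parser.parse('axa'))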

tests/test_parser.py  +37 -0

@@ -2467,6 +2467,43 @@ def _make_parser_test(LEXER, PARSER):
         s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
         tree = g.parse(s, on_error=ignore_errors)

+    @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+    def test_end_symbol(self):
+        grammar = """
+            start: a b?
+            a: "a" $
+            b: "b"
+        """
+        parser = _Lark(grammar)
+
+        self.assertEqual(parser.parse('a'), Tree('start', [Tree('a', [])]))
+        self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
+    @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+    def test_end_symbol2(self):
+        grammar = """
+            start: (a|b)+
+            a: "a" ("x"|$)
+            b: "b"
+        """
+        parser = _Lark(grammar)
+
+        self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []), Tree('a', [])]))
+        self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
+    @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+    def test_end_symbol3(self):
+        grammar = """
+            start: (a|b)+
+            a: "a" (e|"x")
+            b: "b"
+            e: $
+        """
+        parser = _Lark(grammar)
+
+        self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []), Tree('a', [Tree('e', [])])]))
+        self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+

     _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
     _TestParser.__name__ = _NAME

