--- a/lark/lark.py
+++ b/lark/lark.py
@@ -1,4 +1,5 @@
from __future__ import absolute_import
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
import sys, os, pickle, hashlib
from io import open
@@ -9,7 +10,7 @@ from .load_grammar import load_grammar
from .tree import Tree
from .common import LexerConf, ParserConf
from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken
from .lexer import Lexer, TraditionalLexer, TerminalDef
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend, _get_lexer_callbacks
from .grammar import Rule
@@ -462,20 +463,32 @@ class Lark(Serialize):
        try:
            return self.parser.parse(text, start=start)
        except UnexpectedToken as e:
        except UnexpectedInput as e:
            if on_error is None:
                raise
            while True:
                if isinstance(e, UnexpectedCharacters):
                    s = e.puppet.lexer_state.state
                    p = s.line_ctr.char_pos
                if not on_error(e):
                    raise e
                if isinstance(e, UnexpectedCharacters):
                    # If the user didn't change the character position, then we should advance past the bad character ourselves
                    if p == s.line_ctr.char_pos:
                        s.line_ctr.feed(s.text[p:p+1])
                try:
                    return e.puppet.resume_parse()
                except UnexpectedToken as e2:
                    if e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
                    if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
                        # Prevent infinite loop
                        raise e2
                    e = e2
                except UnexpectedCharacters as e2:
                    e = e2
###}
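
A minimal usage sketch of the new on_error hook (not part of the patch; it assumes a LALR parser and mirrors the test added at the end of this diff). Returning True from the callback makes parse() resume through e.puppet.resume_parse(); returning False re-raises the error:

    from lark import Lark, Token
    from lark.exceptions import UnexpectedCharacters

    parser = Lark(r'''
        start: "[" num ("," num)* "]"
        ?num: SIGNED_NUMBER
        %import common.SIGNED_NUMBER
        %ignore " "
    ''', parser='lalr')

    def ignore_errors(e):
        if isinstance(e, UnexpectedCharacters):
            return True                               # skip the bad character and resume
        if e.token.type == 'COMMA':
            return True                               # drop the stray comma and resume
        if e.token.type == 'SIGNED_NUMBER':
            e.puppet.feed_token(Token('COMMA', ','))  # inject the missing separator...
            e.puppet.feed_token(e.token)              # ...then replay the number that failed
            return True
        return False                                  # unhandled: parse() re-raises

    tree = parser.parse("[0 1, 2,, 3]", on_error=ignore_errors)
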
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -2,7 +2,7 @@
import re
from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .utils import Str, classify, get_regexp_width, Py36, Serialize, suppress
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
###{standalone
@@ -157,6 +157,8 @@ class Token(Str):
class LineCounter:
    __slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char'
    def __init__(self, newline_char):
        self.newline_char = newline_char
        self.char_pos = 0
@@ -167,7 +169,7 @@ class LineCounter:
    def feed(self, token, test_newline=True):
        """Consume a token and calculate the new line & column.
        As an optional optimization, set test_newline=False is token doesn't contain a newline.
        As an optional optimization, set test_newline=False if token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
@@ -178,49 +180,6 @@ class LineCounter:
        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos + 1
class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer, state=None):
        self.lexer = lexer
        self.state = state
    def lex(self, stream, newline_types, ignore_types):
        newline_types = frozenset(newline_types)
        ignore_types = frozenset(ignore_types)
        line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
        last_token = None
        while line_ctr.char_pos < len(stream):
            lexer = self.lexer
            res = lexer.match(stream, line_ctr.char_pos)
            if not res:
                allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types
                if not allowed:
                    allowed = {"<END-OF-FILE>"}
                raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])
            value, type_ = res
            if type_ not in ignore_types:
                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                line_ctr.feed(value, type_ in newline_types)
                t.end_line = line_ctr.line
                t.end_column = line_ctr.column
                t.end_pos = line_ctr.char_pos
                if t.type in lexer.callback:
                    t = lexer.callback[t.type](t)
                    if not isinstance(t, Token):
                        raise ValueError("Callbacks must return a token (returned %r)" % t)
                yield t
                last_token = t
            else:
                if type_ in lexer.callback:
                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                    lexer.callback[type_](t2)
                line_ctr.feed(value, type_ in newline_types)
class UnlessCallback:
    def __init__(self, mres):
@@ -286,7 +245,6 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes)
        except AssertionError: # Yes, this is what Python provides us.. :/
            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)
        # terms_from_name = {t.name: t for t in terminals[:max_size]}
        mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
        terminals = terminals[max_size:]
    return mres
@@ -308,10 +266,14 @@ class Lexer(object):
    """Lexer interface
    Method Signatures:
        lex(self, stream) -> Iterator[Token]
        lex(self, text) -> Iterator[Token]
    """
    lex = NotImplemented
    def make_lexer_state(self, text):
        line_ctr = LineCounter(b'\n' if isinstance(text, bytes) else '\n')
        return LexerState(text, line_ctr)
class TraditionalLexer(Lexer):
@@ -335,8 +297,8 @@ class TraditionalLexer(Lexer):
        assert set(conf.ignore) <= {t.name for t in terminals}
        # Init
        self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
        self.ignore_types = list(conf.ignore)
        self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
        self.ignore_types = frozenset(conf.ignore)
        terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
        self.terminals = terminals
@@ -345,7 +307,6 @@ class TraditionalLexer(Lexer):
        self.use_bytes = conf.use_bytes
        self._mres = None
        # self.build(g_regex_flags)
    def _build(self):
        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
@@ -366,17 +327,61 @@ class TraditionalLexer(Lexer):
            self._build()
        return self._mres
    def match(self, stream, pos):
    def match(self, text, pos):
        for mre, type_from_index in self.mres:
            m = mre.match(stream, pos)
            m = mre.match(text, pos)
            if m:
                return m.group(0), type_from_index[m.lastindex]
    def lex(self, stream):
        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
    def lex(self, state, parser_state):
        with suppress(EOFError):
            while True:
                yield self.next_token(state)
    def next_token(self, lex_state):
        line_ctr = lex_state.line_ctr
        while line_ctr.char_pos < len(lex_state.text):
            res = self.match(lex_state.text, line_ctr.char_pos)
            if not res:
                allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
                if not allowed:
                    allowed = {"<END-OF-FILE>"}
                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
            value, type_ = res
            if type_ not in self.ignore_types:
                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                line_ctr.feed(value, type_ in self.newline_types)
                t.end_line = line_ctr.line
                t.end_column = line_ctr.column
                t.end_pos = line_ctr.char_pos
                if t.type in self.callback:
                    t = self.callback[t.type](t)
                    if not isinstance(t, Token):
                        raise ValueError("Callbacks must return a token (returned %r)" % t)
                lex_state.last_token = t
                return t
            else:
                if type_ in self.callback:
                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                    self.callback[type_](t2)
                line_ctr.feed(value, type_ in self.newline_types)
        # EOF
        raise EOFError(self)
class LexerState:
    __slots__ = 'text', 'line_ctr', 'last_token'
    def __init__(self, text, line_ctr, last_token=None):
        self.text = text
        self.line_ctr = line_ctr
        self.last_token = last_token
    def __copy__(self):
        return type(self)(self.text, copy(self.line_ctr), self.last_token)
class ContextualLexer(Lexer):
@@ -409,25 +414,29 @@ class ContextualLexer(Lexer):
        assert trad_conf.tokens is terminals
        self.root_lexer = TraditionalLexer(trad_conf)
    def lex(self, stream, get_parser_state):
        parser_state = get_parser_state()
        l = _Lex(self.lexers[parser_state], parser_state)
    def make_lexer_state(self, text):
        return self.root_lexer.make_lexer_state(text)
    def lex(self, lexer_state, parser_state):
        try:
            for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
                yield x
                parser_state = get_parser_state()
                l.lexer = self.lexers[parser_state]
                l.state = parser_state # For debug only, no need to worry about multithreading
            while True:
                lexer = self.lexers[parser_state.position]
                yield lexer.next_token(lexer_state)
        except EOFError:
            pass
        except UnexpectedCharacters as e:
            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
            # but not in the current context.
            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
            # This tests the input against the global context, to provide a nicer error.
            root_match = self.root_lexer.match(stream, e.pos_in_stream)
            if not root_match:
                raise
            token = self.root_lexer.next_token(lexer_state)
            raise UnexpectedToken(token, e.allowed, state=parser_state.position)
            value, type_ = root_match
            t = Token(type_, value, e.pos_in_stream, e.line, e.column)
            raise UnexpectedToken(t, e.allowed, state=e.state)
class LexerThread:
    "A thread that ties a lexer instance and a lexer state, to be used by the parser"
    def __init__(self, lexer, text):
        self.lexer = lexer
        self.state = lexer.make_lexer_state(text)
    def lex(self, parser_state):
        return self.lexer.lex(self.state, parser_state)
###}
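
A quick sketch of the new lexer/parser handshake (not part of the patch; lexer_conf is assumed to be an existing LexerConf). The frontend builds a LexerThread, which owns the LexerState, and the parser pulls tokens from it while passing its own state back in so the ContextualLexer can select the right per-state sub-lexer:

    from lark.lexer import TraditionalLexer, LexerThread

    lexer = TraditionalLexer(lexer_conf)       # lexer_conf: assumed to exist
    thread = LexerThread(lexer, "input text")  # make_lexer_state() wraps the text in a LexerState
    for tok in thread.lex(None):               # TraditionalLexer ignores the parser_state argument
        print(tok.type, tok.line, tok.column)
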
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,6 +1,6 @@
from .utils import get_regexp_width, Serialize
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .grammar import Rule
@@ -23,12 +23,22 @@ def get_frontend(parser, lexer):
        elif lexer == 'contextual':
            return LALR_ContextualLexer
        elif issubclass(lexer, Lexer):
            class CustomLexerWrapper(Lexer):
                def __init__(self, lexer_conf):
                    self.lexer = lexer(lexer_conf)
                def lex(self, lexer_state, parser_state):
                    return self.lexer.lex(lexer_state.text)
            class LALR_CustomLexerWrapper(LALR_CustomLexer):
                def __init__(self, lexer_conf, parser_conf, options=None):
                    super(LALR_CustomLexerWrapper, self).__init__(
                        lexer, lexer_conf, parser_conf, options=options)
                def init_lexer(self):
                    self.lexer = lexer(self.lexer_conf)
                    future_interface = getattr(lexer, '__future_interface__', False)
                    if future_interface:
                        self.lexer = lexer(self.lexer_conf)
                    else:
                        self.lexer = CustomLexerWrapper(self.lexer_conf)
            return LALR_CustomLexerWrapper
        else:
@@ -54,7 +64,7 @@ def get_frontend(parser, lexer):
class _ParserFrontend(Serialize):
    def _parse(self, input, start, *args):
    def _parse(self, start, input, *args):
        if start is None:
            start = self.start
            if len(start) > 1:
@@ -71,6 +81,18 @@ def _get_lexer_callbacks(transformer, terminals):
            result[terminal.name] = callback
    return result
class PostLexConnector:
    def __init__(self, lexer, postlexer):
        self.lexer = lexer
        self.postlexer = postlexer
    def make_lexer_state(self, text):
        return self.lexer.make_lexer_state(text)
    def lex(self, lexer_state, parser_state):
        i = self.lexer.lex(lexer_state, parser_state)
        return self.postlexer.process(i)
class WithLexer(_ParserFrontend):
    lexer = None
@@ -106,13 +128,14 @@ class WithLexer(_ParserFrontend):
    def _serialize(self, data, memo):
        data['parser'] = data['parser'].serialize(memo)
    def lex(self, *args):
        stream = self.lexer.lex(*args)
        return self.postlex.process(stream) if self.postlex else stream
    def make_lexer(self, text):
        lexer = self.lexer
        if self.postlex:
            lexer = PostLexConnector(self.lexer, self.postlex)
        return LexerThread(lexer, text)
    def parse(self, text, start=None):
        token_stream = self.lex(text)
        return self._parse(token_stream, start)
        return self._parse(start, self.make_lexer(text))
    def init_traditional_lexer(self):
        self.lexer = TraditionalLexer(self.lexer_conf)
@@ -138,14 +161,6 @@ class LALR_ContextualLexer(LALR_WithLexer):
        always_accept = self.postlex.always_accept if self.postlex else ()
        self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)
    def parse(self, text, start=None):
        parser_state = [None]
        def set_parser_state(s):
            parser_state[0] = s
        token_stream = self.lex(text, lambda: parser_state[0])
        return self._parse(token_stream, start, set_parser_state)
###}
class LALR_CustomLexer(LALR_WithLexer):
@@ -156,15 +171,6 @@ class LALR_CustomLexer(LALR_WithLexer):
        WithLexer.__init__(self, lexer_conf, parser_conf, options)
def tokenize_text(text):
    line = 1
    col_start_pos = 0
    for i, ch in enumerate(text):
        if '\n' in ch:
            line += ch.count('\n')
            col_start_pos = i + ch.rindex('\n')
        yield Token('CHAR', ch, line=line, column=i - col_start_pos)
class Earley(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        WithLexer.__init__(self, lexer_conf, parser_conf, options)
@@ -175,6 +181,9 @@ class Earley(WithLexer):
        tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)
    def make_lexer(self, text):
        return WithLexer.make_lexer(self, text).lex(None)
    def match(self, term, token):
        return term.name == token.type
@@ -219,7 +228,7 @@ class XEarley(_ParserFrontend):
            self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
    def parse(self, text, start):
        return self._parse(text, start)
        return self._parse(start, text)
class XEarley_CompleteLex(XEarley):
    def __init__(self, *args, **kw):
@@ -239,8 +248,8 @@ class CYK(WithLexer):
        self.callbacks = parser_conf.callbacks
    def parse(self, text, start):
        tokens = list(self.lex(text))
        parse = self._parse(tokens, start)
        tokens = list(self.make_lexer(text).lex(None))
        parse = self._parse(start, tokens)
        parse = self._transform(parse)
        return parse
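
A sketch of what the new __future_interface__ flag means for custom lexers (not part of the patch; the class bodies are placeholders). Without the flag, a custom lexer keeps the old lex(self, text) signature and gets wrapped by CustomLexerWrapper; with the flag, it is used directly and must accept the (lexer_state, parser_state) pair:

    from lark.lexer import Lexer, Token

    class OldStyleLexer(Lexer):               # wrapped by CustomLexerWrapper above
        def __init__(self, lexer_conf):
            pass
        def lex(self, text):
            for ch in text:
                yield Token('CHAR', ch)

    class NewStyleLexer(Lexer):               # used as-is because of the flag
        __future_interface__ = True
        def __init__(self, lexer_conf):
            pass
        def lex(self, lexer_state, parser_state):
            for ch in lexer_state.text:
                yield Token('CHAR', ch)

Either class is passed to Lark via the lexer= argument; get_frontend() above decides which path to take based on the flag.
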
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -2,9 +2,9 @@
"""
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
from ..exceptions import UnexpectedToken
from copy import deepcopy, copy
from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
from ..lexer import Token
from ..utils import Enumerator, Serialize
from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
from .lalr_puppet import ParserPuppet
@@ -35,84 +35,123 @@ class LALR_Parser(object):
        return self.parser.parse(*args)
class _Parser:
    def __init__(self, parse_table, callbacks, debug=False):
class ParseConf:
    __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'
    def __init__(self, parse_table, callbacks, start):
        self.parse_table = parse_table
        self.start_state = self.parse_table.start_states[start]
        self.end_state = self.parse_table.end_states[start]
        self.states = self.parse_table.states
        self.callbacks = callbacks
        self.debug = debug
        self.start = start
    def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None):
        token = None
        stream = iter(seq)
        states = self.parse_table.states
        start_state = self.parse_table.start_states[start]
        end_state = self.parse_table.end_states[start]
        state_stack = state_stack or [start_state]
        value_stack = value_stack or []
class ParserState:
    __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'
        if set_state: set_state(start_state)
    def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):
        self.parse_conf = parse_conf
        self.lexer = lexer
        self.state_stack = state_stack or [self.parse_conf.start_state]
        self.value_stack = value_stack or []
        def get_action(token):
    @property
    def position(self):
        return self.state_stack[-1]
    def __copy__(self):
        return type(self)(
            self.parse_conf,
            self.lexer, # XXX copy
            copy(self.state_stack),
            deepcopy(self.value_stack),
        )
    def copy(self):
        return copy(self)
    def feed_token(self, token, is_end=False):
        state_stack = self.state_stack
        value_stack = self.value_stack
        states = self.parse_conf.states
        end_state = self.parse_conf.end_state
        callbacks = self.parse_conf.callbacks
        while True:
            state = state_stack[-1]
            try:
                return states[state][token.type]
                action, arg = states[state][token.type]
            except KeyError:
                expected = {s for s in states[state].keys() if s.isupper()}
                try:
                    puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
                except NameError: # For standalone parser
                    puppet = None
                raise UnexpectedToken(token, expected, state=state, puppet=puppet)
        def reduce(rule):
            size = len(rule.expansion)
            if size:
                s = value_stack[-size:]
                del state_stack[-size:]
                del value_stack[-size:]
                raise UnexpectedToken(token, expected, state=state, puppet=None)
            assert arg != end_state
            if action is Shift:
                # shift once and return
                assert not is_end
                state_stack.append(arg)
                value_stack.append(token)
                return arg
            else:
                s = []
                # reduce+shift as many times as necessary
                rule = arg
                size = len(rule.expansion)
                if size:
                    s = value_stack[-size:]
                    del state_stack[-size:]
                    del value_stack[-size:]
                else:
                    s = []
                value = callbacks[rule](s)
                _action, new_state = states[state_stack[-1]][rule.origin.name]
                assert _action is Shift
                state_stack.append(new_state)
                value_stack.append(value)
                if is_end and state_stack[-1] == end_state:
                    return value_stack[-1]
            value = self.callbacks[rule](s)
class _Parser:
    def __init__(self, parse_table, callbacks, debug=False):
        self.parse_table = parse_table
        self.callbacks = callbacks
        self.debug = debug
            _action, new_state = states[state_stack[-1]][rule.origin.name]
            assert _action is Shift
            state_stack.append(new_state)
            value_stack.append(value)
    def parse(self, lexer, start, value_stack=None, state_stack=None):
        parse_conf = ParseConf(self.parse_table, self.callbacks, start)
        parser_state = ParserState(parse_conf, lexer, state_stack, value_stack)
        return self.parse_from_state(parser_state)
    def parse_from_state(self, state):
        # Main LALR-parser loop
        try:
            for token in stream:
                while True:
                    action, arg = get_action(token)
                    assert arg != end_state
                    if action is Shift:
                        state_stack.append(arg)
                        value_stack.append(token)
                        if set_state: set_state(arg)
                        break # next token
                    else:
                        reduce(arg)
            token = None
            for token in state.lexer.lex(state):
                state.feed_token(token)
            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
            return state.feed_token(token, True)
        except UnexpectedInput as e:
            try:
                e.puppet = ParserPuppet(self, state, state.lexer)
            except NameError:
                pass
            raise e
        except Exception as e:
            if self.debug:
                print("")
                print("STATE STACK DUMP")
                print("----------------")
                for i, s in enumerate(state_stack):
                for i, s in enumerate(state.state_stack):
                    print('%d)' % i , s)
                print("")
            raise
        token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
        while True:
            _action, arg = get_action(token)
            assert(_action is Reduce)
            reduce(arg)
            if state_stack[-1] == end_state:
                return value_stack[-1]
###}
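
A sketch of the new state objects in isolation (not part of the patch; parse_table, callbacks and lexer_thread are assumed to exist). feed_token() is the single shift/reduce step shared by the main loop above and by ParserPuppet, so the same ParserState can be driven, copied, and resumed from either place:

    from lark.lexer import Token

    conf = ParseConf(parse_table, callbacks, 'start')  # parse_table / callbacks: assumed inputs
    state = ParserState(conf, lexer_thread)            # lexer_thread: a LexerThread (assumed)
    for tok in lexer_thread.lex(state):
        state.feed_token(tok)                          # one shift, plus any pending reduces
    result = state.feed_token(Token('$END', '', 0, 1, 1), is_end=True)
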
--- a/lark/parsers/lalr_puppet.py
+++ b/lark/parsers/lalr_puppet.py
@@ -1,10 +1,10 @@
# This module provides a LALR puppet, which is used for debugging and error handling
from copy import deepcopy
from copy import copy
from .lalr_analysis import Shift, Reduce
from .. import Token
from ..exceptions import ParseError
from ..exceptions import UnexpectedToken
class ParserPuppet(object):
@@ -12,96 +12,45 @@ class ParserPuppet(object):
    For a simpler, more streamlined interface, see the ``on_error`` argument to ``Lark.parse()``.
    """
    def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
    def __init__(self, parser, parser_state, lexer_state):
        self.parser = parser
        self._state_stack = state_stack
        self._value_stack = value_stack
        self._start = start
        self._stream = stream
        self._set_state = set_state
        self.result = None
        self.parser_state = parser_state
        self.lexer_state = lexer_state
    def feed_token(self, token):
        """Feed the parser with a token, and advance it to the next state, as if it received it from the lexer.
        Note that ``token`` has to be an instance of ``Token``.
        """
        end_state = self.parser.parse_table.end_states[self._start]
        state_stack = self._state_stack
        value_stack = self._value_stack
        state = state_stack[-1]
        action, arg = self.parser.parse_table.states[state][token.type]
        if arg == end_state:
            raise ParseError(arg)
        while action is Reduce:
            rule = arg
            size = len(rule.expansion)
            if size:
                s = value_stack[-size:]
                del state_stack[-size:]
                del value_stack[-size:]
            else:
                s = []
            value = self.parser.callbacks[rule](s)
            _action, new_state = self.parser.parse_table.states[state_stack[-1]][rule.origin.name]
            assert _action is Shift
            state_stack.append(new_state)
            value_stack.append(value)
            if state_stack[-1] == end_state:
                self.result = value_stack[-1]
                return self.result
            state = state_stack[-1]
        try:
            action, arg = self.parser.parse_table.states[state][token.type]
        except KeyError as e:
            raise ParseError(e)
        assert arg != end_state
        assert action is Shift
        state_stack.append(arg)
        value_stack.append(token)
    def copy(self):
        return self.parser_state.feed_token(token)
    def __copy__(self):
        """Create a new puppet with a separate state.
        Calls to feed_token() won't affect the old puppet, and vice-versa.
        """
        return type(self)(
            self.parser,
            list(self._state_stack),
            deepcopy(self._value_stack),
            self._start,
            self._stream,
            self._set_state,
            copy(self.parser_state),
            copy(self.lexer_state),
        )
    def __eq__(self, other):
        if not isinstance(other, ParserPuppet):
            return False
        return (
            self._state_stack == other._state_stack and
            self._value_stack == other._value_stack and
            self._stream == other._stream and
            self._start == other._start
        )
        return self.parser_state == other.parser_state and self.lexer_state == other.lexer_state
    def __hash__(self):
        return hash((tuple(self._state_stack), self._start))
    # TODO Provide with an immutable puppet instance
    # def __hash__(self):
    #     return hash((self.parser_state, self.lexer_state))
    def pretty(self):
        """Print the output of ``choices()`` in a way that's easier to read."""
        out = ["Puppet choices:"]
        for k, v in self.choices().items():
            out.append('\t- %s -> %s' % (k, v))
        out.append('stack size: %s' % len(self._state_stack))
        out.append('stack size: %s' % len(self.parser_state.state_stack))
        return '\n'.join(out)
    def choices(self):
@@ -111,16 +60,16 @@ class ParserPuppet(object):
        Updated by ``feed_token()``.
        """
        return self.parser.parse_table.states[self._state_stack[-1]]
        return self.parser_state.parse_table.states[self.parser_state.position]
    def accepts(self):
        accepts = set()
        for t in self.choices():
            if t.isupper(): # is terminal?
                new_puppet = self.copy()
                new_puppet = copy(self)
                try:
                    new_puppet.feed_token(Token(t, ''))
                except ParseError:
                except UnexpectedToken:
                    pass
                else:
                    accepts.add(t)
@@ -128,7 +77,4 @@ class ParserPuppet(object):
    def resume_parse(self):
        """Resume parsing from the current puppet state."""
        return self.parser.parse(
            self._stream, self._start, self._set_state,
            self._value_stack, self._state_stack
        )
        return self.parser.parse_from_state(self.parser_state)
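
A sketch of probing the puppet without committing to a token (not part of the patch; 'RSQB' is a hypothetical terminal name used only for illustration). Because __copy__ now forks both the parser state and the lexer state, a copied puppet can be fed speculative tokens without affecting the original, which is exactly what accepts() does internally:

    from copy import copy
    from lark import Token

    probe = copy(e.puppet)              # e: an UnexpectedToken caught from a LALR parse (assumed)
    if 'RSQB' in probe.accepts():       # would a closing bracket be accepted here?
        e.puppet.feed_token(Token('RSQB', ']'))
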
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -2217,6 +2217,42 @@ def _make_parser_test(LEXER, PARSER):
            """, regex=True)
            self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
        @unittest.skipIf(PARSER!='lalr', "Puppet error handling only works with LALR for now")
        def test_error_with_puppet(self):
            def ignore_errors(e):
                if isinstance(e, UnexpectedCharacters):
                    # Skip bad character
                    return True
                # Must be UnexpectedToken
                if e.token.type == 'COMMA':
                    # Skip comma
                    return True
                elif e.token.type == 'SIGNED_NUMBER':
                    # Try to feed a comma and retry the number
                    e.puppet.feed_token(Token('COMMA', ','))
                    e.puppet.feed_token(e.token)
                    return True
                # Unhandled error. Will stop parse and raise exception
                return False
            g = _Lark(r'''
                start: "[" num ("," num)* "]"
                ?num: SIGNED_NUMBER
                %import common.SIGNED_NUMBER
                %ignore " "
            ''')
            s = "[0 1, 2,, 3,,, 4, 5 6 ]"
            tree = g.parse(s, on_error=ignore_errors)
            res = [int(x) for x in tree.children]
            assert res == list(range(7))
            s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
            tree = g.parse(s, on_error=ignore_errors)
    _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
    _TestParser.__name__ = _NAME
    _TestParser.__qualname__ = "tests.test_parser." + _NAME