@@ -1,4 +1,5 @@
 from __future__ import absolute_import
+from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
 import sys, os, pickle, hashlib
 from io import open
@@ -9,7 +10,7 @@ from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf
-from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken
+from .lexer import Lexer, TraditionalLexer, TerminalDef
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend, _get_lexer_callbacks
 from .grammar import Rule
@@ -462,7 +463,7 @@ class Lark(Serialize):
         try:
             return self.parser.parse(text, start=start)
-        except UnexpectedToken as e:
+        except UnexpectedInput as e:
             if on_error is None:
                 raise
@@ -472,10 +473,12 @@ class Lark(Serialize):
                 try:
                     return e.puppet.resume_parse()
                 except UnexpectedToken as e2:
-                    if e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
+                    if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
                         # Prevent infinite loop
                         raise e2
                     e = e2
+                except UnexpectedCharacters as e2:
+                    e = e2
 ###}
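Catching `UnexpectedInput`, the common base of `UnexpectedToken` and `UnexpectedCharacters`, means the `on_error` callback can now recover from bad characters as well as bad tokens. A hedged usage sketch, reusing the grammar of the new test at the end of this changeset (the handler and input below are illustrative, not part of the diff):

```python
from lark import Lark
from lark.exceptions import UnexpectedCharacters, UnexpectedToken

parser = Lark(r'''
    start: "[" num ("," num)* "]"
    ?num: SIGNED_NUMBER
    %import common.SIGNED_NUMBER
    %ignore " "
''', parser='lalr')   # puppet-based recovery currently only works with LALR

def skip_junk(e):
    if isinstance(e, UnexpectedCharacters):
        return True    # lexer position was already advanced past the bad character
    if isinstance(e, UnexpectedToken) and e.token.type == 'COMMA':
        return True    # drop the stray comma and resume
    return False       # anything else: re-raise

tree = parser.parse("[1, 2,@ 3,, 4]", on_error=skip_junk)
print(tree.children)   # expected: the four number tokens
```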
@@ -157,6 +157,8 @@ class Token(Str):
 class LineCounter:
+    __slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char'
     def __init__(self, newline_char):
         self.newline_char = newline_char
         self.char_pos = 0
@@ -167,7 +169,7 @@ class LineCounter:
     def feed(self, token, test_newline=True):
         """Consume a token and calculate the new line & column.
-        As an optional optimization, set test_newline=False is token doesn't contain a newline.
+        As an optional optimization, set test_newline=False if token doesn't contain a newline.
         """
         if test_newline:
             newlines = token.count(self.newline_char)
@@ -243,7 +245,6 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes)
         except AssertionError:  # Yes, this is what Python provides us.. :/
             return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)
-        # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres
@@ -269,6 +270,10 @@ class Lexer(object):
     """
     lex = NotImplemented
+    def make_lexer_state(self, text):
+        line_ctr = LineCounter(b'\n' if isinstance(text, bytes) else '\n')
+        return LexerState(text, line_ctr)
 class TraditionalLexer(Lexer):
@@ -328,26 +333,21 @@ class TraditionalLexer(Lexer):
         if m:
             return m.group(0), type_from_index[m.lastindex]
-    def make_lexer_state(self, text):
-        line_ctr = LineCounter('\n' if not self.use_bytes else b'\n')
-        return LexerState(text, line_ctr)
-    def lex(self, text):
-        state = self.make_lexer_state(text)
+    def lex(self, state, parser_state):
         with suppress(EOFError):
             while True:
                 yield self.next_token(state)
     def next_token(self, lex_state):
-        text = lex_state.text
         line_ctr = lex_state.line_ctr
-        while line_ctr.char_pos < len(text):
-            res = self.match(text, line_ctr.char_pos)
+        while line_ctr.char_pos < len(lex_state.text):
+            res = self.match(lex_state.text, line_ctr.char_pos)
             if not res:
                 allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
                 if not allowed:
                     allowed = {"<END-OF-FILE>"}
-                raise UnexpectedCharacters(text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
+                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
+                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
             value, type_ = res
@@ -373,11 +373,15 @@ class TraditionalLexer(Lexer):
         raise EOFError(self)
 class LexerState:
+    __slots__ = 'text', 'line_ctr', 'last_token'
     def __init__(self, text, line_ctr, last_token=None):
         self.text = text
         self.line_ctr = line_ctr
         self.last_token = last_token
+    def __copy__(self):
+        return type(self)(self.text, copy(self.line_ctr), self.last_token)
 class ContextualLexer(Lexer):
@@ -410,24 +414,29 @@ class ContextualLexer(Lexer):
         assert trad_conf.tokens is terminals
         self.root_lexer = TraditionalLexer(trad_conf)
-    def lex(self, text, get_parser_state):
-        state = self.root_lexer.make_lexer_state(text)
+    def make_lexer_state(self, text):
+        return self.root_lexer.make_lexer_state(text)
+    def lex(self, lexer_state, parser_state):
         try:
             while True:
-                lexer = self.lexers[get_parser_state()]
-                yield lexer.next_token(state)
+                lexer = self.lexers[parser_state.position]
+                yield lexer.next_token(lexer_state)
         except EOFError:
             pass
         except UnexpectedCharacters as e:
-            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
-            # but not in the current context.
+            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
             # This tests the input against the global context, to provide a nicer error.
-            root_match = self.root_lexer.match(text, e.pos_in_stream)
-            if not root_match:
-                raise
+            token = self.root_lexer.next_token(lexer_state)
+            raise UnexpectedToken(token, e.allowed, state=parser_state.position)
+class LexerThread:
+    "A thread that ties a lexer instance and a lexer state, to be used by the parser"
-            value, type_ = root_match
-            t = Token(type_, value, e.pos_in_stream, e.line, e.column)
-            raise UnexpectedToken(t, e.allowed, state=get_parser_state())
+    def __init__(self, lexer, text):
+        self.lexer = lexer
+        self.state = lexer.make_lexer_state(text)
+    def lex(self, parser_state):
+        return self.lexer.lex(self.state, parser_state)
 ###}
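The hunks above reshape the lexer's internal contract: a lexer now builds an explicit `LexerState` via `make_lexer_state(text)`, its `lex()` method receives `(lexer_state, parser_state)` instead of raw text, and `LexerThread` binds a lexer to one such state so the parser can drive lexing and the whole pair can be copied during error recovery. A minimal sketch of that protocol with a toy lexer (the class and token type are invented for illustration; user-facing custom lexers keep the old `lex(text)` signature via `CustomLexerWrapper` in the next file):

```python
from lark.lexer import Lexer, LexerThread, Token

class UpperWordLexer(Lexer):
    """Toy lexer: one WORD token per whitespace-separated chunk (illustration only)."""
    def __init__(self, lexer_conf=None):
        pass
    # make_lexer_state() is inherited from Lexer: it wraps the text in a LexerState
    # with a fresh LineCounter.
    def lex(self, lexer_state, parser_state):
        # parser_state enables context-dependent lexing (ContextualLexer reads
        # parser_state.position); this toy lexer simply ignores it.
        for word in lexer_state.text.split():
            yield Token('WORD', word.upper())

thread = LexerThread(UpperWordLexer(), "hello lexer thread")
print([t.value for t in thread.lex(parser_state=None)])   # ['HELLO', 'LEXER', 'THREAD']
```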
@@ -1,6 +1,6 @@
 from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
+from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
@@ -23,12 +23,18 @@ def get_frontend(parser, lexer):
         elif lexer == 'contextual':
             return LALR_ContextualLexer
         elif issubclass(lexer, Lexer):
+            class CustomLexerWrapper(Lexer):
+                def __init__(self, lexer_conf):
+                    self.lexer = lexer(lexer_conf)
+                def lex(self, lexer_state, parser_state):
+                    return self.lexer.lex(lexer_state.text)
             class LALR_CustomLexerWrapper(LALR_CustomLexer):
                 def __init__(self, lexer_conf, parser_conf, options=None):
                     super(LALR_CustomLexerWrapper, self).__init__(
                         lexer, lexer_conf, parser_conf, options=options)
                 def init_lexer(self):
-                    self.lexer = lexer(self.lexer_conf)
+                    self.lexer = CustomLexerWrapper(self.lexer_conf)
             return LALR_CustomLexerWrapper
         else:
@@ -54,7 +60,7 @@ def get_frontend(parser, lexer):
 class _ParserFrontend(Serialize):
-    def _parse(self, input, start, *args):
+    def _parse(self, start, input, *args):
         if start is None:
             start = self.start
             if len(start) > 1:
@@ -71,6 +77,18 @@ def _get_lexer_callbacks(transformer, terminals):
             result[terminal.name] = callback
     return result
+class PostLexConnector:
+    def __init__(self, lexer, postlexer):
+        self.lexer = lexer
+        self.postlexer = postlexer
+    def make_lexer_state(self, text):
+        return self.lexer.make_lexer_state(text)
+    def lex(self, lexer_state, parser_state):
+        i = self.lexer.lex(lexer_state, parser_state)
+        return self.postlexer.process(i)
 class WithLexer(_ParserFrontend):
     lexer = None
@@ -106,13 +124,14 @@ class WithLexer(_ParserFrontend):
     def _serialize(self, data, memo):
         data['parser'] = data['parser'].serialize(memo)
-    def lex(self, *args):
-        stream = self.lexer.lex(*args)
-        return self.postlex.process(stream) if self.postlex else stream
+    def make_lexer(self, text):
+        lexer = self.lexer
+        if self.postlex:
+            lexer = PostLexConnector(self.lexer, self.postlex)
+        return LexerThread(lexer, text)
     def parse(self, text, start=None):
-        token_stream = self.lex(text)
-        return self._parse(token_stream, start)
+        return self._parse(start, self.make_lexer(text))
     def init_traditional_lexer(self):
         self.lexer = TraditionalLexer(self.lexer_conf)
@@ -138,14 +157,6 @@ class LALR_ContextualLexer(LALR_WithLexer):
         always_accept = self.postlex.always_accept if self.postlex else ()
         self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)
-    def parse(self, text, start=None):
-        parser_state = [None]
-        def set_parser_state(s):
-            parser_state[0] = s
-        token_stream = self.lex(text, lambda: parser_state[0])
-        return self._parse(token_stream, start, set_parser_state)
 ###}
 class LALR_CustomLexer(LALR_WithLexer):
@@ -156,15 +167,6 @@ class LALR_CustomLexer(LALR_WithLexer):
         WithLexer.__init__(self, lexer_conf, parser_conf, options)
-def tokenize_text(text):
-    line = 1
-    col_start_pos = 0
-    for i, ch in enumerate(text):
-        if '\n' in ch:
-            line += ch.count('\n')
-            col_start_pos = i + ch.rindex('\n')
-        yield Token('CHAR', ch, line=line, column=i - col_start_pos)
 class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         WithLexer.__init__(self, lexer_conf, parser_conf, options)
@@ -175,6 +177,9 @@ class Earley(WithLexer):
         tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
         self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)
+    def make_lexer(self, text):
+        return WithLexer.make_lexer(self, text).lex(None)
     def match(self, term, token):
         return term.name == token.type
@@ -219,7 +224,7 @@ class XEarley(_ParserFrontend):
             self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
     def parse(self, text, start):
-        return self._parse(text, start)
+        return self._parse(start, text)
 class XEarley_CompleteLex(XEarley):
     def __init__(self, *args, **kw):
@@ -239,8 +244,8 @@ class CYK(WithLexer):
         self.callbacks = parser_conf.callbacks
     def parse(self, text, start):
-        tokens = list(self.lex(text))
-        parse = self._parse(tokens, start)
+        tokens = list(self.make_lexer(text).lex(None))
+        parse = self._parse(start, tokens)
         parse = self._transform(parse)
         return parse
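User-supplied custom lexers keep the documented `lex(self, text)` contract; `CustomLexerWrapper` adapts them to the new `(lexer_state, parser_state)` signature, and `PostLexConnector` does the same for post-lexers. A hedged sketch in the spirit of Lark's custom-lexer example (the grammar, class name and token type here are illustrative):

```python
from lark import Lark, Token
from lark.lexer import Lexer

class CommaSeparatedLexer(Lexer):
    """Old-style custom lexer: receives plain text, yields Tokens."""
    def __init__(self, lexer_conf):
        pass
    def lex(self, text):
        for item in text.split(','):
            yield Token('ITEM', item.strip())

parser = Lark("""
    start: ITEM+
    %declare ITEM
""", parser='lalr', lexer=CommaSeparatedLexer)

print(parser.parse("a, b, c").children)   # three ITEM tokens
```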
@@ -2,9 +2,9 @@
 """
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
-from ..exceptions import UnexpectedToken
+from copy import deepcopy
+from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
 from ..lexer import Token
-from ..utils import Enumerator, Serialize
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 from .lalr_puppet import ParserPuppet
@@ -35,84 +35,116 @@ class LALR_Parser(object):
         return self.parser.parse(*args)
-class _Parser:
-    def __init__(self, parse_table, callbacks, debug=False):
-        self.parse_table = parse_table
-        self.callbacks = callbacks
-        self.debug = debug
+class ParserState:
+    __slots__ = 'parse_table', 'lexer', 'callbacks', 'start', 'state_stack', 'value_stack', 'start_state', 'end_state', 'states'
-    def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None):
-        token = None
-        stream = iter(seq)
-        states = self.parse_table.states
-        start_state = self.parse_table.start_states[start]
-        end_state = self.parse_table.end_states[start]
+    def __init__(self, parse_table, lexer, callbacks, start, state_stack=None, value_stack=None):
+        self.parse_table = parse_table
-        state_stack = state_stack or [start_state]
-        value_stack = value_stack or []
+        self.start_state = self.parse_table.start_states[start]
+        self.end_state = self.parse_table.end_states[start]
+        self.states = self.parse_table.states
-        if set_state: set_state(start_state)
+        self.lexer = lexer
+        self.callbacks = callbacks
+        self.start = start
+        self.state_stack = state_stack or [self.start_state]
+        self.value_stack = value_stack or []
+    @property
+    def position(self):
+        return self.state_stack[-1]
+    def __copy__(self):
+        return type(self)(
+            self.parse_table,
+            self.lexer, # XXX copy
+            self.callbacks,
+            self.start,
+            list(self.state_stack),
+            deepcopy(self.value_stack),
+        )
+    def feed_token(self, token, is_end=False):
+        state_stack = self.state_stack
+        value_stack = self.value_stack
+        states = self.states
-        def get_action(token):
+        while True:
             state = state_stack[-1]
             try:
-                return states[state][token.type]
+                action, arg = states[state][token.type]
             except KeyError:
                 expected = {s for s in states[state].keys() if s.isupper()}
-                try:
-                    puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
-                except NameError: # For standalone parser
-                    puppet = None
-                raise UnexpectedToken(token, expected, state=state, puppet=puppet)
-        def reduce(rule):
-            size = len(rule.expansion)
-            if size:
-                s = value_stack[-size:]
-                del state_stack[-size:]
-                del value_stack[-size:]
+                raise UnexpectedToken(token, expected, state=state, puppet=None)
+            assert arg != self.end_state
+            if action is Shift:
+                # shift once and return
+                assert not is_end
+                state_stack.append(arg)
+                value_stack.append(token)
+                return arg
             else:
-                s = []
+                # reduce+shift as many times as necessary
+                rule = arg
+                size = len(rule.expansion)
+                if size:
+                    s = value_stack[-size:]
+                    del state_stack[-size:]
+                    del value_stack[-size:]
+                else:
+                    s = []
+                value = self.callbacks[rule](s)
+                _action, new_state = states[state_stack[-1]][rule.origin.name]
+                assert _action is Shift
+                state_stack.append(new_state)
+                value_stack.append(value)
+                if is_end and state_stack[-1] == self.end_state:
+                    return value_stack[-1]
-            value = self.callbacks[rule](s)
+class _Parser:
+    def __init__(self, parse_table, callbacks, debug=False):
+        self.parse_table = parse_table
+        self.callbacks = callbacks
+        self.debug = debug
-            _action, new_state = states[state_stack[-1]][rule.origin.name]
-            assert _action is Shift
-            state_stack.append(new_state)
-            value_stack.append(value)
+    def parse(self, lexer, start, value_stack=None, state_stack=None):
+        parser_state = ParserState(self.parse_table, lexer, self.callbacks, start, state_stack, value_stack)
+        return self.parse_from_state(parser_state)
+    def parse_from_state(self, state):
         # Main LALR-parser loop
         try:
-            for token in stream:
-                while True:
-                    action, arg = get_action(token)
-                    assert arg != end_state
-                    if action is Shift:
-                        state_stack.append(arg)
-                        value_stack.append(token)
-                        if set_state: set_state(arg)
-                        break # next token
-                    else:
-                        reduce(arg)
+            token = None
+            for token in state.lexer.lex(state):
+                state.feed_token(token)
+            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+            return state.feed_token(token, True)
+        except UnexpectedInput as e:
+            try:
+                e.puppet = ParserPuppet(self, state, state.lexer)
+            except NameError:
+                pass
+            if isinstance(e, UnexpectedCharacters):
+                s = state.lexer.state
+                p = s.line_ctr.char_pos
+                s.line_ctr.feed(s.text[p:p+1])
+            raise e
         except Exception as e:
             if self.debug:
                 print("")
                 print("STATE STACK DUMP")
                 print("----------------")
-                for i, s in enumerate(state_stack):
+                for i, s in enumerate(state.state_stack):
                     print('%d)' % i , s)
                 print("")
             raise
-        token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
-        while True:
-            _action, arg = get_action(token)
-            assert(_action is Reduce)
-            reduce(arg)
-            if state_stack[-1] == end_state:
-                return value_stack[-1]
 ###}
@@ -1,10 +1,10 @@
 # This module provide a LALR puppet, which is used to debugging and error handling
-from copy import deepcopy
+from copy import copy
 from .lalr_analysis import Shift, Reduce
 from .. import Token
-from ..exceptions import ParseError
+from ..exceptions import UnexpectedToken
 class ParserPuppet(object):
@@ -12,96 +12,44 @@ class ParserPuppet(object):
     For a simpler, more streamlined interface, see the ``on_error`` argument to ``Lark.parse()``.
     """
-    def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
+    def __init__(self, parser, parser_state, lexer_state):
         self.parser = parser
-        self._state_stack = state_stack
-        self._value_stack = value_stack
-        self._start = start
-        self._stream = stream
-        self._set_state = set_state
-        self.result = None
+        self.parser_state = parser_state
+        self.lexer_state = lexer_state
     def feed_token(self, token):
         """Feed the parser with a token, and advance it to the next state, as if it recieved it from the lexer.
         Note that ``token`` has to be an instance of ``Token``.
         """
-        end_state = self.parser.parse_table.end_states[self._start]
-        state_stack = self._state_stack
-        value_stack = self._value_stack
-        state = state_stack[-1]
-        action, arg = self.parser.parse_table.states[state][token.type]
-        if arg == end_state:
-            raise ParseError(arg)
-        while action is Reduce:
-            rule = arg
-            size = len(rule.expansion)
-            if size:
-                s = value_stack[-size:]
-                del state_stack[-size:]
-                del value_stack[-size:]
-            else:
-                s = []
-            value = self.parser.callbacks[rule](s)
-            _action, new_state = self.parser.parse_table.states[state_stack[-1]][rule.origin.name]
-            assert _action is Shift
-            state_stack.append(new_state)
-            value_stack.append(value)
-            if state_stack[-1] == end_state:
-                self.result = value_stack[-1]
-                return self.result
-            state = state_stack[-1]
-            try:
-                action, arg = self.parser.parse_table.states[state][token.type]
-            except KeyError as e:
-                raise ParseError(e)
-        assert arg != end_state
-        assert action is Shift
-        state_stack.append(arg)
-        value_stack.append(token)
-    def copy(self):
+        return self.parser_state.feed_token(token)
+    def __copy__(self):
         """Create a new puppet with a separate state.
         Calls to feed_token() won't affect the old puppet, and vice-versa.
         """
         return type(self)(
             self.parser,
-            list(self._state_stack),
-            deepcopy(self._value_stack),
-            self._start,
-            self._stream,
-            self._set_state,
+            copy(self.parser_state),
+            copy(self.lexer_state),
         )
     def __eq__(self, other):
         if not isinstance(other, ParserPuppet):
             return False
-        return (
-            self._state_stack == other._state_stack and
-            self._value_stack == other._value_stack and
-            self._stream == other._stream and
-            self._start == other._start
-        )
+        return self.parser_state == other.parser_state and self.lexer_state == other.lexer_state
     def __hash__(self):
-        return hash((tuple(self._state_stack), self._start))
+        return hash((self.parser_state, self.lexer_state))
     def pretty(self):
         """Print the output of ``choices()`` in a way that's easier to read."""
         out = ["Puppet choices:"]
         for k, v in self.choices().items():
             out.append('\t- %s -> %s' % (k, v))
-        out.append('stack size: %s' % len(self._state_stack))
+        out.append('stack size: %s' % len(self.parser_state.state_stack))
         return '\n'.join(out)
     def choices(self):
@@ -111,16 +59,16 @@ class ParserPuppet(object):
         Updated by ``feed_token()``.
         """
-        return self.parser.parse_table.states[self._state_stack[-1]]
+        return self.parser_state.parse_table.states[self.parser_state.position]
     def accepts(self):
         accepts = set()
         for t in self.choices():
             if t.isupper(): # is terminal?
-                new_puppet = self.copy()
+                new_puppet = copy(self)
                 try:
                     new_puppet.feed_token(Token(t, ''))
-                except ParseError:
+                except UnexpectedToken:
                     pass
                 else:
                     accepts.add(t)
@@ -128,7 +76,4 @@ class ParserPuppet(object):
     def resume_parse(self):
         """Resume parsing from the current puppet state."""
-        return self.parser.parse(
-            self._stream, self._start, self._set_state,
-            self._value_stack, self._state_stack
-        )
+        return self.parser.parse_from_state(self.parser_state)
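With the puppet reduced to a thin view over `ParserState` plus `LexerState`, the typical interaction goes through `Lark.parse(..., on_error=...)`: inspect `accepts()`, inject tokens with `feed_token()`, and let `resume_parse()` continue. A hedged sketch mirroring the SIGNED_NUMBER branch of the new test below (the handler and input are illustrative):

```python
from lark import Lark, Token

parser = Lark(r'''
    start: "[" num ("," num)* "]"
    ?num: SIGNED_NUMBER
    %import common.SIGNED_NUMBER
    %ignore " "
''', parser='lalr')

def insert_missing_comma(e):
    # e.puppet wraps the live ParserState and LexerState at the point of failure
    if getattr(e, 'token', None) is not None and e.token.type == 'SIGNED_NUMBER':
        if 'COMMA' in e.puppet.accepts():   # probe a copy before touching the real state
            e.puppet.feed_token(Token('COMMA', ','))
            e.puppet.feed_token(e.token)
            return True                     # resume_parse() picks up from here
    return False

tree = parser.parse("[0 1, 2]", on_error=insert_missing_comma)
print([int(x) for x in tree.children])      # expected: [0, 1, 2]
```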
@@ -2217,6 +2217,42 @@ def _make_parser_test(LEXER, PARSER):
             """, regex=True)
             self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
+        @unittest.skipIf(PARSER!='lalr', "Puppet error handling only works with LALR for now")
+        def test_error_with_puppet(self):
+            def ignore_errors(e):
+                if isinstance(e, UnexpectedCharacters):
+                    # Skip bad character
+                    return True
+                # Must be UnexpectedToken
+                if e.token.type == 'COMMA':
+                    # Skip comma
+                    return True
+                elif e.token.type == 'SIGNED_NUMBER':
+                    # Try to feed a comma and retry the number
+                    e.puppet.feed_token(Token('COMMA', ','))
+                    e.puppet.feed_token(e.token)
+                    return True
+                # Unhandled error. Will stop parse and raise exception
+                return False
+            g = _Lark(r'''
+                start: "[" num ("," num)* "]"
+                ?num: SIGNED_NUMBER
+                %import common.SIGNED_NUMBER
+                %ignore " "
+            ''')
+            s = "[0 1, 2,, 3,,, 4, 5 6 ]"
+            tree = g.parse(s, on_error=ignore_errors)
+            res = [int(x) for x in tree.children]
+            assert res == list(range(7))
+            s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
+            tree = g.parse(s, on_error=ignore_errors)
 _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
 _TestParser.__name__ = _NAME
 _TestParser.__qualname__ = "tests.test_parser." + _NAME