@@ -1,4 +1,5 @@ | |||||
from __future__ import absolute_import | from __future__ import absolute_import | ||||
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken | |||||
import sys, os, pickle, hashlib | import sys, os, pickle, hashlib | ||||
from io import open | from io import open | ||||
@@ -9,7 +10,7 @@ from .load_grammar import load_grammar | |||||
from .tree import Tree | from .tree import Tree | ||||
from .common import LexerConf, ParserConf | from .common import LexerConf, ParserConf | ||||
from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken | |||||
from .lexer import Lexer, TraditionalLexer, TerminalDef | |||||
from .parse_tree_builder import ParseTreeBuilder | from .parse_tree_builder import ParseTreeBuilder | ||||
from .parser_frontends import get_frontend, _get_lexer_callbacks | from .parser_frontends import get_frontend, _get_lexer_callbacks | ||||
from .grammar import Rule | from .grammar import Rule | ||||
@@ -462,20 +463,32 @@ class Lark(Serialize): | |||||
try: | try: | ||||
return self.parser.parse(text, start=start) | return self.parser.parse(text, start=start) | ||||
except UnexpectedToken as e: | |||||
except UnexpectedInput as e: | |||||
if on_error is None: | if on_error is None: | ||||
raise | raise | ||||
while True: | while True: | ||||
if isinstance(e, UnexpectedCharacters): | |||||
s = e.puppet.lexer_state.state | |||||
p = s.line_ctr.char_pos | |||||
if not on_error(e): | if not on_error(e): | ||||
raise e | raise e | ||||
if isinstance(e, UnexpectedCharacters): | |||||
# If user didn't change the character position, then we should advance past the offending character ourselves (otherwise resuming would hit it again) | |||||
if p == s.line_ctr.char_pos: | |||||
s.line_ctr.feed(s.text[p:p+1]) | |||||
try: | try: | ||||
return e.puppet.resume_parse() | return e.puppet.resume_parse() | ||||
except UnexpectedToken as e2: | except UnexpectedToken as e2: | ||||
if e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet: | |||||
if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet: | |||||
# Prevent infinite loop | # Prevent infinite loop | ||||
raise e2 | raise e2 | ||||
e = e2 | e = e2 | ||||
except UnexpectedCharacters as e2: | |||||
e = e2 | |||||
###} | ###} |
@@ -2,7 +2,7 @@ | |||||
import re | import re | ||||
from .utils import Str, classify, get_regexp_width, Py36, Serialize | |||||
from .utils import Str, classify, get_regexp_width, Py36, Serialize, suppress | |||||
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken | from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken | ||||
###{standalone | ###{standalone | ||||
@@ -157,6 +157,8 @@ class Token(Str): | |||||
class LineCounter: | class LineCounter: | ||||
__slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char' | |||||
def __init__(self, newline_char): | def __init__(self, newline_char): | ||||
self.newline_char = newline_char | self.newline_char = newline_char | ||||
self.char_pos = 0 | self.char_pos = 0 | ||||
@@ -167,7 +169,7 @@ class LineCounter: | |||||
def feed(self, token, test_newline=True): | def feed(self, token, test_newline=True): | ||||
"""Consume a token and calculate the new line & column. | """Consume a token and calculate the new line & column. | ||||
As an optional optimization, set test_newline=False is token doesn't contain a newline. | |||||
As an optional optimization, set test_newline=False if token doesn't contain a newline. | |||||
""" | """ | ||||
if test_newline: | if test_newline: | ||||
newlines = token.count(self.newline_char) | newlines = token.count(self.newline_char) | ||||
@@ -178,49 +180,6 @@ class LineCounter: | |||||
self.char_pos += len(token) | self.char_pos += len(token) | ||||
self.column = self.char_pos - self.line_start_pos + 1 | self.column = self.char_pos - self.line_start_pos + 1 | ||||
class _Lex:
    """Token generator built to serve both Lexer and ContextualLexer.

    ``lexer`` is the object whose ``match``, ``mres``, ``callback`` and ``use_bytes``
    are consulted; ``state`` is only forwarded to ``UnexpectedCharacters``.
    """

    def __init__(self, lexer, state=None):
        self.lexer = lexer
        self.state = state

    def lex(self, stream, newline_types, ignore_types):
        """Yield the tokens found in ``stream``, one at a time.

        Terminals listed in ``ignore_types`` are consumed but not yielded; their
        callback (if any) is still invoked. ``newline_types`` names the terminals
        for which the line counter must look for newlines.

        Raises:
            UnexpectedCharacters: no terminal matches at the current position.
            ValueError: a callback for a yielded terminal returned a non-Token.
        """
        newline_types = frozenset(newline_types)
        ignore_types = frozenset(ignore_types)
        line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
        last_token = None

        while line_ctr.char_pos < len(stream):
            # Re-read on every iteration: the owner may replace self.lexer between
            # two yielded tokens (the contextual lexer does exactly that).
            lexer = self.lexer
            res = lexer.match(stream, line_ctr.char_pos)
            if not res:
                allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types
                if not allowed:
                    allowed = {"<END-OF-FILE>"}
                raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])

            value, type_ = res

            if type_ not in ignore_types:
                # The token records where it starts; the end position is filled in
                # after the counter has consumed its text.
                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                line_ctr.feed(value, type_ in newline_types)
                t.end_line = line_ctr.line
                t.end_column = line_ctr.column
                t.end_pos = line_ctr.char_pos
                if t.type in lexer.callback:
                    t = lexer.callback[t.type](t)
                    if not isinstance(t, Token):
                        raise ValueError("Callbacks must return a token (returned %r)" % t)
                yield t
                last_token = t
            else:
                if type_ in lexer.callback:
                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                    lexer.callback[type_](t2)
                line_ctr.feed(value, type_ in newline_types)
class UnlessCallback: | class UnlessCallback: | ||||
def __init__(self, mres): | def __init__(self, mres): | ||||
@@ -286,7 +245,6 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes) | |||||
except AssertionError: # Yes, this is what Python provides us.. :/ | except AssertionError: # Yes, this is what Python provides us.. :/ | ||||
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes) | return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes) | ||||
# terms_from_name = {t.name: t for t in terminals[:max_size]} | |||||
mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) | mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) | ||||
terminals = terminals[max_size:] | terminals = terminals[max_size:] | ||||
return mres | return mres | ||||
@@ -308,10 +266,14 @@ class Lexer(object): | |||||
"""Lexer interface | """Lexer interface | ||||
Method Signatures: | Method Signatures: | ||||
lex(self, stream) -> Iterator[Token] | |||||
lex(self, text) -> Iterator[Token] | |||||
""" | """ | ||||
lex = NotImplemented | lex = NotImplemented | ||||
def make_lexer_state(self, text):
    """Create the ``LexerState`` used to lex ``text``.

    The ``LineCounter`` is given ``b'\\n'`` as its newline marker when ``text``
    is a ``bytes`` object, and ``'\\n'`` otherwise.
    """
    newline_char = b'\n' if isinstance(text, bytes) else '\n'
    line_ctr = LineCounter(newline_char)
    return LexerState(text, line_ctr)
class TraditionalLexer(Lexer): | class TraditionalLexer(Lexer): | ||||
@@ -335,8 +297,8 @@ class TraditionalLexer(Lexer): | |||||
assert set(conf.ignore) <= {t.name for t in terminals} | assert set(conf.ignore) <= {t.name for t in terminals} | ||||
# Init | # Init | ||||
self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())] | |||||
self.ignore_types = list(conf.ignore) | |||||
self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())) | |||||
self.ignore_types = frozenset(conf.ignore) | |||||
terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) | terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) | ||||
self.terminals = terminals | self.terminals = terminals | ||||
@@ -345,7 +307,6 @@ class TraditionalLexer(Lexer): | |||||
self.use_bytes = conf.use_bytes | self.use_bytes = conf.use_bytes | ||||
self._mres = None | self._mres = None | ||||
# self.build(g_regex_flags) | |||||
def _build(self): | def _build(self): | ||||
terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes) | terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes) | ||||
@@ -366,17 +327,61 @@ class TraditionalLexer(Lexer): | |||||
self._build() | self._build() | ||||
return self._mres | return self._mres | ||||
def match(self, stream, pos): | |||||
def match(self, text, pos): | |||||
for mre, type_from_index in self.mres: | for mre, type_from_index in self.mres: | ||||
m = mre.match(stream, pos) | |||||
m = mre.match(text, pos) | |||||
if m: | if m: | ||||
return m.group(0), type_from_index[m.lastindex] | return m.group(0), type_from_index[m.lastindex] | ||||
def lex(self, stream): | |||||
return _Lex(self).lex(stream, self.newline_types, self.ignore_types) | |||||
def lex(self, state, parser_state):
    """Yield tokens from the lexer state ``state`` until the input is exhausted.

    ``parser_state`` is accepted for interface parity with the contextual lexer
    and is not used here. End of input is signalled by ``next_token`` raising
    ``EOFError``, which simply ends the generator.
    """
    with suppress(EOFError):
        while True:
            yield self.next_token(state)
def next_token(self, lex_state):
    """Return the next non-ignored token, advancing ``lex_state`` past it.

    Ignored terminals are consumed silently (their callback, if any, is still
    invoked). On success ``lex_state.last_token`` is updated to the returned token.

    Raises:
        UnexpectedCharacters: no terminal matches at the current position.
        ValueError: a callback for the matched terminal returned a non-Token.
        EOFError: the end of ``lex_state.text`` was reached.
    """
    line_ctr = lex_state.line_ctr
    while line_ctr.char_pos < len(lex_state.text):
        res = self.match(lex_state.text, line_ctr.char_pos)
        if not res:
            allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
            if not allowed:
                allowed = {"<END-OF-FILE>"}
            raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                       allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])

        value, type_ = res

        if type_ not in self.ignore_types:
            # Start position is captured before feeding; end position after.
            t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
            line_ctr.feed(value, type_ in self.newline_types)
            t.end_line = line_ctr.line
            t.end_column = line_ctr.column
            t.end_pos = line_ctr.char_pos
            if t.type in self.callback:
                t = self.callback[t.type](t)
                if not isinstance(t, Token):
                    raise ValueError("Callbacks must return a token (returned %r)" % t)
            lex_state.last_token = t
            return t
        else:
            if type_ in self.callback:
                t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                self.callback[type_](t2)
            line_ctr.feed(value, type_ in self.newline_types)

    # EOF
    raise EOFError(self)
class LexerState(object):
    """Mutable lexing position over one input text.

    Attributes are restricted by ``__slots__``:
      * ``text``       - the full input being lexed
      * ``line_ctr``   - the counter tracking the current position in ``text``
      * ``last_token`` - the most recently returned token, or None
    """
    __slots__ = 'text', 'line_ctr', 'last_token'

    def __init__(self, text, line_ctr, last_token=None):
        self.text = text
        self.line_ctr = line_ctr
        self.last_token = last_token

    def __copy__(self):
        """Return a new state that shares ``text`` and ``last_token`` but owns a
        shallow copy of ``line_ctr``, so advancing one state leaves the other alone.
        """
        # Imported here so the method works even if the module header does not
        # bring ``copy`` into scope.
        from copy import copy
        return type(self)(self.text, copy(self.line_ctr), self.last_token)
class ContextualLexer(Lexer): | class ContextualLexer(Lexer): | ||||
@@ -409,25 +414,29 @@ class ContextualLexer(Lexer): | |||||
assert trad_conf.tokens is terminals | assert trad_conf.tokens is terminals | ||||
self.root_lexer = TraditionalLexer(trad_conf) | self.root_lexer = TraditionalLexer(trad_conf) | ||||
def lex(self, stream, get_parser_state): | |||||
parser_state = get_parser_state() | |||||
l = _Lex(self.lexers[parser_state], parser_state) | |||||
def make_lexer_state(self, text): | |||||
return self.root_lexer.make_lexer_state(text) | |||||
def lex(self, lexer_state, parser_state):
    """Yield tokens, picking the lexer for the parser's current position each time.

    Before every token the lexer is looked up in ``self.lexers`` under
    ``parser_state.position``. ``EOFError`` from ``next_token`` ends the generator.

    Raises:
        UnexpectedToken: the input does not match in the current context but the
            root lexer can tokenize it.
        UnexpectedCharacters: the input matches in no context at all (the original,
            context-specific error is raised).
    """
    try:
        while True:
            lexer = self.lexers[parser_state.position]
            yield lexer.next_token(lexer_state)
    except EOFError:
        pass
    except UnexpectedCharacters as e:
        # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
        # This tests the input against the global context, to provide a nicer error.
        try:
            token = self.root_lexer.next_token(lexer_state)
        except UnexpectedCharacters:
            # The root lexer cannot match either: report the original error, whose
            # ``allowed`` set describes the current context rather than all terminals.
            raise e
        raise UnexpectedToken(token, e.allowed, state=parser_state.position)
class LexerThread(object):
    """A thread that ties a lexer instance and a lexer state, to be used by the parser.

    The state is created once, from ``lexer.make_lexer_state(text)``, and is passed
    to the lexer on every ``lex`` call.
    """

    def __init__(self, lexer, text):
        self.lexer = lexer
        self.state = lexer.make_lexer_state(text)

    def lex(self, parser_state):
        """Return ``self.lexer.lex(self.state, parser_state)``."""
        return self.lexer.lex(self.state, parser_state)
###} | ###} |
@@ -1,6 +1,6 @@ | |||||
from .utils import get_regexp_width, Serialize | from .utils import get_regexp_width, Serialize | ||||
from .parsers.grammar_analysis import GrammarAnalyzer | from .parsers.grammar_analysis import GrammarAnalyzer | ||||
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef | |||||
from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef | |||||
from .parsers import earley, xearley, cyk | from .parsers import earley, xearley, cyk | ||||
from .parsers.lalr_parser import LALR_Parser | from .parsers.lalr_parser import LALR_Parser | ||||
from .grammar import Rule | from .grammar import Rule | ||||
@@ -23,12 +23,22 @@ def get_frontend(parser, lexer): | |||||
elif lexer == 'contextual': | elif lexer == 'contextual': | ||||
return LALR_ContextualLexer | return LALR_ContextualLexer | ||||
elif issubclass(lexer, Lexer): | elif issubclass(lexer, Lexer): | ||||
class CustomLexerWrapper(Lexer): | |||||
def __init__(self, lexer_conf): | |||||
self.lexer = lexer(lexer_conf) | |||||
def lex(self, lexer_state, parser_state): | |||||
return self.lexer.lex(lexer_state.text) | |||||
class LALR_CustomLexerWrapper(LALR_CustomLexer): | class LALR_CustomLexerWrapper(LALR_CustomLexer): | ||||
def __init__(self, lexer_conf, parser_conf, options=None): | def __init__(self, lexer_conf, parser_conf, options=None): | ||||
super(LALR_CustomLexerWrapper, self).__init__( | super(LALR_CustomLexerWrapper, self).__init__( | ||||
lexer, lexer_conf, parser_conf, options=options) | lexer, lexer_conf, parser_conf, options=options) | ||||
def init_lexer(self): | def init_lexer(self): | ||||
self.lexer = lexer(self.lexer_conf) | |||||
future_interface = getattr(lexer, '__future_interface__', False) | |||||
if future_interface: | |||||
self.lexer = lexer(self.lexer_conf) | |||||
else: | |||||
self.lexer = CustomLexerWrapper(self.lexer_conf) | |||||
return LALR_CustomLexerWrapper | return LALR_CustomLexerWrapper | ||||
else: | else: | ||||
@@ -54,7 +64,7 @@ def get_frontend(parser, lexer): | |||||
class _ParserFrontend(Serialize): | class _ParserFrontend(Serialize): | ||||
def _parse(self, input, start, *args): | |||||
def _parse(self, start, input, *args): | |||||
if start is None: | if start is None: | ||||
start = self.start | start = self.start | ||||
if len(start) > 1: | if len(start) > 1: | ||||
@@ -71,6 +81,18 @@ def _get_lexer_callbacks(transformer, terminals): | |||||
result[terminal.name] = callback | result[terminal.name] = callback | ||||
return result | return result | ||||
class PostLexConnector(object):
    """Lexer adapter that pipes another lexer's token stream through a post-lexer.

    Exposes the same ``make_lexer_state`` / ``lex`` methods as the wrapped lexer.
    """

    def __init__(self, lexer, postlexer):
        self.lexer = lexer
        self.postlexer = postlexer

    def make_lexer_state(self, text):
        """Delegate state creation to the wrapped lexer."""
        return self.lexer.make_lexer_state(text)

    def lex(self, lexer_state, parser_state):
        """Return the wrapped lexer's tokens after ``postlexer.process``."""
        token_stream = self.lexer.lex(lexer_state, parser_state)
        return self.postlexer.process(token_stream)
class WithLexer(_ParserFrontend): | class WithLexer(_ParserFrontend): | ||||
lexer = None | lexer = None | ||||
@@ -106,13 +128,14 @@ class WithLexer(_ParserFrontend): | |||||
def _serialize(self, data, memo): | def _serialize(self, data, memo): | ||||
data['parser'] = data['parser'].serialize(memo) | data['parser'] = data['parser'].serialize(memo) | ||||
def lex(self, *args): | |||||
stream = self.lexer.lex(*args) | |||||
return self.postlex.process(stream) if self.postlex else stream | |||||
def make_lexer(self, text): | |||||
lexer = self.lexer | |||||
if self.postlex: | |||||
lexer = PostLexConnector(self.lexer, self.postlex) | |||||
return LexerThread(lexer, text) | |||||
def parse(self, text, start=None): | def parse(self, text, start=None): | ||||
token_stream = self.lex(text) | |||||
return self._parse(token_stream, start) | |||||
return self._parse(start, self.make_lexer(text)) | |||||
def init_traditional_lexer(self): | def init_traditional_lexer(self): | ||||
self.lexer = TraditionalLexer(self.lexer_conf) | self.lexer = TraditionalLexer(self.lexer_conf) | ||||
@@ -138,14 +161,6 @@ class LALR_ContextualLexer(LALR_WithLexer): | |||||
always_accept = self.postlex.always_accept if self.postlex else () | always_accept = self.postlex.always_accept if self.postlex else () | ||||
self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept) | self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept) | ||||
def parse(self, text, start=None): | |||||
parser_state = [None] | |||||
def set_parser_state(s): | |||||
parser_state[0] = s | |||||
token_stream = self.lex(text, lambda: parser_state[0]) | |||||
return self._parse(token_stream, start, set_parser_state) | |||||
###} | ###} | ||||
class LALR_CustomLexer(LALR_WithLexer): | class LALR_CustomLexer(LALR_WithLexer): | ||||
@@ -156,15 +171,6 @@ class LALR_CustomLexer(LALR_WithLexer): | |||||
WithLexer.__init__(self, lexer_conf, parser_conf, options) | WithLexer.__init__(self, lexer_conf, parser_conf, options) | ||||
def tokenize_text(text): | |||||
line = 1 | |||||
col_start_pos = 0 | |||||
for i, ch in enumerate(text): | |||||
if '\n' in ch: | |||||
line += ch.count('\n') | |||||
col_start_pos = i + ch.rindex('\n') | |||||
yield Token('CHAR', ch, line=line, column=i - col_start_pos) | |||||
class Earley(WithLexer): | class Earley(WithLexer): | ||||
def __init__(self, lexer_conf, parser_conf, options=None): | def __init__(self, lexer_conf, parser_conf, options=None): | ||||
WithLexer.__init__(self, lexer_conf, parser_conf, options) | WithLexer.__init__(self, lexer_conf, parser_conf, options) | ||||
@@ -175,6 +181,9 @@ class Earley(WithLexer): | |||||
tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None | tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None | ||||
self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class) | self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class) | ||||
def make_lexer(self, text): | |||||
return WithLexer.make_lexer(self, text).lex(None) | |||||
def match(self, term, token): | def match(self, term, token): | ||||
return term.name == token.type | return term.name == token.type | ||||
@@ -219,7 +228,7 @@ class XEarley(_ParserFrontend): | |||||
self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags) | self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags) | ||||
def parse(self, text, start): | def parse(self, text, start): | ||||
return self._parse(text, start) | |||||
return self._parse(start, text) | |||||
class XEarley_CompleteLex(XEarley): | class XEarley_CompleteLex(XEarley): | ||||
def __init__(self, *args, **kw): | def __init__(self, *args, **kw): | ||||
@@ -239,8 +248,8 @@ class CYK(WithLexer): | |||||
self.callbacks = parser_conf.callbacks | self.callbacks = parser_conf.callbacks | ||||
def parse(self, text, start): | def parse(self, text, start): | ||||
tokens = list(self.lex(text)) | |||||
parse = self._parse(tokens, start) | |||||
tokens = list(self.make_lexer(text).lex(None)) | |||||
parse = self._parse(start, tokens) | |||||
parse = self._transform(parse) | parse = self._transform(parse) | ||||
return parse | return parse | ||||
@@ -2,9 +2,9 @@ | |||||
""" | """ | ||||
# Author: Erez Shinan (2017) | # Author: Erez Shinan (2017) | ||||
# Email : erezshin@gmail.com | # Email : erezshin@gmail.com | ||||
from ..exceptions import UnexpectedToken | |||||
from copy import deepcopy, copy | |||||
from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken | |||||
from ..lexer import Token | from ..lexer import Token | ||||
from ..utils import Enumerator, Serialize | |||||
from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable | from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable | ||||
from .lalr_puppet import ParserPuppet | from .lalr_puppet import ParserPuppet | ||||
@@ -35,84 +35,123 @@ class LALR_Parser(object): | |||||
return self.parser.parse(*args) | return self.parser.parse(*args) | ||||
class _Parser: | |||||
def __init__(self, parse_table, callbacks, debug=False): | |||||
class ParseConf(object):
    """Read-only parse configuration for one start symbol.

    Caches, from ``parse_table``, the start state and end state registered for
    ``start`` as well as the table's ``states`` mapping, next to the rule callbacks.
    """
    __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'

    def __init__(self, parse_table, callbacks, start):
        self.parse_table = parse_table

        # Looked up once here so the parsing loop does not repeat the lookups.
        self.start_state = self.parse_table.start_states[start]
        self.end_state = self.parse_table.end_states[start]
        self.states = self.parse_table.states

        self.callbacks = callbacks
        self.start = start
def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None): | |||||
token = None | |||||
stream = iter(seq) | |||||
states = self.parse_table.states | |||||
start_state = self.parse_table.start_states[start] | |||||
end_state = self.parse_table.end_states[start] | |||||
state_stack = state_stack or [start_state] | |||||
value_stack = value_stack or [] | |||||
class ParserState(object):
    """State of one LALR parse: the configuration, the lexer and the two stacks.

    ``state_stack`` defaults to ``[parse_conf.start_state]`` and ``value_stack``
    to an empty list when not given (or given empty).
    """
    __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'

    def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):
        self.parse_conf = parse_conf
        self.lexer = lexer
        self.state_stack = state_stack or [self.parse_conf.start_state]
        self.value_stack = value_stack or []

    @property
    def position(self):
        """The state currently on top of the state stack."""
        return self.state_stack[-1]

    def __copy__(self):
        """Return a state with its own stacks; ``parse_conf`` and ``lexer`` are shared.

        The state stack is shallow-copied and the value stack deep-copied.
        """
        return type(self)(
            self.parse_conf,
            self.lexer,  # XXX copy
            copy(self.state_stack),
            deepcopy(self.value_stack),
        )

    def copy(self):
        return copy(self)

    def feed_token(self, token, is_end=False):
        """Advance the parser with ``token``.

        Reduces as many times as the table requires, then shifts ``token`` once and
        returns the state shifted to. With ``is_end=True`` no shift is allowed;
        reductions continue until the end state is on top, and the final value is
        returned.

        Raises:
            UnexpectedToken: the current state has no entry for ``token.type``.
        """
        state_stack = self.state_stack
        value_stack = self.value_stack
        states = self.parse_conf.states
        end_state = self.parse_conf.end_state
        callbacks = self.parse_conf.callbacks

        while True:
            state = state_stack[-1]
            try:
                action, arg = states[state][token.type]
            except KeyError:
                expected = {s for s in states[state].keys() if s.isupper()}
                raise UnexpectedToken(token, expected, state=state, puppet=None)

            assert arg != end_state

            if action is Shift:
                # shift once and return
                assert not is_end
                state_stack.append(arg)
                value_stack.append(token)
                return arg
            else:
                # reduce+shift as many times as necessary
                rule = arg
                size = len(rule.expansion)
                if size:
                    s = value_stack[-size:]
                    del state_stack[-size:]
                    del value_stack[-size:]
                else:
                    s = []

                value = callbacks[rule](s)

                _action, new_state = states[state_stack[-1]][rule.origin.name]
                assert _action is Shift
                state_stack.append(new_state)
                value_stack.append(value)

                if is_end and state_stack[-1] == end_state:
                    return value_stack[-1]
value = self.callbacks[rule](s) | |||||
class _Parser:
    """LALR parser driver: builds a ``ParserState`` and feeds it the lexer's tokens."""

    def __init__(self, parse_table, callbacks, debug=False):
        self.parse_table = parse_table
        self.callbacks = callbacks
        self.debug = debug

    def parse(self, lexer, start, value_stack=None, state_stack=None):
        """Parse the tokens produced by ``lexer`` for the start symbol ``start``.

        ``value_stack`` / ``state_stack`` optionally seed the parser stacks.
        """
        parse_conf = ParseConf(self.parse_table, self.callbacks, start)
        parser_state = ParserState(parse_conf, lexer, state_stack, value_stack)
        return self.parse_from_state(parser_state)

    def parse_from_state(self, state):
        """Run (or resume) the parse described by ``state`` and return its result.

        Any ``UnexpectedInput`` is re-raised with a ``puppet`` attached when
        ``ParserPuppet`` is available. With ``debug`` set, other exceptions dump the
        state stack before propagating.
        """
        # Main LALR-parser loop
        try:
            token = None
            for token in state.lexer.lex(state):
                state.feed_token(token)

            # The end marker takes its position from the last token, if there was one.
            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
            return state.feed_token(token, True)
        except UnexpectedInput as e:
            try:
                e.puppet = ParserPuppet(self, state, state.lexer)
            except NameError:  # For standalone parser
                pass
            # Explicit ``raise e`` (not a bare ``raise``): on Python 2 a bare raise
            # after the inner handler could re-raise the NameError instead.
            raise e
        except Exception:
            if self.debug:
                print("")
                print("STATE STACK DUMP")
                print("----------------")
                for i, s in enumerate(state.state_stack):
                    print('%d)' % i , s)
                print("")

            raise
token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) | |||||
while True: | |||||
_action, arg = get_action(token) | |||||
assert(_action is Reduce) | |||||
reduce(arg) | |||||
if state_stack[-1] == end_state: | |||||
return value_stack[-1] | |||||
###} | ###} | ||||
@@ -1,10 +1,10 @@ | |||||
# This module provides a LALR puppet, which is used for debugging and error handling | # This module provides a LALR puppet, which is used for debugging and error handling | ||||
from copy import deepcopy | |||||
from copy import copy | |||||
from .lalr_analysis import Shift, Reduce | from .lalr_analysis import Shift, Reduce | ||||
from .. import Token | from .. import Token | ||||
from ..exceptions import ParseError | |||||
from ..exceptions import UnexpectedToken | |||||
class ParserPuppet(object): | class ParserPuppet(object): | ||||
@@ -12,96 +12,45 @@ class ParserPuppet(object): | |||||
For a simpler, more streamlined interface, see the ``on_error`` argument to ``Lark.parse()``. | For a simpler, more streamlined interface, see the ``on_error`` argument to ``Lark.parse()``. | ||||
""" | """ | ||||
def __init__(self, parser, state_stack, value_stack, start, stream, set_state): | |||||
def __init__(self, parser, parser_state, lexer_state): | |||||
self.parser = parser | self.parser = parser | ||||
self._state_stack = state_stack | |||||
self._value_stack = value_stack | |||||
self._start = start | |||||
self._stream = stream | |||||
self._set_state = set_state | |||||
self.result = None | |||||
self.parser_state = parser_state | |||||
self.lexer_state = lexer_state | |||||
def feed_token(self, token): | def feed_token(self, token): | ||||
"""Feed the parser with a token, and advance it to the next state, as if it received it from the lexer. | """Feed the parser with a token, and advance it to the next state, as if it received it from the lexer. | ||||
Note that ``token`` has to be an instance of ``Token``. | Note that ``token`` has to be an instance of ``Token``. | ||||
""" | """ | ||||
end_state = self.parser.parse_table.end_states[self._start] | |||||
state_stack = self._state_stack | |||||
value_stack = self._value_stack | |||||
state = state_stack[-1] | |||||
action, arg = self.parser.parse_table.states[state][token.type] | |||||
if arg == end_state: | |||||
raise ParseError(arg) | |||||
while action is Reduce: | |||||
rule = arg | |||||
size = len(rule.expansion) | |||||
if size: | |||||
s = value_stack[-size:] | |||||
del state_stack[-size:] | |||||
del value_stack[-size:] | |||||
else: | |||||
s = [] | |||||
value = self.parser.callbacks[rule](s) | |||||
_action, new_state = self.parser.parse_table.states[state_stack[-1]][rule.origin.name] | |||||
assert _action is Shift | |||||
state_stack.append(new_state) | |||||
value_stack.append(value) | |||||
if state_stack[-1] == end_state: | |||||
self.result = value_stack[-1] | |||||
return self.result | |||||
state = state_stack[-1] | |||||
try: | |||||
action, arg = self.parser.parse_table.states[state][token.type] | |||||
except KeyError as e: | |||||
raise ParseError(e) | |||||
assert arg != end_state | |||||
assert action is Shift | |||||
state_stack.append(arg) | |||||
value_stack.append(token) | |||||
def copy(self): | |||||
return self.parser_state.feed_token(token) | |||||
def __copy__(self): | |||||
"""Create a new puppet with a separate state. | """Create a new puppet with a separate state. | ||||
Calls to feed_token() won't affect the old puppet, and vice-versa. | Calls to feed_token() won't affect the old puppet, and vice-versa. | ||||
""" | """ | ||||
return type(self)( | return type(self)( | ||||
self.parser, | self.parser, | ||||
list(self._state_stack), | |||||
deepcopy(self._value_stack), | |||||
self._start, | |||||
self._stream, | |||||
self._set_state, | |||||
copy(self.parser_state), | |||||
copy(self.lexer_state), | |||||
) | ) | ||||
def __eq__(self, other): | def __eq__(self, other): | ||||
if not isinstance(other, ParserPuppet): | if not isinstance(other, ParserPuppet): | ||||
return False | return False | ||||
return ( | |||||
self._state_stack == other._state_stack and | |||||
self._value_stack == other._value_stack and | |||||
self._stream == other._stream and | |||||
self._start == other._start | |||||
) | |||||
return self.parser_state == other.parser_state and self.lexer_state == other.lexer_state | |||||
def __hash__(self): | |||||
return hash((tuple(self._state_stack), self._start)) | |||||
# TODO Provide with an immutable puppet instance | |||||
# def __hash__(self): | |||||
# return hash((self.parser_state, self.lexer_state)) | |||||
def pretty(self): | def pretty(self): | ||||
"""Print the output of ``choices()`` in a way that's easier to read.""" | """Print the output of ``choices()`` in a way that's easier to read.""" | ||||
out = ["Puppet choices:"] | out = ["Puppet choices:"] | ||||
for k, v in self.choices().items(): | for k, v in self.choices().items(): | ||||
out.append('\t- %s -> %s' % (k, v)) | out.append('\t- %s -> %s' % (k, v)) | ||||
out.append('stack size: %s' % len(self._state_stack)) | |||||
out.append('stack size: %s' % len(self.parser_state.state_stack)) | |||||
return '\n'.join(out) | return '\n'.join(out) | ||||
def choices(self):
    """Return the parse-table row for the parser's current position.

    The row maps each symbol name that has an entry in the current state to its
    ``(action, arg)`` table entry.

    Updated by ``feed_token()``.
    """
    # The table hangs off the parse configuration; the parser state itself only
    # carries parse_conf, lexer and the two stacks.
    return self.parser_state.parse_conf.parse_table.states[self.parser_state.position]
def accepts(self): | def accepts(self): | ||||
accepts = set() | accepts = set() | ||||
for t in self.choices(): | for t in self.choices(): | ||||
if t.isupper(): # is terminal? | if t.isupper(): # is terminal? | ||||
new_puppet = self.copy() | |||||
new_puppet = copy(self) | |||||
try: | try: | ||||
new_puppet.feed_token(Token(t, '')) | new_puppet.feed_token(Token(t, '')) | ||||
except ParseError: | |||||
except UnexpectedToken: | |||||
pass | pass | ||||
else: | else: | ||||
accepts.add(t) | accepts.add(t) | ||||
@@ -128,7 +77,4 @@ class ParserPuppet(object): | |||||
def resume_parse(self): | def resume_parse(self): | ||||
"""Resume parsing from the current puppet state.""" | """Resume parsing from the current puppet state.""" | ||||
return self.parser.parse( | |||||
self._stream, self._start, self._set_state, | |||||
self._value_stack, self._state_stack | |||||
) | |||||
return self.parser.parse_from_state(self.parser_state) |
@@ -2217,6 +2217,42 @@ def _make_parser_test(LEXER, PARSER): | |||||
""", regex=True) | """, regex=True) | ||||
self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') | self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') | ||||
@unittest.skipIf(PARSER!='lalr', "Puppet error handling only works with LALR for now") | |||||
def test_error_with_puppet(self): | |||||
def ignore_errors(e): | |||||
if isinstance(e, UnexpectedCharacters): | |||||
# Skip bad character | |||||
return True | |||||
# Must be UnexpectedToken | |||||
if e.token.type == 'COMMA': | |||||
# Skip comma | |||||
return True | |||||
elif e.token.type == 'SIGNED_NUMBER': | |||||
# Try to feed a comma and retry the number | |||||
e.puppet.feed_token(Token('COMMA', ',')) | |||||
e.puppet.feed_token(e.token) | |||||
return True | |||||
# Unhandled error. Will stop parse and raise exception | |||||
return False | |||||
g = _Lark(r''' | |||||
start: "[" num ("," num)* "]" | |||||
?num: SIGNED_NUMBER | |||||
%import common.SIGNED_NUMBER | |||||
%ignore " " | |||||
''') | |||||
s = "[0 1, 2,, 3,,, 4, 5 6 ]" | |||||
tree = g.parse(s, on_error=ignore_errors) | |||||
res = [int(x) for x in tree.children] | |||||
assert res == list(range(7)) | |||||
s = "[0 1, 2,@, 3,,, 4, 5 6 ]$" | |||||
tree = g.parse(s, on_error=ignore_errors) | |||||
_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() | _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() | ||||
_TestParser.__name__ = _NAME | _TestParser.__name__ = _NAME | ||||
_TestParser.__qualname__ = "tests.test_parser." + _NAME | _TestParser.__qualname__ = "tests.test_parser." + _NAME | ||||