diff --git a/lark/lark.py b/lark/lark.py
index 05ad9b1..bc34eb4 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -1,4 +1,5 @@
 from __future__ import absolute_import
+from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
 
 import sys, os, pickle, hashlib
 from io import open
@@ -9,7 +10,7 @@
 from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf
-from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken
+from .lexer import Lexer, TraditionalLexer, TerminalDef
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend, _get_lexer_callbacks
 from .grammar import Rule
@@ -462,20 +463,32 @@ class Lark(Serialize):
 
         try:
             return self.parser.parse(text, start=start)
-        except UnexpectedToken as e:
+        except UnexpectedInput as e:
             if on_error is None:
                 raise
 
             while True:
+                if isinstance(e, UnexpectedCharacters):
+                    s = e.puppet.lexer_state.state
+                    p = s.line_ctr.char_pos
+
                 if not on_error(e):
                     raise e
+
+                if isinstance(e, UnexpectedCharacters):
+                    # If the user didn't change the character position, skip the bad character ourselves
+                    if p == s.line_ctr.char_pos:
+                        s.line_ctr.feed(s.text[p:p+1])
+
                 try:
                     return e.puppet.resume_parse()
                 except UnexpectedToken as e2:
-                    if e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
+                    if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
                         # Prevent infinite loop
                         raise e2
                     e = e2
+                except UnexpectedCharacters as e2:
+                    e = e2
 
 
 ###}
diff --git a/lark/lexer.py b/lark/lexer.py
index 8fc9e4b..b080921 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -2,7 +2,7 @@
 
 import re
 
-from .utils import Str, classify, get_regexp_width, Py36, Serialize
+from .utils import Str, classify, get_regexp_width, Py36, Serialize, suppress
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
 
 ###{standalone
@@ -157,6 +157,8 @@ class Token(Str):
 
 
 class LineCounter:
+    __slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char'
+
     def __init__(self, newline_char):
         self.newline_char = newline_char
         self.char_pos = 0
@@ -167,7 +169,7 @@
     def feed(self, token, test_newline=True):
         """Consume a token and calculate the new line & column.
 
-        As an optional optimization, set test_newline=False is token doesn't contain a newline.
+        As an optional optimization, set test_newline=False if token doesn't contain a newline.
""" if test_newline: newlines = token.count(self.newline_char) @@ -178,49 +180,6 @@ class LineCounter: self.char_pos += len(token) self.column = self.char_pos - self.line_start_pos + 1 -class _Lex: - "Built to serve both Lexer and ContextualLexer" - def __init__(self, lexer, state=None): - self.lexer = lexer - self.state = state - - def lex(self, stream, newline_types, ignore_types): - newline_types = frozenset(newline_types) - ignore_types = frozenset(ignore_types) - line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n') - last_token = None - - while line_ctr.char_pos < len(stream): - lexer = self.lexer - res = lexer.match(stream, line_ctr.char_pos) - if not res: - allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types - if not allowed: - allowed = {""} - raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token]) - - value, type_ = res - - if type_ not in ignore_types: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - line_ctr.feed(value, type_ in newline_types) - t.end_line = line_ctr.line - t.end_column = line_ctr.column - t.end_pos = line_ctr.char_pos - if t.type in lexer.callback: - t = lexer.callback[t.type](t) - if not isinstance(t, Token): - raise ValueError("Callbacks must return a token (returned %r)" % t) - yield t - last_token = t - else: - if type_ in lexer.callback: - t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - lexer.callback[type_](t2) - line_ctr.feed(value, type_ in newline_types) - - - class UnlessCallback: def __init__(self, mres): @@ -286,7 +245,6 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes) except AssertionError: # Yes, this is what Python provides us.. 
             return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)
 
-        # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres
@@ -308,10 +266,14 @@
 class Lexer(object):
     """Lexer interface
 
     Method Signatures:
-        lex(self, stream) -> Iterator[Token]
+        lex(self, text) -> Iterator[Token]
     """
     lex = NotImplemented
 
+    def make_lexer_state(self, text):
+        line_ctr = LineCounter(b'\n' if isinstance(text, bytes) else '\n')
+        return LexerState(text, line_ctr)
+
 
 class TraditionalLexer(Lexer):
 
@@ -335,8 +297,8 @@ class TraditionalLexer(Lexer):
         assert set(conf.ignore) <= {t.name for t in terminals}
 
         # Init
-        self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
-        self.ignore_types = list(conf.ignore)
+        self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
+        self.ignore_types = frozenset(conf.ignore)
 
         terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
         self.terminals = terminals
@@ -345,7 +307,6 @@ class TraditionalLexer(Lexer):
         self.use_bytes = conf.use_bytes
 
         self._mres = None
-        # self.build(g_regex_flags)
 
     def _build(self):
         terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
@@ -366,17 +327,61 @@ class TraditionalLexer(Lexer):
             self._build()
         return self._mres
 
-    def match(self, stream, pos):
+    def match(self, text, pos):
        for mre, type_from_index in self.mres:
-            m = mre.match(stream, pos)
+            m = mre.match(text, pos)
            if m:
                return m.group(0), type_from_index[m.lastindex]
 
-    def lex(self, stream):
-        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
+    def lex(self, state, parser_state):
+        with suppress(EOFError):
+            while True:
+                yield self.next_token(state)
+
+    def next_token(self, lex_state):
+        line_ctr = lex_state.line_ctr
+        while line_ctr.char_pos < len(lex_state.text):
+            res = self.match(lex_state.text, line_ctr.char_pos)
+            if not res:
+                allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
+                if not allowed:
+                    allowed = {""}
+                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
+                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
+
+            value, type_ = res
+
+            if type_ not in self.ignore_types:
+                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                line_ctr.feed(value, type_ in self.newline_types)
+                t.end_line = line_ctr.line
+                t.end_column = line_ctr.column
+                t.end_pos = line_ctr.char_pos
+                if t.type in self.callback:
+                    t = self.callback[t.type](t)
+                    if not isinstance(t, Token):
+                        raise ValueError("Callbacks must return a token (returned %r)" % t)
+                lex_state.last_token = t
+                return t
+            else:
+                if type_ in self.callback:
+                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                    self.callback[type_](t2)
+                line_ctr.feed(value, type_ in self.newline_types)
+
+        # EOF
+        raise EOFError(self)
+
+
+class LexerState:
+    __slots__ = 'text', 'line_ctr', 'last_token'
+
+    def __init__(self, text, line_ctr, last_token=None):
+        self.text = text
+        self.line_ctr = line_ctr
+        self.last_token = last_token
+
+    def __copy__(self):
+        return type(self)(self.text, copy(self.line_ctr), self.last_token)
 
 
 class ContextualLexer(Lexer):
 
@@ -409,25 +414,29 @@ class ContextualLexer(Lexer):
         assert trad_conf.tokens is terminals
         self.root_lexer = TraditionalLexer(trad_conf)
-    def lex(self, stream, get_parser_state):
-        parser_state = get_parser_state()
-        l = _Lex(self.lexers[parser_state], parser_state)
+    def make_lexer_state(self, text):
+        return self.root_lexer.make_lexer_state(text)
+
+    def lex(self, lexer_state, parser_state):
         try:
-            for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
-                yield x
-                parser_state = get_parser_state()
-                l.lexer = self.lexers[parser_state]
-                l.state = parser_state # For debug only, no need to worry about multithreading
+            while True:
+                lexer = self.lexers[parser_state.position]
+                yield lexer.next_token(lexer_state)
+        except EOFError:
+            pass
         except UnexpectedCharacters as e:
-            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
-            # but not in the current context.
+            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
             # This tests the input against the global context, to provide a nicer error.
-            root_match = self.root_lexer.match(stream, e.pos_in_stream)
-            if not root_match:
-                raise
+            token = self.root_lexer.next_token(lexer_state)
+            raise UnexpectedToken(token, e.allowed, state=parser_state.position)
 
-            value, type_ = root_match
-            t = Token(type_, value, e.pos_in_stream, e.line, e.column)
-            raise UnexpectedToken(t, e.allowed, state=e.state)
+
+class LexerThread:
+    "A thread that ties a lexer instance and a lexer state, to be used by the parser"
+
+    def __init__(self, lexer, text):
+        self.lexer = lexer
+        self.state = lexer.make_lexer_state(text)
+
+    def lex(self, parser_state):
+        return self.lexer.lex(self.state, parser_state)
 
 ###}
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 926603c..202382b 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,6 +1,6 @@
 from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
+from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
@@ -23,12 +23,22 @@ def get_frontend(parser, lexer):
         elif lexer == 'contextual':
             return LALR_ContextualLexer
         elif issubclass(lexer, Lexer):
+            class CustomLexerWrapper(Lexer):
+                def __init__(self, lexer_conf):
+                    self.lexer = lexer(lexer_conf)
+                def lex(self, lexer_state, parser_state):
+                    return self.lexer.lex(lexer_state.text)
+
             class LALR_CustomLexerWrapper(LALR_CustomLexer):
                 def __init__(self, lexer_conf, parser_conf, options=None):
                     super(LALR_CustomLexerWrapper, self).__init__(
                         lexer, lexer_conf, parser_conf, options=options)
                 def init_lexer(self):
-                    self.lexer = lexer(self.lexer_conf)
+                    future_interface = getattr(lexer, '__future_interface__', False)
+                    if future_interface:
+                        self.lexer = lexer(self.lexer_conf)
+                    else:
+                        self.lexer = CustomLexerWrapper(self.lexer_conf)
             return LALR_CustomLexerWrapper
         else:
@@ -54,7 +64,7 @@ def get_frontend(parser, lexer):
 
 class _ParserFrontend(Serialize):
-    def _parse(self, input, start, *args):
+    def _parse(self, start, input, *args):
         if start is None:
             start = self.start
             if len(start) > 1:
@@ -71,6 +81,18 @@ def _get_lexer_callbacks(transformer, terminals):
             result[terminal.name] = callback
     return result
 
+class PostLexConnector:
+    def __init__(self, lexer, postlexer):
+        self.lexer = lexer
+        self.postlexer = postlexer
+
+    def make_lexer_state(self, text):
+        return self.lexer.make_lexer_state(text)
+
+    def lex(self, lexer_state, parser_state):
+        i = self.lexer.lex(lexer_state, parser_state)
+        return self.postlexer.process(i)
+
 
 class WithLexer(_ParserFrontend):
     lexer = None
@@ -106,13 +128,14 @@ class WithLexer(_ParserFrontend):
     def _serialize(self, data, memo):
         data['parser'] = data['parser'].serialize(memo)
 
-    def lex(self, *args):
-        stream = self.lexer.lex(*args)
-        return self.postlex.process(stream) if self.postlex else stream
+    def make_lexer(self, text):
+        lexer = self.lexer
+        if self.postlex:
+            lexer = PostLexConnector(self.lexer, self.postlex)
+        return LexerThread(lexer, text)
 
     def parse(self, text, start=None):
-        token_stream = self.lex(text)
-        return self._parse(token_stream, start)
+        return self._parse(start, self.make_lexer(text))
 
     def init_traditional_lexer(self):
         self.lexer = TraditionalLexer(self.lexer_conf)
@@ -138,14 +161,6 @@ class LALR_ContextualLexer(LALR_WithLexer):
         always_accept = self.postlex.always_accept if self.postlex else ()
         self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)
 
-
-    def parse(self, text, start=None):
-        parser_state = [None]
-        def set_parser_state(s):
-            parser_state[0] = s
-
-        token_stream = self.lex(text, lambda: parser_state[0])
-        return self._parse(token_stream, start, set_parser_state)
 ###}
 
 class LALR_CustomLexer(LALR_WithLexer):
@@ -156,15 +171,6 @@ class LALR_CustomLexer(LALR_WithLexer):
 
         WithLexer.__init__(self, lexer_conf, parser_conf, options)
 
-
-def tokenize_text(text):
-    line = 1
-    col_start_pos = 0
-    for i, ch in enumerate(text):
-        if '\n' in ch:
-            line += ch.count('\n')
-            col_start_pos = i + ch.rindex('\n')
-        yield Token('CHAR', ch, line=line, column=i - col_start_pos)
-
 class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         WithLexer.__init__(self, lexer_conf, parser_conf, options)
@@ -175,6 +181,9 @@ class Earley(WithLexer):
         tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
         self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)
 
+    def make_lexer(self, text):
+        return WithLexer.make_lexer(self, text).lex(None)
+
     def match(self, term, token):
         return term.name == token.type
 
@@ -219,7 +228,7 @@ class XEarley(_ParserFrontend):
                 self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
 
     def parse(self, text, start):
-        return self._parse(text, start)
+        return self._parse(start, text)
 
 class XEarley_CompleteLex(XEarley):
     def __init__(self, *args, **kw):
@@ -239,8 +248,8 @@ class CYK(WithLexer):
         self.callbacks = parser_conf.callbacks
 
     def parse(self, text, start):
-        tokens = list(self.lex(text))
-        parse = self._parse(tokens, start)
+        tokens = list(self.make_lexer(text).lex(None))
+        parse = self._parse(start, tokens)
         parse = self._transform(parse)
         return parse
 
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 1c98f3a..4fa911f 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -2,9 +2,9 @@
 """
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
-from ..exceptions import UnexpectedToken
+from copy import deepcopy, copy
+from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
 from ..lexer import Token
-from ..utils import Enumerator, Serialize
 
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 from .lalr_puppet import ParserPuppet
@@ -35,84 +35,123 @@ class LALR_Parser(object):
         return self.parser.parse(*args)
 
 
-class _Parser:
-    def __init__(self, parse_table, callbacks, debug=False):
+class ParseConf:
+    __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'
+
+    def __init__(self, parse_table, callbacks, start):
         self.parse_table = parse_table
+
+        self.start_state = self.parse_table.start_states[start]
+        self.end_state = self.parse_table.end_states[start]
+        self.states = self.parse_table.states
+
         self.callbacks = callbacks
-        self.debug = debug
+        self.start = start
 
-    def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None):
-        token = None
-        stream = iter(seq)
-        states = self.parse_table.states
-        start_state = self.parse_table.start_states[start]
-        end_state = self.parse_table.end_states[start]
-        state_stack = state_stack or [start_state]
-        value_stack = value_stack or []
+class ParserState:
+    __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'
 
-        if set_state: set_state(start_state)
+    def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):
+        self.parse_conf = parse_conf
+        self.lexer = lexer
+        self.state_stack = state_stack or [self.parse_conf.start_state]
+        self.value_stack = value_stack or []
 
-        def get_action(token):
+    @property
+    def position(self):
+        return self.state_stack[-1]
+
+    def __copy__(self):
+        return type(self)(
+            self.parse_conf,
+            self.lexer, # XXX copy
+            copy(self.state_stack),
+            deepcopy(self.value_stack),
+        )
+
+    def copy(self):
+        return copy(self)
+
+    def feed_token(self, token, is_end=False):
+        state_stack = self.state_stack
+        value_stack = self.value_stack
+        states = self.parse_conf.states
+        end_state = self.parse_conf.end_state
+        callbacks = self.parse_conf.callbacks
+
+        while True:
             state = state_stack[-1]
             try:
-                return states[state][token.type]
+                action, arg = states[state][token.type]
             except KeyError:
                 expected = {s for s in states[state].keys() if s.isupper()}
-                try:
-                    puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
-                except NameError: # For standalone parser
-                    puppet = None
-                raise UnexpectedToken(token, expected, state=state, puppet=puppet)
-
-        def reduce(rule):
-            size = len(rule.expansion)
-            if size:
-                s = value_stack[-size:]
-                del state_stack[-size:]
-                del value_stack[-size:]
+                raise UnexpectedToken(token, expected, state=state, puppet=None)
+
+            assert arg != end_state
+
+            if action is Shift:
+                # shift once and return
+                assert not is_end
+                state_stack.append(arg)
+                value_stack.append(token)
+                return arg
             else:
-                s = []
+                # reduce+shift as many times as necessary
+                rule = arg
+                size = len(rule.expansion)
+                if size:
+                    s = value_stack[-size:]
+                    del state_stack[-size:]
+                    del value_stack[-size:]
+                else:
+                    s = []
+
+                value = callbacks[rule](s)
+
+                _action, new_state = states[state_stack[-1]][rule.origin.name]
+                assert _action is Shift
+                state_stack.append(new_state)
+                value_stack.append(value)
+
+                if is_end and state_stack[-1] == end_state:
+                    return value_stack[-1]
 
-            value = self.callbacks[rule](s)
+class _Parser:
+    def __init__(self, parse_table, callbacks, debug=False):
+        self.parse_table = parse_table
+        self.callbacks = callbacks
+        self.debug = debug
 
-            _action, new_state = states[state_stack[-1]][rule.origin.name]
-            assert _action is Shift
-            state_stack.append(new_state)
-            value_stack.append(value)
+    def parse(self, lexer, start, value_stack=None, state_stack=None):
+        parse_conf = ParseConf(self.parse_table, self.callbacks, start)
+        parser_state = ParserState(parse_conf, lexer, state_stack, value_stack)
+        return self.parse_from_state(parser_state)
+
+    def parse_from_state(self, state):
         # Main LALR-parser loop
         try:
-            for token in stream:
-                while True:
-                    action, arg = get_action(token)
-                    assert arg != end_state
-
-                    if action is Shift:
-                        state_stack.append(arg)
-                        value_stack.append(token)
-                        if set_state: set_state(arg)
-                        break # next token
-                    else:
-                        reduce(arg)
+            token = None
+            for token in state.lexer.lex(state):
+                state.feed_token(token)
+
+            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+            return state.feed_token(token, True)
+        except UnexpectedInput as e:
+            try:
+                e.puppet = ParserPuppet(self, state, state.lexer)
+            except NameError:
+                pass
+            raise e
         except Exception as e:
             if self.debug:
                 print("")
                 print("STATE STACK DUMP")
                 print("----------------")
-                for i, s in enumerate(state_stack):
+                for i, s in enumerate(state.state_stack):
                     print('%d)' % i , s)
                 print("")
 
             raise
-
-        token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
-        while True:
-            _action, arg = get_action(token)
-            assert(_action is Reduce)
-            reduce(arg)
-
-            if state_stack[-1] == end_state:
-                return value_stack[-1]
-
 ###}
diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py
index 37a1e9d..95ee3a3 100644
--- a/lark/parsers/lalr_puppet.py
+++ b/lark/parsers/lalr_puppet.py
@@ -1,10 +1,10 @@
 # This module provide a LALR puppet, which is used to debugging and error handling
 
-from copy import deepcopy
+from copy import copy
 
 from .lalr_analysis import Shift, Reduce
 from .. import Token
-from ..exceptions import ParseError
+from ..exceptions import UnexpectedToken
 
 
 class ParserPuppet(object):
@@ -12,96 +12,45 @@ class ParserPuppet(object):
 
     For a simpler, more streamlined interface, see the ``on_error`` argument to ``Lark.parse()``.
     """
-    def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
+    def __init__(self, parser, parser_state, lexer_state):
         self.parser = parser
-        self._state_stack = state_stack
-        self._value_stack = value_stack
-        self._start = start
-        self._stream = stream
-        self._set_state = set_state
-
-        self.result = None
+        self.parser_state = parser_state
+        self.lexer_state = lexer_state
 
     def feed_token(self, token):
         """Feed the parser with a token, and advance it to the next state, as if it received it from the lexer.
 
        Note that ``token`` has to be an instance of ``Token``.
        """
-        end_state = self.parser.parse_table.end_states[self._start]
-        state_stack = self._state_stack
-        value_stack = self._value_stack
-
-        state = state_stack[-1]
-        action, arg = self.parser.parse_table.states[state][token.type]
-        if arg == end_state:
-            raise ParseError(arg)
-
-        while action is Reduce:
-            rule = arg
-            size = len(rule.expansion)
-            if size:
-                s = value_stack[-size:]
-                del state_stack[-size:]
-                del value_stack[-size:]
-            else:
-                s = []
-
-            value = self.parser.callbacks[rule](s)
-
-            _action, new_state = self.parser.parse_table.states[state_stack[-1]][rule.origin.name]
-            assert _action is Shift
-            state_stack.append(new_state)
-            value_stack.append(value)
-
-            if state_stack[-1] == end_state:
-                self.result = value_stack[-1]
-                return self.result
-
-            state = state_stack[-1]
-            try:
-                action, arg = self.parser.parse_table.states[state][token.type]
-            except KeyError as e:
-                raise ParseError(e)
-
-            assert arg != end_state
-
-        assert action is Shift
-        state_stack.append(arg)
-        value_stack.append(token)
-
-    def copy(self):
+        return self.parser_state.feed_token(token)
+
+    def __copy__(self):
         """Create a new puppet with a separate state.
 
         Calls to feed_token() won't affect the old puppet, and vice-versa.
""" return type(self)( self.parser, - list(self._state_stack), - deepcopy(self._value_stack), - self._start, - self._stream, - self._set_state, + copy(self.parser_state), + copy(self.lexer_state), ) def __eq__(self, other): if not isinstance(other, ParserPuppet): return False - return ( - self._state_stack == other._state_stack and - self._value_stack == other._value_stack and - self._stream == other._stream and - self._start == other._start - ) + return self.parser_state == other.parser_state and self.lexer_state == other.lexer_state - def __hash__(self): - return hash((tuple(self._state_stack), self._start)) + # TODO Provide with an immutable puppet instance + # def __hash__(self): + # return hash((self.parser_state, self.lexer_state)) def pretty(self): """Print the output of ``choices()`` in a way that's easier to read.""" out = ["Puppet choices:"] for k, v in self.choices().items(): out.append('\t- %s -> %s' % (k, v)) - out.append('stack size: %s' % len(self._state_stack)) + out.append('stack size: %s' % len(self.parser_state.state_stack)) return '\n'.join(out) def choices(self): @@ -111,16 +60,16 @@ class ParserPuppet(object): Updated by ``feed_token()``. """ - return self.parser.parse_table.states[self._state_stack[-1]] + return self.parser_state.parse_table.states[self.parser_state.position] def accepts(self): accepts = set() for t in self.choices(): if t.isupper(): # is terminal? - new_puppet = self.copy() + new_puppet = copy(self) try: new_puppet.feed_token(Token(t, '')) - except ParseError: + except UnexpectedToken: pass else: accepts.add(t) @@ -128,7 +77,4 @@ class ParserPuppet(object): def resume_parse(self): """Resume parsing from the current puppet state.""" - return self.parser.parse( - self._stream, self._start, self._set_state, - self._value_stack, self._state_stack - ) + return self.parser.parse_from_state(self.parser_state) diff --git a/tests/test_parser.py b/tests/test_parser.py index 49e661e..38399cf 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2217,6 +2217,42 @@ def _make_parser_test(LEXER, PARSER): """, regex=True) self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') + + @unittest.skipIf(PARSER!='lalr', "Puppet error handling only works with LALR for now") + def test_error_with_puppet(self): + def ignore_errors(e): + if isinstance(e, UnexpectedCharacters): + # Skip bad character + return True + + # Must be UnexpectedToken + if e.token.type == 'COMMA': + # Skip comma + return True + elif e.token.type == 'SIGNED_NUMBER': + # Try to feed a comma and retry the number + e.puppet.feed_token(Token('COMMA', ',')) + e.puppet.feed_token(e.token) + return True + + # Unhandled error. Will stop parse and raise exception + return False + + g = _Lark(r''' + start: "[" num ("," num)* "]" + ?num: SIGNED_NUMBER + %import common.SIGNED_NUMBER + %ignore " " + ''') + s = "[0 1, 2,, 3,,, 4, 5 6 ]" + tree = g.parse(s, on_error=ignore_errors) + res = [int(x) for x in tree.children] + assert res == list(range(7)) + + s = "[0 1, 2,@, 3,,, 4, 5 6 ]$" + tree = g.parse(s, on_error=ignore_errors) + + _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() _TestParser.__name__ = _NAME _TestParser.__qualname__ = "tests.test_parser." + _NAME