
Merge pull request #725 from lark-parser/resumable3

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.0
Erez Shinan, 3 years ago, committed by GitHub
parent commit 6d6e22048d
No known key found for this signature in database. GPG key ID: 4AEE18F83AFDEB23
6 files changed, 282 additions and 230 deletions
  1. +16 -3   lark/lark.py
  2. +79 -70  lark/lexer.py
  3. +37 -28  lark/parser_frontends.py
  4. +95 -56  lark/parsers/lalr_parser.py
  5. +19 -73  lark/parsers/lalr_puppet.py
  6. +36 -0   tests/test_parser.py

+16 -3  lark/lark.py

@@ -1,4 +1,5 @@
from __future__ import absolute_import
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken

import sys, os, pickle, hashlib
from io import open
@@ -9,7 +10,7 @@ from .load_grammar import load_grammar
from .tree import Tree
from .common import LexerConf, ParserConf

from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken
from .lexer import Lexer, TraditionalLexer, TerminalDef
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend, _get_lexer_callbacks
from .grammar import Rule
@@ -462,20 +463,32 @@ class Lark(Serialize):

try:
return self.parser.parse(text, start=start)
except UnexpectedToken as e:
except UnexpectedInput as e:
if on_error is None:
raise

while True:
if isinstance(e, UnexpectedCharacters):
s = e.puppet.lexer_state.state
p = s.line_ctr.char_pos

if not on_error(e):
raise e

if isinstance(e, UnexpectedCharacters):
# If the user didn't change the character position, skip the offending character ourselves
if p == s.line_ctr.char_pos:
s.line_ctr.feed(s.text[p:p+1])

try:
return e.puppet.resume_parse()
except UnexpectedToken as e2:
if e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
# Prevent infinite loop
raise e2
e = e2
except UnexpectedCharacters as e2:
e = e2


###}
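
The recovery loop above hands each error to the user-supplied on_error callback and, if it returns True, resumes through the puppet attached to the exception. A minimal caller-side sketch of that flow (the grammar and handler here are illustrative, not part of this PR):

from lark import Lark
from lark.exceptions import UnexpectedCharacters

# Illustrative grammar; any parser='lalr' grammar goes through the same loop.
parser = Lark(r'''
    start: WORD+
    %import common.WORD
    %ignore " "
''', parser='lalr')

def skip_bad_characters(e):
    # Returning True asks Lark.parse() to resume via e.puppet.resume_parse();
    # for UnexpectedCharacters the loop above also skips one character when the
    # handler didn't advance line_ctr.char_pos itself.
    return isinstance(e, UnexpectedCharacters)

tree = parser.parse("hello $%& world", on_error=skip_bad_characters)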

+79 -70  lark/lexer.py

@@ -2,7 +2,7 @@

import re

from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .utils import Str, classify, get_regexp_width, Py36, Serialize, suppress
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken

###{standalone
@@ -157,6 +157,8 @@ class Token(Str):


class LineCounter:
__slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char'

def __init__(self, newline_char):
self.newline_char = newline_char
self.char_pos = 0
@@ -167,7 +169,7 @@ class LineCounter:
def feed(self, token, test_newline=True):
"""Consume a token and calculate the new line & column.

As an optional optimization, set test_newline=False is token doesn't contain a newline.
As an optional optimization, set test_newline=False if token doesn't contain a newline.
"""
if test_newline:
newlines = token.count(self.newline_char)
@@ -178,49 +180,6 @@ class LineCounter:
self.char_pos += len(token)
self.column = self.char_pos - self.line_start_pos + 1

class _Lex:
"Built to serve both Lexer and ContextualLexer"
def __init__(self, lexer, state=None):
self.lexer = lexer
self.state = state

def lex(self, stream, newline_types, ignore_types):
newline_types = frozenset(newline_types)
ignore_types = frozenset(ignore_types)
line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
last_token = None

while line_ctr.char_pos < len(stream):
lexer = self.lexer
res = lexer.match(stream, line_ctr.char_pos)
if not res:
allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types
if not allowed:
allowed = {"<END-OF-FILE>"}
raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])

value, type_ = res

if type_ not in ignore_types:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
line_ctr.feed(value, type_ in newline_types)
t.end_line = line_ctr.line
t.end_column = line_ctr.column
t.end_pos = line_ctr.char_pos
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
if not isinstance(t, Token):
raise ValueError("Callbacks must return a token (returned %r)" % t)
yield t
last_token = t
else:
if type_ in lexer.callback:
t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
lexer.callback[type_](t2)
line_ctr.feed(value, type_ in newline_types)




class UnlessCallback:
def __init__(self, mres):
@@ -286,7 +245,6 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes)
except AssertionError: # Yes, this is what Python provides us.. :/
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)

# terms_from_name = {t.name: t for t in terminals[:max_size]}
mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
terminals = terminals[max_size:]
return mres
@@ -308,10 +266,14 @@ class Lexer(object):
"""Lexer interface

Method Signatures:
lex(self, stream) -> Iterator[Token]
lex(self, text) -> Iterator[Token]
"""
lex = NotImplemented

def make_lexer_state(self, text):
line_ctr = LineCounter(b'\n' if isinstance(text, bytes) else '\n')
return LexerState(text, line_ctr)


class TraditionalLexer(Lexer):

@@ -335,8 +297,8 @@ class TraditionalLexer(Lexer):
assert set(conf.ignore) <= {t.name for t in terminals}

# Init
self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
self.ignore_types = list(conf.ignore)
self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
self.ignore_types = frozenset(conf.ignore)

terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
self.terminals = terminals
@@ -345,7 +307,6 @@ class TraditionalLexer(Lexer):
self.use_bytes = conf.use_bytes

self._mres = None
# self.build(g_regex_flags)

def _build(self):
terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
@@ -366,17 +327,61 @@ class TraditionalLexer(Lexer):
self._build()
return self._mres

def match(self, stream, pos):
def match(self, text, pos):
for mre, type_from_index in self.mres:
m = mre.match(stream, pos)
m = mre.match(text, pos)
if m:
return m.group(0), type_from_index[m.lastindex]

def lex(self, stream):
return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
def lex(self, state, parser_state):
with suppress(EOFError):
while True:
yield self.next_token(state)

def next_token(self, lex_state):
line_ctr = lex_state.line_ctr
while line_ctr.char_pos < len(lex_state.text):
res = self.match(lex_state.text, line_ctr.char_pos)
if not res:
allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
if not allowed:
allowed = {"<END-OF-FILE>"}
raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])

value, type_ = res

if type_ not in self.ignore_types:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
line_ctr.feed(value, type_ in self.newline_types)
t.end_line = line_ctr.line
t.end_column = line_ctr.column
t.end_pos = line_ctr.char_pos
if t.type in self.callback:
t = self.callback[t.type](t)
if not isinstance(t, Token):
raise ValueError("Callbacks must return a token (returned %r)" % t)
lex_state.last_token = t
return t
else:
if type_ in self.callback:
t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
self.callback[type_](t2)
line_ctr.feed(value, type_ in self.newline_types)

# EOF
raise EOFError(self)

class LexerState:
__slots__ = 'text', 'line_ctr', 'last_token'

def __init__(self, text, line_ctr, last_token=None):
self.text = text
self.line_ctr = line_ctr
self.last_token = last_token

def __copy__(self):
return type(self)(self.text, copy(self.line_ctr), self.last_token)

class ContextualLexer(Lexer):

@@ -409,25 +414,29 @@ class ContextualLexer(Lexer):
assert trad_conf.tokens is terminals
self.root_lexer = TraditionalLexer(trad_conf)

def lex(self, stream, get_parser_state):
parser_state = get_parser_state()
l = _Lex(self.lexers[parser_state], parser_state)
def make_lexer_state(self, text):
return self.root_lexer.make_lexer_state(text)

def lex(self, lexer_state, parser_state):
try:
for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
yield x
parser_state = get_parser_state()
l.lexer = self.lexers[parser_state]
l.state = parser_state # For debug only, no need to worry about multithreading
while True:
lexer = self.lexers[parser_state.position]
yield lexer.next_token(lexer_state)
except EOFError:
pass
except UnexpectedCharacters as e:
# In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
# but not in the current context.
# In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
# This tests the input against the global context, to provide a nicer error.
root_match = self.root_lexer.match(stream, e.pos_in_stream)
if not root_match:
raise
token = self.root_lexer.next_token(lexer_state)
raise UnexpectedToken(token, e.allowed, state=parser_state.position)

value, type_ = root_match
t = Token(type_, value, e.pos_in_stream, e.line, e.column)
raise UnexpectedToken(t, e.allowed, state=e.state)
class LexerThread:
"A thread that ties a lexer instance and a lexer state, to be used by the parser"

def __init__(self, lexer, text):
self.lexer = lexer
self.state = lexer.make_lexer_state(text)

def lex(self, parser_state):
return self.lexer.lex(self.state, parser_state)
###}
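
After this change a lexer no longer walks a raw stream on its own; the parser owns a LexerThread that pairs the lexer with a copyable LexerState and pulls tokens through lex(state, parser_state). A rough sketch of the new wiring, poking at internals purely for illustration (parser.parser and make_lexer() are not public API):

from lark import Lark

parser = Lark(r'''
    start: WORD+
    %import common.WORD
    %ignore " "
''', parser='lalr', lexer='standard')

# WithLexer.make_lexer() wraps the TraditionalLexer in a LexerThread.
thread = parser.parser.make_lexer("hello world")
for tok in thread.lex(None):            # the standard lexer ignores parser_state
    print(tok.type, tok.value)

Because LexerState implements __copy__, a ParserPuppet can snapshot the lexer position together with the parser stacks and resume from that point later.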

+37 -28  lark/parser_frontends.py

@@ -1,6 +1,6 @@
from .utils import get_regexp_width, Serialize
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .grammar import Rule
@@ -23,12 +23,22 @@ def get_frontend(parser, lexer):
elif lexer == 'contextual':
return LALR_ContextualLexer
elif issubclass(lexer, Lexer):
class CustomLexerWrapper(Lexer):
def __init__(self, lexer_conf):
self.lexer = lexer(lexer_conf)
def lex(self, lexer_state, parser_state):
return self.lexer.lex(lexer_state.text)

class LALR_CustomLexerWrapper(LALR_CustomLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
super(LALR_CustomLexerWrapper, self).__init__(
lexer, lexer_conf, parser_conf, options=options)
def init_lexer(self):
self.lexer = lexer(self.lexer_conf)
future_interface = getattr(lexer, '__future_interface__', False)
if future_interface:
self.lexer = lexer(self.lexer_conf)
else:
self.lexer = CustomLexerWrapper(self.lexer_conf)

return LALR_CustomLexerWrapper
else:
@@ -54,7 +64,7 @@ def get_frontend(parser, lexer):


class _ParserFrontend(Serialize):
def _parse(self, input, start, *args):
def _parse(self, start, input, *args):
if start is None:
start = self.start
if len(start) > 1:
@@ -71,6 +81,18 @@ def _get_lexer_callbacks(transformer, terminals):
result[terminal.name] = callback
return result

class PostLexConnector:
def __init__(self, lexer, postlexer):
self.lexer = lexer
self.postlexer = postlexer

def make_lexer_state(self, text):
return self.lexer.make_lexer_state(text)

def lex(self, lexer_state, parser_state):
i = self.lexer.lex(lexer_state, parser_state)
return self.postlexer.process(i)


class WithLexer(_ParserFrontend):
lexer = None
@@ -106,13 +128,14 @@ class WithLexer(_ParserFrontend):
def _serialize(self, data, memo):
data['parser'] = data['parser'].serialize(memo)

def lex(self, *args):
stream = self.lexer.lex(*args)
return self.postlex.process(stream) if self.postlex else stream
def make_lexer(self, text):
lexer = self.lexer
if self.postlex:
lexer = PostLexConnector(self.lexer, self.postlex)
return LexerThread(lexer, text)

def parse(self, text, start=None):
token_stream = self.lex(text)
return self._parse(token_stream, start)
return self._parse(start, self.make_lexer(text))

def init_traditional_lexer(self):
self.lexer = TraditionalLexer(self.lexer_conf)
@@ -138,14 +161,6 @@ class LALR_ContextualLexer(LALR_WithLexer):
always_accept = self.postlex.always_accept if self.postlex else ()
self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)


def parse(self, text, start=None):
parser_state = [None]
def set_parser_state(s):
parser_state[0] = s

token_stream = self.lex(text, lambda: parser_state[0])
return self._parse(token_stream, start, set_parser_state)
###}

class LALR_CustomLexer(LALR_WithLexer):
@@ -156,15 +171,6 @@ class LALR_CustomLexer(LALR_WithLexer):
WithLexer.__init__(self, lexer_conf, parser_conf, options)


def tokenize_text(text):
line = 1
col_start_pos = 0
for i, ch in enumerate(text):
if '\n' in ch:
line += ch.count('\n')
col_start_pos = i + ch.rindex('\n')
yield Token('CHAR', ch, line=line, column=i - col_start_pos)

class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
@@ -175,6 +181,9 @@ class Earley(WithLexer):
tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)

def make_lexer(self, text):
return WithLexer.make_lexer(self, text).lex(None)

def match(self, term, token):
return term.name == token.type

@@ -219,7 +228,7 @@ class XEarley(_ParserFrontend):
self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

def parse(self, text, start):
return self._parse(text, start)
return self._parse(start, text)

class XEarley_CompleteLex(XEarley):
def __init__(self, *args, **kw):
@@ -239,8 +248,8 @@ class CYK(WithLexer):
self.callbacks = parser_conf.callbacks

def parse(self, text, start):
tokens = list(self.lex(text))
parse = self._parse(tokens, start)
tokens = list(self.make_lexer(text).lex(None))
parse = self._parse(start, tokens)
parse = self._transform(parse)
return parse
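
get_frontend() now looks for a __future_interface__ attribute on user-supplied lexer classes; lexers without it keep the old lex(self, text) signature and are wrapped in CustomLexerWrapper. A hedged sketch of a new-style custom lexer (the tokenisation itself is made up for illustration):

from lark import Lark, Token
from lark.lexer import Lexer

class UpperCaseWordLexer(Lexer):
    __future_interface__ = True          # opt in to lex(self, lexer_state, parser_state)

    def __init__(self, lexer_conf):
        pass                             # conf unused in this toy lexer

    def lex(self, lexer_state, parser_state):
        # Illustrative: one WORD token per whitespace-separated chunk, upper-cased.
        for word in lexer_state.text.split():
            yield Token('WORD', word.upper())

parser = Lark(r'''
    start: WORD+
    %declare WORD
''', parser='lalr', lexer=UpperCaseWordLexer)

print(parser.parse("hello world").children)   # [Token('WORD', 'HELLO'), Token('WORD', 'WORLD')]

The base Lexer class supplies make_lexer_state(), so a subclass only needs to provide lex().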



+95 -56  lark/parsers/lalr_parser.py

@@ -2,9 +2,9 @@
"""
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
from ..exceptions import UnexpectedToken
from copy import deepcopy, copy
from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
from ..lexer import Token
from ..utils import Enumerator, Serialize

from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
from .lalr_puppet import ParserPuppet
@@ -35,84 +35,123 @@ class LALR_Parser(object):
return self.parser.parse(*args)


class _Parser:
def __init__(self, parse_table, callbacks, debug=False):
class ParseConf:
__slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'

def __init__(self, parse_table, callbacks, start):
self.parse_table = parse_table

self.start_state = self.parse_table.start_states[start]
self.end_state = self.parse_table.end_states[start]
self.states = self.parse_table.states

self.callbacks = callbacks
self.debug = debug
self.start = start

def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None):
token = None
stream = iter(seq)
states = self.parse_table.states
start_state = self.parse_table.start_states[start]
end_state = self.parse_table.end_states[start]

state_stack = state_stack or [start_state]
value_stack = value_stack or []
class ParserState:
__slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'

if set_state: set_state(start_state)
def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):
self.parse_conf = parse_conf
self.lexer = lexer
self.state_stack = state_stack or [self.parse_conf.start_state]
self.value_stack = value_stack or []

def get_action(token):
@property
def position(self):
return self.state_stack[-1]

def __copy__(self):
return type(self)(
self.parse_conf,
self.lexer, # XXX copy
copy(self.state_stack),
deepcopy(self.value_stack),
)

def copy(self):
return copy(self)

def feed_token(self, token, is_end=False):
state_stack = self.state_stack
value_stack = self.value_stack
states = self.parse_conf.states
end_state = self.parse_conf.end_state
callbacks = self.parse_conf.callbacks

while True:
state = state_stack[-1]
try:
return states[state][token.type]
action, arg = states[state][token.type]
except KeyError:
expected = {s for s in states[state].keys() if s.isupper()}
try:
puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
except NameError: # For standalone parser
puppet = None
raise UnexpectedToken(token, expected, state=state, puppet=puppet)

def reduce(rule):
size = len(rule.expansion)
if size:
s = value_stack[-size:]
del state_stack[-size:]
del value_stack[-size:]
raise UnexpectedToken(token, expected, state=state, puppet=None)

assert arg != end_state

if action is Shift:
# shift once and return
assert not is_end
state_stack.append(arg)
value_stack.append(token)
return arg
else:
s = []
# reduce+shift as many times as necessary
rule = arg
size = len(rule.expansion)
if size:
s = value_stack[-size:]
del state_stack[-size:]
del value_stack[-size:]
else:
s = []

value = callbacks[rule](s)

_action, new_state = states[state_stack[-1]][rule.origin.name]
assert _action is Shift
state_stack.append(new_state)
value_stack.append(value)

if is_end and state_stack[-1] == end_state:
return value_stack[-1]

value = self.callbacks[rule](s)
class _Parser:
def __init__(self, parse_table, callbacks, debug=False):
self.parse_table = parse_table
self.callbacks = callbacks
self.debug = debug

_action, new_state = states[state_stack[-1]][rule.origin.name]
assert _action is Shift
state_stack.append(new_state)
value_stack.append(value)
def parse(self, lexer, start, value_stack=None, state_stack=None):
parse_conf = ParseConf(self.parse_table, self.callbacks, start)
parser_state = ParserState(parse_conf, lexer, state_stack, value_stack)
return self.parse_from_state(parser_state)

def parse_from_state(self, state):
# Main LALR-parser loop
try:
for token in stream:
while True:
action, arg = get_action(token)
assert arg != end_state

if action is Shift:
state_stack.append(arg)
value_stack.append(token)
if set_state: set_state(arg)
break # next token
else:
reduce(arg)
token = None
for token in state.lexer.lex(state):
state.feed_token(token)
token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
return state.feed_token(token, True)
except UnexpectedInput as e:
try:
e.puppet = ParserPuppet(self, state, state.lexer)
except NameError:
pass
raise e
except Exception as e:
if self.debug:
print("")
print("STATE STACK DUMP")
print("----------------")
for i, s in enumerate(state_stack):
for i, s in enumerate(state.state_stack):
print('%d)' % i , s)
print("")

raise

token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
while True:
_action, arg = get_action(token)
assert(_action is Reduce)
reduce(arg)
if state_stack[-1] == end_state:
return value_stack[-1]

###}
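
The stacks now live on ParserState, whose __copy__ clones state_stack and deep-copies value_stack while sharing the ParseConf; that is what makes the puppet's snapshots cheap and safe. A runnable illustration (the grammar is made up; COMMA is lark's auto-generated name for the "," literal):

from copy import copy
from lark import Lark, Token
from lark.exceptions import UnexpectedToken

parser = Lark(r'''
    start: "[" NUMBER ("," NUMBER)* "]"
    %import common.NUMBER
    %ignore " "
''', parser='lalr')

try:
    parser.parse("[1 2]")                      # missing comma -> UnexpectedToken on the 2
except UnexpectedToken as e:
    live = e.puppet.parser_state
    trial = copy(live)                         # stacks cloned, ParseConf shared
    trial.feed_token(Token('COMMA', ','))      # shifts/reduces only on the trial copy;
                                               # `live` (and the real puppet) stay untouched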


+19 -73  lark/parsers/lalr_puppet.py

@@ -1,10 +1,10 @@
# This module provides a LALR puppet, which is used for debugging and error handling

from copy import deepcopy
from copy import copy

from .lalr_analysis import Shift, Reduce
from .. import Token
from ..exceptions import ParseError
from ..exceptions import UnexpectedToken


class ParserPuppet(object):
@@ -12,96 +12,45 @@ class ParserPuppet(object):

For a simpler, more streamlined interface, see the ``on_error`` argument to ``Lark.parse()``.
"""
def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
def __init__(self, parser, parser_state, lexer_state):
self.parser = parser
self._state_stack = state_stack
self._value_stack = value_stack
self._start = start
self._stream = stream
self._set_state = set_state

self.result = None
self.parser_state = parser_state
self.lexer_state = lexer_state

def feed_token(self, token):
"""Feed the parser with a token, and advance it to the next state, as if it received it from the lexer.

Note that ``token`` has to be an instance of ``Token``.
"""
end_state = self.parser.parse_table.end_states[self._start]
state_stack = self._state_stack
value_stack = self._value_stack

state = state_stack[-1]
action, arg = self.parser.parse_table.states[state][token.type]
if arg == end_state:
raise ParseError(arg)

while action is Reduce:
rule = arg
size = len(rule.expansion)
if size:
s = value_stack[-size:]
del state_stack[-size:]
del value_stack[-size:]
else:
s = []

value = self.parser.callbacks[rule](s)

_action, new_state = self.parser.parse_table.states[state_stack[-1]][rule.origin.name]
assert _action is Shift
state_stack.append(new_state)
value_stack.append(value)

if state_stack[-1] == end_state:
self.result = value_stack[-1]
return self.result

state = state_stack[-1]
try:
action, arg = self.parser.parse_table.states[state][token.type]
except KeyError as e:
raise ParseError(e)
assert arg != end_state

assert action is Shift
state_stack.append(arg)
value_stack.append(token)

def copy(self):
return self.parser_state.feed_token(token)

def __copy__(self):
"""Create a new puppet with a separate state.

Calls to feed_token() won't affect the old puppet, and vice-versa.
"""
return type(self)(
self.parser,
list(self._state_stack),
deepcopy(self._value_stack),
self._start,
self._stream,
self._set_state,
copy(self.parser_state),
copy(self.lexer_state),
)

def __eq__(self, other):
if not isinstance(other, ParserPuppet):
return False

return (
self._state_stack == other._state_stack and
self._value_stack == other._value_stack and
self._stream == other._stream and
self._start == other._start
)
return self.parser_state == other.parser_state and self.lexer_state == other.lexer_state

def __hash__(self):
return hash((tuple(self._state_stack), self._start))
# TODO Provide with an immutable puppet instance
# def __hash__(self):
# return hash((self.parser_state, self.lexer_state))

def pretty(self):
"""Print the output of ``choices()`` in a way that's easier to read."""
out = ["Puppet choices:"]
for k, v in self.choices().items():
out.append('\t- %s -> %s' % (k, v))
out.append('stack size: %s' % len(self._state_stack))
out.append('stack size: %s' % len(self.parser_state.state_stack))
return '\n'.join(out)

def choices(self):
@@ -111,16 +60,16 @@ class ParserPuppet(object):

Updated by ``feed_token()``.
"""
return self.parser.parse_table.states[self._state_stack[-1]]
return self.parser_state.parse_table.states[self.parser_state.position]

def accepts(self):
accepts = set()
for t in self.choices():
if t.isupper(): # is terminal?
new_puppet = self.copy()
new_puppet = copy(self)
try:
new_puppet.feed_token(Token(t, ''))
except ParseError:
except UnexpectedToken:
pass
else:
accepts.add(t)
@@ -128,7 +77,4 @@ class ParserPuppet(object):

def resume_parse(self):
"""Resume parsing from the current puppet state."""
return self.parser.parse(
self._stream, self._start, self._set_state,
self._value_stack, self._state_stack
)
return self.parser.parse_from_state(self.parser_state)
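
With the stacks moved onto ParserState, accepts() probes each candidate terminal against a throwaway copy of the puppet instead of mutating shared state. An illustrative on_error handler built on it (assumes e is an UnexpectedToken, as in the test below):

from lark import Token

def handle(e):
    # e.puppet.accepts() returns the terminal names the parser could shift here,
    # computed by feeding each candidate to a copy of the puppet.
    if 'COMMA' in e.puppet.accepts():
        e.puppet.feed_token(Token('COMMA', ','))   # repair: insert the missing comma
        e.puppet.feed_token(e.token)               # then replay the token that failed
        return True                                # Lark.parse() resumes via resume_parse()
    return False                                   # re-raise the original error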

+36 -0  tests/test_parser.py

@@ -2217,6 +2217,42 @@ def _make_parser_test(LEXER, PARSER):
""", regex=True)
self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')


@unittest.skipIf(PARSER!='lalr', "Puppet error handling only works with LALR for now")
def test_error_with_puppet(self):
def ignore_errors(e):
if isinstance(e, UnexpectedCharacters):
# Skip bad character
return True

# Must be UnexpectedToken
if e.token.type == 'COMMA':
# Skip comma
return True
elif e.token.type == 'SIGNED_NUMBER':
# Try to feed a comma and retry the number
e.puppet.feed_token(Token('COMMA', ','))
e.puppet.feed_token(e.token)
return True

# Unhandled error. Will stop parse and raise exception
return False

g = _Lark(r'''
start: "[" num ("," num)* "]"
?num: SIGNED_NUMBER
%import common.SIGNED_NUMBER
%ignore " "
''')
s = "[0 1, 2,, 3,,, 4, 5 6 ]"
tree = g.parse(s, on_error=ignore_errors)
res = [int(x) for x in tree.children]
assert res == list(range(7))

s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
tree = g.parse(s, on_error=ignore_errors)


_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
_TestParser.__name__ = _NAME
_TestParser.__qualname__ = "tests.test_parser." + _NAME

