
Completely redesigned the interface between Lexer<->Parser, and refactored LALR parser and lexers

Author: Erez Sh, 4 years ago
Parent commit: 72e7926097
Tag: tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.0
6 changed files with 215 additions and 185 deletions
  1. lark/lark.py                 (+6,  -3)
  2. lark/lexer.py                (+33, -24)
  3. lark/parser_frontends.py     (+33, -28)
  4. lark/parsers/lalr_parser.py  (+90, -58)
  5. lark/parsers/lalr_puppet.py  (+17, -72)
  6. tests/test_parser.py         (+36,  -0)

lark/lark.py  (+6, -3)

@@ -1,4 +1,5 @@
 from __future__ import absolute_import
+from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken

 import sys, os, pickle, hashlib
 from io import open
@@ -9,7 +10,7 @@ from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf

-from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken
+from .lexer import Lexer, TraditionalLexer, TerminalDef
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend, _get_lexer_callbacks
 from .grammar import Rule
@@ -462,7 +463,7 @@ class Lark(Serialize):

         try:
             return self.parser.parse(text, start=start)
-        except UnexpectedToken as e:
+        except UnexpectedInput as e:
             if on_error is None:
                 raise
@@ -472,10 +473,12 @@ class Lark(Serialize):
                 try:
                     return e.puppet.resume_parse()
                 except UnexpectedToken as e2:
-                    if e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
+                    if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
                         # Prevent infinite loop
                         raise e2
                     e = e2
+                except UnexpectedCharacters as e2:
+                    e = e2

 ###}

lark/lexer.py  (+33, -24)

@@ -157,6 +157,8 @@ class Token(Str):


 class LineCounter:
+    __slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char'
+
     def __init__(self, newline_char):
         self.newline_char = newline_char
         self.char_pos = 0
@@ -167,7 +169,7 @@ class LineCounter:
     def feed(self, token, test_newline=True):
         """Consume a token and calculate the new line & column.

-        As an optional optimization, set test_newline=False is token doesn't contain a newline.
+        As an optional optimization, set test_newline=False if token doesn't contain a newline.
         """
         if test_newline:
             newlines = token.count(self.newline_char)
@@ -243,7 +245,6 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes)
         except AssertionError:  # Yes, this is what Python provides us.. :/
             return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)

-        # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres
@@ -269,6 +270,10 @@ class Lexer(object):
     """
     lex = NotImplemented

+    def make_lexer_state(self, text):
+        line_ctr = LineCounter(b'\n' if isinstance(text, bytes) else '\n')
+        return LexerState(text, line_ctr)
+

 class TraditionalLexer(Lexer):

@@ -328,26 +333,21 @@ class TraditionalLexer(Lexer):
             if m:
                 return m.group(0), type_from_index[m.lastindex]

-    def make_lexer_state(self, text):
-        line_ctr = LineCounter('\n' if not self.use_bytes else b'\n')
-        return LexerState(text, line_ctr)
-
-    def lex(self, text):
-        state = self.make_lexer_state(text)
+    def lex(self, state, parser_state):
         with suppress(EOFError):
             while True:
                 yield self.next_token(state)

     def next_token(self, lex_state):
-        text = lex_state.text
         line_ctr = lex_state.line_ctr
-        while line_ctr.char_pos < len(text):
-            res = self.match(text, line_ctr.char_pos)
+        while line_ctr.char_pos < len(lex_state.text):
+            res = self.match(lex_state.text, line_ctr.char_pos)
             if not res:
                 allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
                 if not allowed:
                     allowed = {"<END-OF-FILE>"}
-                raise UnexpectedCharacters(text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
+                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
+                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])

             value, type_ = res

@@ -373,11 +373,15 @@ class TraditionalLexer(Lexer):
         raise EOFError(self)


 class LexerState:
+    __slots__ = 'text', 'line_ctr', 'last_token'
+
     def __init__(self, text, line_ctr, last_token=None):
         self.text = text
         self.line_ctr = line_ctr
         self.last_token = last_token

+    def __copy__(self):
+        return type(self)(self.text, copy(self.line_ctr), self.last_token)
+

 class ContextualLexer(Lexer):

@@ -410,24 +414,29 @@ class ContextualLexer(Lexer):
         assert trad_conf.tokens is terminals
         self.root_lexer = TraditionalLexer(trad_conf)

-    def lex(self, text, get_parser_state):
-        state = self.root_lexer.make_lexer_state(text)
+    def make_lexer_state(self, text):
+        return self.root_lexer.make_lexer_state(text)
+
+    def lex(self, lexer_state, parser_state):
         try:
             while True:
-                lexer = self.lexers[get_parser_state()]
-                yield lexer.next_token(state)
+                lexer = self.lexers[parser_state.position]
+                yield lexer.next_token(lexer_state)
         except EOFError:
             pass
         except UnexpectedCharacters as e:
-            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
-            # but not in the current context.
+            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
             # This tests the input against the global context, to provide a nicer error.
-            root_match = self.root_lexer.match(text, e.pos_in_stream)
-            if not root_match:
-                raise
+            token = self.root_lexer.next_token(lexer_state)
+            raise UnexpectedToken(token, e.allowed, state=parser_state.position)

-            value, type_ = root_match
-            t = Token(type_, value, e.pos_in_stream, e.line, e.column)
-            raise UnexpectedToken(t, e.allowed, state=get_parser_state())
+
+class LexerThread:
+    "A thread that ties a lexer instance and a lexer state, to be used by the parser"
+
+    def __init__(self, lexer, text):
+        self.lexer = lexer
+        self.state = lexer.make_lexer_state(text)
+
+    def lex(self, parser_state):
+        return self.lexer.lex(self.state, parser_state)
 ###}
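For orientation, here is a minimal, self-contained sketch (not part of the diff) of the lexer-side contract the hunks above establish: the lexer object stays passive, all mutable cursor data lives in a copyable state object, and a LexerThread-style wrapper ties one lexer to one state for the duration of a parse. The Toy* names below are hypothetical.

import re
from copy import copy


class ToyLexerState:
    """Analogue of LexerState: the text plus the current cursor, copyable."""
    def __init__(self, text, pos=0):
        self.text = text
        self.pos = pos

    def __copy__(self):
        return type(self)(self.text, self.pos)


class ToyLexer:
    """Analogue of TraditionalLexer.lex(state, parser_state): a generator that
    only reads and advances the state it is handed, never its own attributes."""
    WORD = re.compile(r'\S+')

    def lex(self, lexer_state, parser_state):
        while True:
            m = self.WORD.search(lexer_state.text, lexer_state.pos)
            if not m:
                return
            lexer_state.pos = m.end()
            yield m.group(0)


class ToyLexerThread:
    """Analogue of LexerThread: one lexer instance plus one state per parse."""
    def __init__(self, lexer, text):
        self.lexer = lexer
        self.state = ToyLexerState(text)

    def lex(self, parser_state):
        return self.lexer.lex(self.state, parser_state)


thread = ToyLexerThread(ToyLexer(), "a b c")
stream = thread.lex(parser_state=None)
print(next(stream))            # 'a'
snapshot = copy(thread.state)  # a puppet can later resume lexing from here
print(list(stream))            # ['b', 'c']

The point of the split is that the parser, or a ParserPuppet, can snapshot and restore lexing progress without touching the lexer instance itself.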

lark/parser_frontends.py  (+33, -28)

@@ -1,6 +1,6 @@
 from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
+from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
@@ -23,12 +23,18 @@ def get_frontend(parser, lexer):
         elif lexer == 'contextual':
             return LALR_ContextualLexer
         elif issubclass(lexer, Lexer):
+            class CustomLexerWrapper(Lexer):
+                def __init__(self, lexer_conf):
+                    self.lexer = lexer(lexer_conf)
+                def lex(self, lexer_state, parser_state):
+                    return self.lexer.lex(lexer_state.text)
+
             class LALR_CustomLexerWrapper(LALR_CustomLexer):
                 def __init__(self, lexer_conf, parser_conf, options=None):
                     super(LALR_CustomLexerWrapper, self).__init__(
                         lexer, lexer_conf, parser_conf, options=options)
                 def init_lexer(self):
-                    self.lexer = lexer(self.lexer_conf)
+                    self.lexer = CustomLexerWrapper(self.lexer_conf)

             return LALR_CustomLexerWrapper
         else:
@@ -54,7 +60,7 @@ def get_frontend(parser, lexer):


 class _ParserFrontend(Serialize):
-    def _parse(self, input, start, *args):
+    def _parse(self, start, input, *args):
         if start is None:
             start = self.start
             if len(start) > 1:
@@ -71,6 +77,18 @@ def _get_lexer_callbacks(transformer, terminals):
             result[terminal.name] = callback
     return result

+class PostLexConnector:
+    def __init__(self, lexer, postlexer):
+        self.lexer = lexer
+        self.postlexer = postlexer
+
+    def make_lexer_state(self, text):
+        return self.lexer.make_lexer_state(text)
+
+    def lex(self, lexer_state, parser_state):
+        i = self.lexer.lex(lexer_state, parser_state)
+        return self.postlexer.process(i)
+
+

 class WithLexer(_ParserFrontend):
     lexer = None
@@ -106,13 +124,14 @@ class WithLexer(_ParserFrontend):
     def _serialize(self, data, memo):
         data['parser'] = data['parser'].serialize(memo)

-    def lex(self, *args):
-        stream = self.lexer.lex(*args)
-        return self.postlex.process(stream) if self.postlex else stream
+    def make_lexer(self, text):
+        lexer = self.lexer
+        if self.postlex:
+            lexer = PostLexConnector(self.lexer, self.postlex)
+        return LexerThread(lexer, text)

     def parse(self, text, start=None):
-        token_stream = self.lex(text)
-        return self._parse(token_stream, start)
+        return self._parse(start, self.make_lexer(text))

     def init_traditional_lexer(self):
         self.lexer = TraditionalLexer(self.lexer_conf)
@@ -138,14 +157,6 @@ class LALR_ContextualLexer(LALR_WithLexer):
         always_accept = self.postlex.always_accept if self.postlex else ()
         self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)


-    def parse(self, text, start=None):
-        parser_state = [None]
-        def set_parser_state(s):
-            parser_state[0] = s
-
-        token_stream = self.lex(text, lambda: parser_state[0])
-        return self._parse(token_stream, start, set_parser_state)
 ###}


 class LALR_CustomLexer(LALR_WithLexer):
@@ -156,15 +167,6 @@ class LALR_CustomLexer(LALR_WithLexer):
         WithLexer.__init__(self, lexer_conf, parser_conf, options)



-def tokenize_text(text):
-    line = 1
-    col_start_pos = 0
-    for i, ch in enumerate(text):
-        if '\n' in ch:
-            line += ch.count('\n')
-            col_start_pos = i + ch.rindex('\n')
-        yield Token('CHAR', ch, line=line, column=i - col_start_pos)
-
 class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         WithLexer.__init__(self, lexer_conf, parser_conf, options)
@@ -175,6 +177,9 @@ class Earley(WithLexer):
         tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
         self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)

+    def make_lexer(self, text):
+        return WithLexer.make_lexer(self, text).lex(None)
+
     def match(self, term, token):
         return term.name == token.type

@@ -219,7 +224,7 @@ class XEarley(_ParserFrontend):
             self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

     def parse(self, text, start):
-        return self._parse(text, start)
+        return self._parse(start, text)

 class XEarley_CompleteLex(XEarley):
     def __init__(self, *args, **kw):
@@ -239,8 +244,8 @@ class CYK(WithLexer):
         self.callbacks = parser_conf.callbacks

     def parse(self, text, start):
-        tokens = list(self.lex(text))
-        parse = self._parse(tokens, start)
+        tokens = list(self.make_lexer(text).lex(None))
+        parse = self._parse(start, tokens)
         parse = self._transform(parse)
         return parse
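Usage note (not part of the diff): custom lexers written against the old single-argument lex(self, text) signature keep working, because the CustomLexerWrapper introduced above adapts them to the new (lexer_state, parser_state) call. A hedged sketch of that user-facing contract, assuming lark at or after this commit; UppercaseLexer and the grammar are invented for illustration.

from lark import Lark
from lark.lexer import Lexer, Token


class UppercaseLexer(Lexer):
    """A user lexer keeping the old-style signature: lex(self, text)."""
    def __init__(self, lexer_conf):
        pass  # a real lexer would inspect lexer_conf.tokens here

    def lex(self, text):
        for ch in text:
            if not ch.isspace():
                yield Token('CHAR', ch.upper())


parser = Lark('start: CHAR+\nCHAR: /./', parser='lalr', lexer=UppercaseLexer)
print(parser.parse("abc"))   # Tree('start', [Token('CHAR', 'A'), ...])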




lark/parsers/lalr_parser.py  (+90, -58)

@@ -2,9 +2,9 @@
 """
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
-from ..exceptions import UnexpectedToken
+from copy import deepcopy
+from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
 from ..lexer import Token
-from ..utils import Enumerator, Serialize

 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 from .lalr_puppet import ParserPuppet
@@ -35,84 +35,116 @@ class LALR_Parser(object):
         return self.parser.parse(*args)


-class _Parser:
-    def __init__(self, parse_table, callbacks, debug=False):
-        self.parse_table = parse_table
-        self.callbacks = callbacks
-        self.debug = debug
-
-    def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None):
-        token = None
-        stream = iter(seq)
-        states = self.parse_table.states
-        start_state = self.parse_table.start_states[start]
-        end_state = self.parse_table.end_states[start]
-
-        state_stack = state_stack or [start_state]
-        value_stack = value_stack or []
-
-        if set_state: set_state(start_state)
-
-        def get_action(token):
-            while True:
-                state = state_stack[-1]
-                try:
-                    return states[state][token.type]
-                except KeyError:
-                    expected = {s for s in states[state].keys() if s.isupper()}
-                    try:
-                        puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
-                    except NameError:   # For standalone parser
-                        puppet = None
-                    raise UnexpectedToken(token, expected, state=state, puppet=puppet)
-
-        def reduce(rule):
-            size = len(rule.expansion)
-            if size:
-                s = value_stack[-size:]
-                del state_stack[-size:]
-                del value_stack[-size:]
-            else:
-                s = []
-
-            value = self.callbacks[rule](s)
-
-            _action, new_state = states[state_stack[-1]][rule.origin.name]
-            assert _action is Shift
-            state_stack.append(new_state)
-            value_stack.append(value)
+class ParserState:
+    __slots__ = 'parse_table', 'lexer', 'callbacks', 'start', 'state_stack', 'value_stack', 'start_state', 'end_state', 'states'
+
+    def __init__(self, parse_table, lexer, callbacks, start, state_stack=None, value_stack=None):
+        self.parse_table = parse_table
+
+        self.start_state = self.parse_table.start_states[start]
+        self.end_state = self.parse_table.end_states[start]
+        self.states = self.parse_table.states
+
+        self.lexer = lexer
+        self.callbacks = callbacks
+        self.start = start
+        self.state_stack = state_stack or [self.start_state]
+        self.value_stack = value_stack or []
+
+    @property
+    def position(self):
+        return self.state_stack[-1]
+
+    def __copy__(self):
+        return type(self)(
+            self.parse_table,
+            self.lexer,     # XXX copy
+            self.callbacks,
+            self.start,
+            list(self.state_stack),
+            deepcopy(self.value_stack),
+        )
+
+    def feed_token(self, token, is_end=False):
+        state_stack = self.state_stack
+        value_stack = self.value_stack
+        states = self.states
+
+        while True:
+            state = state_stack[-1]
+            try:
+                action, arg = states[state][token.type]
+            except KeyError:
+                expected = {s for s in states[state].keys() if s.isupper()}
+                raise UnexpectedToken(token, expected, state=state, puppet=None)
+
+            assert arg != self.end_state
+
+            if action is Shift:
+                # shift once and return
+                assert not is_end
+                state_stack.append(arg)
+                value_stack.append(token)
+                return arg
+            else:
+                # reduce+shift as many times as necessary
+                rule = arg
+                size = len(rule.expansion)
+                if size:
+                    s = value_stack[-size:]
+                    del state_stack[-size:]
+                    del value_stack[-size:]
+                else:
+                    s = []
+
+                value = self.callbacks[rule](s)
+
+                _action, new_state = states[state_stack[-1]][rule.origin.name]
+                assert _action is Shift
+                state_stack.append(new_state)
+                value_stack.append(value)
+
+                if is_end and state_stack[-1] == self.end_state:
+                    return value_stack[-1]
+
+
+class _Parser:
+    def __init__(self, parse_table, callbacks, debug=False):
+        self.parse_table = parse_table
+        self.callbacks = callbacks
+        self.debug = debug
+
+    def parse(self, lexer, start, value_stack=None, state_stack=None):
+        parser_state = ParserState(self.parse_table, lexer, self.callbacks, start, state_stack, value_stack)
+        return self.parse_from_state(parser_state)

+    def parse_from_state(self, state):
         # Main LALR-parser loop
         try:
-            for token in stream:
-                while True:
-                    action, arg = get_action(token)
-                    assert arg != end_state
-
-                    if action is Shift:
-                        state_stack.append(arg)
-                        value_stack.append(token)
-                        if set_state: set_state(arg)
-                        break # next token
-                    else:
-                        reduce(arg)
+            token = None
+            for token in state.lexer.lex(state):
+                state.feed_token(token)
+
+            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+            return state.feed_token(token, True)
+        except UnexpectedInput as e:
+            try:
+                e.puppet = ParserPuppet(self, state, state.lexer)
+            except NameError:
+                pass
+            if isinstance(e, UnexpectedCharacters):
+                s = state.lexer.state
+                p = s.line_ctr.char_pos
+                s.line_ctr.feed(s.text[p:p+1])
+            raise e
         except Exception as e:
             if self.debug:
                 print("")
                 print("STATE STACK DUMP")
                 print("----------------")
-                for i, s in enumerate(state_stack):
+                for i, s in enumerate(state.state_stack):
                     print('%d)' % i , s)
                 print("")

             raise
-
-        token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
-        while True:
-            _action, arg = get_action(token)
-            assert(_action is Reduce)
-            reduce(arg)
-            if state_stack[-1] == end_state:
-                return value_stack[-1]

 ###}
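Why move shift/reduce into ParserState.feed_token? Because the parser is now driven token by token from the outside, and all progress lives in one small, copyable object, a checkpoint is just a copy() away; that is what the slimmed-down puppet and the error-recovery path rely on. A toy sketch of the pattern follows (not lark code; the names are hypothetical and the "parsing" is just summing, which is enough to show the driving and checkpointing pattern).

from copy import copy


class ToyParserState:
    """Analogue of ParserState: all parsing progress in one small, copyable object."""
    __slots__ = 'value_stack',

    def __init__(self, value_stack=None):
        self.value_stack = value_stack or []

    def __copy__(self):
        return type(self)(list(self.value_stack))

    def feed_token(self, token, is_end=False):
        # The real feed_token performs LALR shift/reduce; this toy only
        # accumulates integers, which is enough to show the driving pattern.
        if is_end:
            return sum(self.value_stack)
        self.value_stack.append(int(token))


state = ToyParserState()
for tok in "1 2 3".split():       # analogue of: for token in state.lexer.lex(state)
    state.feed_token(tok)

checkpoint = copy(state)          # what ParserPuppet gets via __copy__
state.feed_token("4")
print(state.feed_token(None, is_end=True))       # 10
print(checkpoint.feed_token(None, is_end=True))  # 6, the checkpoint is unaffected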



lark/parsers/lalr_puppet.py  (+17, -72)

@@ -1,10 +1,10 @@
 # This module provide a LALR puppet, which is used to debugging and error handling

-from copy import deepcopy
+from copy import copy

 from .lalr_analysis import Shift, Reduce
 from .. import Token
-from ..exceptions import ParseError
+from ..exceptions import UnexpectedToken


 class ParserPuppet(object):
@@ -12,96 +12,44 @@ class ParserPuppet(object):

     For a simpler, more streamlined interface, see the ``on_error`` argument to ``Lark.parse()``.
     """
-    def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
+    def __init__(self, parser, parser_state, lexer_state):
         self.parser = parser
-        self._state_stack = state_stack
-        self._value_stack = value_stack
-        self._start = start
-        self._stream = stream
-        self._set_state = set_state
-
-        self.result = None
+        self.parser_state = parser_state
+        self.lexer_state = lexer_state

     def feed_token(self, token):
         """Feed the parser with a token, and advance it to the next state, as if it recieved it from the lexer.

         Note that ``token`` has to be an instance of ``Token``.
         """
-        end_state = self.parser.parse_table.end_states[self._start]
-        state_stack = self._state_stack
-        value_stack = self._value_stack
-
-        state = state_stack[-1]
-        action, arg = self.parser.parse_table.states[state][token.type]
-        if arg == end_state:
-            raise ParseError(arg)
-
-        while action is Reduce:
-            rule = arg
-            size = len(rule.expansion)
-            if size:
-                s = value_stack[-size:]
-                del state_stack[-size:]
-                del value_stack[-size:]
-            else:
-                s = []
-
-            value = self.parser.callbacks[rule](s)
-
-            _action, new_state = self.parser.parse_table.states[state_stack[-1]][rule.origin.name]
-            assert _action is Shift
-            state_stack.append(new_state)
-            value_stack.append(value)
-
-            if state_stack[-1] == end_state:
-                self.result = value_stack[-1]
-                return self.result
-
-            state = state_stack[-1]
-            try:
-                action, arg = self.parser.parse_table.states[state][token.type]
-            except KeyError as e:
-                raise ParseError(e)
-            assert arg != end_state
-
-        assert action is Shift
-        state_stack.append(arg)
-        value_stack.append(token)
-
-    def copy(self):
+        return self.parser_state.feed_token(token)
+
+    def __copy__(self):
         """Create a new puppet with a separate state.

         Calls to feed_token() won't affect the old puppet, and vice-versa.
         """
         return type(self)(
             self.parser,
-            list(self._state_stack),
-            deepcopy(self._value_stack),
-            self._start,
-            self._stream,
-            self._set_state,
+            copy(self.parser_state),
+            copy(self.lexer_state),
         )

     def __eq__(self, other):
         if not isinstance(other, ParserPuppet):
             return False

-        return (
-            self._state_stack == other._state_stack and
-            self._value_stack == other._value_stack and
-            self._stream == other._stream and
-            self._start == other._start
-        )
+        return self.parser_state == other.parser_state and self.lexer_state == other.lexer_state

     def __hash__(self):
-        return hash((tuple(self._state_stack), self._start))
+        return hash((self.parser_state, self.lexer_state))

     def pretty(self):
         """Print the output of ``choices()`` in a way that's easier to read."""
         out = ["Puppet choices:"]
         for k, v in self.choices().items():
             out.append('\t- %s -> %s' % (k, v))
-        out.append('stack size: %s' % len(self._state_stack))
+        out.append('stack size: %s' % len(self.parser_state.state_stack))
         return '\n'.join(out)

     def choices(self):
@@ -111,16 +59,16 @@ class ParserPuppet(object):

         Updated by ``feed_token()``.
         """
-        return self.parser.parse_table.states[self._state_stack[-1]]
+        return self.parser_state.parse_table.states[self.parser_state.position]

     def accepts(self):
         accepts = set()
         for t in self.choices():
             if t.isupper(): # is terminal?
-                new_puppet = self.copy()
+                new_puppet = copy(self)
                 try:
                     new_puppet.feed_token(Token(t, ''))
-                except ParseError:
+                except UnexpectedToken:
                     pass
                 else:
                     accepts.add(t)
@@ -128,7 +76,4 @@ class ParserPuppet(object):

     def resume_parse(self):
         """Resume parsing from the current puppet state."""
-        return self.parser.parse(
-            self._stream, self._start, self._set_state,
-            self._value_stack, self._state_stack
-        )
+        return self.parser.parse_from_state(self.parser_state)
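An end-to-end sketch of what the slimmer puppet enables: since it is now just (parser_state, lexer_state), copy() gives an independent checkpoint that can be probed and resumed. This is a hedged example, not from the diff, assuming lark at or after this commit; the grammar and input are invented, while the calls (accepts, feed_token, resume_parse, copy) are the ones the file above defines.

from copy import copy

from lark import Lark, Token, UnexpectedToken

parser = Lark(r'''
    start: "[" NUMBER ("," NUMBER)* "]"
    %import common.NUMBER
    %ignore " "
''', parser='lalr')

try:
    parser.parse("[1, 2 3]")            # missing comma before the 3
except UnexpectedToken as e:
    print(e.puppet.accepts())           # terminals the parser could shift here
    probe = copy(e.puppet)              # independent checkpoint; e.puppet is untouched
    probe.feed_token(Token('COMMA', ','))
    probe.feed_token(e.token)
    print(probe.resume_parse())         # finishes the parse and returns the tree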

tests/test_parser.py  (+36, -0)

@@ -2217,6 +2217,42 @@ def _make_parser_test(LEXER, PARSER):
             """, regex=True)
             self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')


+        @unittest.skipIf(PARSER!='lalr', "Puppet error handling only works with LALR for now")
+        def test_error_with_puppet(self):
+            def ignore_errors(e):
+                if isinstance(e, UnexpectedCharacters):
+                    # Skip bad character
+                    return True
+
+                # Must be UnexpectedToken
+                if e.token.type == 'COMMA':
+                    # Skip comma
+                    return True
+                elif e.token.type == 'SIGNED_NUMBER':
+                    # Try to feed a comma and retry the number
+                    e.puppet.feed_token(Token('COMMA', ','))
+                    e.puppet.feed_token(e.token)
+                    return True
+
+                # Unhandled error. Will stop parse and raise exception
+                return False
+
+            g = _Lark(r'''
+                start: "[" num ("," num)* "]"
+                ?num: SIGNED_NUMBER
+                %import common.SIGNED_NUMBER
+                %ignore " "
+            ''')
+            s = "[0 1, 2,, 3,,, 4, 5 6 ]"
+            tree = g.parse(s, on_error=ignore_errors)
+            res = [int(x) for x in tree.children]
+            assert res == list(range(7))
+
+            s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
+            tree = g.parse(s, on_error=ignore_errors)
+
+
 _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
 _TestParser.__name__ = _NAME
 _TestParser.__qualname__ = "tests.test_parser." + _NAME

