
Merge pull request #725 from lark-parser/resumable3

Erez Shinan authored 3 years ago, committed by GitHub
commit 6d6e22048d
6 changed files with 282 additions and 230 deletions
  1. lark/lark.py (+16 -3)
  2. lark/lexer.py (+79 -70)
  3. lark/parser_frontends.py (+37 -28)
  4. lark/parsers/lalr_parser.py (+95 -56)
  5. lark/parsers/lalr_puppet.py (+19 -73)
  6. tests/test_parser.py (+36 -0)

lark/lark.py (+16 -3)

@@ -1,4 +1,5 @@
 from __future__ import absolute_import
+from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken

 import sys, os, pickle, hashlib
 from io import open
@@ -9,7 +10,7 @@ from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf

-from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken
+from .lexer import Lexer, TraditionalLexer, TerminalDef
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend, _get_lexer_callbacks
 from .grammar import Rule
@@ -462,20 +463,32 @@ class Lark(Serialize):
         try:
             return self.parser.parse(text, start=start)
-        except UnexpectedToken as e:
+        except UnexpectedInput as e:
             if on_error is None:
                 raise

             while True:
+                if isinstance(e, UnexpectedCharacters):
+                    s = e.puppet.lexer_state.state
+                    p = s.line_ctr.char_pos
+
                 if not on_error(e):
                     raise e

+                if isinstance(e, UnexpectedCharacters):
+                    # If user didn't change the character position, then we should
+                    if p == s.line_ctr.char_pos:
+                        s.line_ctr.feed(s.text[p:p+1])
+
                 try:
                     return e.puppet.resume_parse()
                 except UnexpectedToken as e2:
-                    if e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
+                    if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
                         # Prevent infinite loop
                         raise e2
                     e = e2
+                except UnexpectedCharacters as e2:
+                    e = e2


 ###}
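
With this change, Lark.parse(text, on_error=...) can recover from both UnexpectedToken and UnexpectedCharacters: character errors are skipped by advancing the lexer position when the callback leaves it untouched, and token errors are repaired through e.puppet before resume_parse() is called. A rough usage sketch (grammar, input and repair policy are illustrative; they mirror the new test_error_with_puppet further down, and require parser='lalr'):

    from lark import Lark, Token
    from lark.exceptions import UnexpectedCharacters

    json_list = Lark(r'''
        start: "[" num ("," num)* "]"
        ?num: SIGNED_NUMBER
        %import common.SIGNED_NUMBER
        %ignore " "
    ''', parser='lalr')                                # puppet recovery needs the LALR parser

    def ignore_errors(e):
        if isinstance(e, UnexpectedCharacters):
            return True                                # skip the offending character
        if e.token.type == 'COMMA':
            return True                                # drop a stray comma
        if e.token.type == 'SIGNED_NUMBER':
            e.puppet.feed_token(Token('COMMA', ','))   # insert the missing comma...
            e.puppet.feed_token(e.token)               # ...then replay the number
            return True
        return False                                   # anything else: re-raise

    tree = json_list.parse("[0 1, 2,, 3]", on_error=ignore_errors)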

lark/lexer.py (+79 -70)

@@ -2,7 +2,7 @@

 import re

-from .utils import Str, classify, get_regexp_width, Py36, Serialize
+from .utils import Str, classify, get_regexp_width, Py36, Serialize, suppress
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken

 ###{standalone
@@ -157,6 +157,8 @@ class Token(Str):


 class LineCounter:
+    __slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char'
+
     def __init__(self, newline_char):
         self.newline_char = newline_char
         self.char_pos = 0
@@ -167,7 +169,7 @@ class LineCounter:
     def feed(self, token, test_newline=True):
         """Consume a token and calculate the new line & column.

-        As an optional optimization, set test_newline=False is token doesn't contain a newline.
+        As an optional optimization, set test_newline=False if token doesn't contain a newline.
         """
         if test_newline:
             newlines = token.count(self.newline_char)
@@ -178,49 +180,6 @@ class LineCounter:
         self.char_pos += len(token)
         self.column = self.char_pos - self.line_start_pos + 1

-
-class _Lex:
-    "Built to serve both Lexer and ContextualLexer"
-    def __init__(self, lexer, state=None):
-        self.lexer = lexer
-        self.state = state
-
-    def lex(self, stream, newline_types, ignore_types):
-        newline_types = frozenset(newline_types)
-        ignore_types = frozenset(ignore_types)
-        line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
-        last_token = None
-
-        while line_ctr.char_pos < len(stream):
-            lexer = self.lexer
-            res = lexer.match(stream, line_ctr.char_pos)
-            if not res:
-                allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types
-                if not allowed:
-                    allowed = {"<END-OF-FILE>"}
-                raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])
-
-            value, type_ = res
-
-            if type_ not in ignore_types:
-                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                line_ctr.feed(value, type_ in newline_types)
-                t.end_line = line_ctr.line
-                t.end_column = line_ctr.column
-                t.end_pos = line_ctr.char_pos
-                if t.type in lexer.callback:
-                    t = lexer.callback[t.type](t)
-                    if not isinstance(t, Token):
-                        raise ValueError("Callbacks must return a token (returned %r)" % t)
-                yield t
-                last_token = t
-            else:
-                if type_ in lexer.callback:
-                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                    lexer.callback[type_](t2)
-                line_ctr.feed(value, type_ in newline_types)
-
-


 class UnlessCallback:
     def __init__(self, mres):
@@ -286,7 +245,6 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes)
         except AssertionError:  # Yes, this is what Python provides us.. :/
             return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)

-        # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres
@@ -308,10 +266,14 @@ class Lexer(object):
     """Lexer interface

     Method Signatures:
-        lex(self, stream) -> Iterator[Token]
+        lex(self, text) -> Iterator[Token]
     """
     lex = NotImplemented

+    def make_lexer_state(self, text):
+        line_ctr = LineCounter(b'\n' if isinstance(text, bytes) else '\n')
+        return LexerState(text, line_ctr)
+

 class TraditionalLexer(Lexer):

@@ -335,8 +297,8 @@ class TraditionalLexer(Lexer):
         assert set(conf.ignore) <= {t.name for t in terminals}

         # Init
-        self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
-        self.ignore_types = list(conf.ignore)
+        self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
+        self.ignore_types = frozenset(conf.ignore)

         terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
         self.terminals = terminals
@@ -345,7 +307,6 @@ class TraditionalLexer(Lexer):
         self.use_bytes = conf.use_bytes

         self._mres = None
-        # self.build(g_regex_flags)

     def _build(self):
         terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
@@ -366,17 +327,61 @@ class TraditionalLexer(Lexer):
             self._build()
         return self._mres

-    def match(self, stream, pos):
+    def match(self, text, pos):
         for mre, type_from_index in self.mres:
-            m = mre.match(stream, pos)
+            m = mre.match(text, pos)
             if m:
                 return m.group(0), type_from_index[m.lastindex]

-    def lex(self, stream):
-        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
+    def lex(self, state, parser_state):
+        with suppress(EOFError):
+            while True:
+                yield self.next_token(state)
+
+    def next_token(self, lex_state):
+        line_ctr = lex_state.line_ctr
+        while line_ctr.char_pos < len(lex_state.text):
+            res = self.match(lex_state.text, line_ctr.char_pos)
+            if not res:
+                allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
+                if not allowed:
+                    allowed = {"<END-OF-FILE>"}
+                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
+                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
+
+            value, type_ = res
+
+            if type_ not in self.ignore_types:
+                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                line_ctr.feed(value, type_ in self.newline_types)
+                t.end_line = line_ctr.line
+                t.end_column = line_ctr.column
+                t.end_pos = line_ctr.char_pos
+                if t.type in self.callback:
+                    t = self.callback[t.type](t)
+                    if not isinstance(t, Token):
+                        raise ValueError("Callbacks must return a token (returned %r)" % t)
+                lex_state.last_token = t
+                return t
+            else:
+                if type_ in self.callback:
+                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                    self.callback[type_](t2)
+                line_ctr.feed(value, type_ in self.newline_types)
+
+        # EOF
+        raise EOFError(self)
+
+
+class LexerState:
+    __slots__ = 'text', 'line_ctr', 'last_token'
+
+    def __init__(self, text, line_ctr, last_token=None):
+        self.text = text
+        self.line_ctr = line_ctr
+        self.last_token = last_token
+
+    def __copy__(self):
+        return type(self)(self.text, copy(self.line_ctr), self.last_token)
+

 class ContextualLexer(Lexer):

@@ -409,25 +414,29 @@ class ContextualLexer(Lexer):
         assert trad_conf.tokens is terminals
         self.root_lexer = TraditionalLexer(trad_conf)

-    def lex(self, stream, get_parser_state):
-        parser_state = get_parser_state()
-        l = _Lex(self.lexers[parser_state], parser_state)
+    def make_lexer_state(self, text):
+        return self.root_lexer.make_lexer_state(text)
+
+    def lex(self, lexer_state, parser_state):
         try:
-            for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
-                yield x
-                parser_state = get_parser_state()
-                l.lexer = self.lexers[parser_state]
-                l.state = parser_state # For debug only, no need to worry about multithreading
+            while True:
+                lexer = self.lexers[parser_state.position]
+                yield lexer.next_token(lexer_state)
+        except EOFError:
+            pass
         except UnexpectedCharacters as e:
-            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
-            # but not in the current context.
+            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
             # This tests the input against the global context, to provide a nicer error.
-            root_match = self.root_lexer.match(stream, e.pos_in_stream)
-            if not root_match:
-                raise
-
-            value, type_ = root_match
-            t = Token(type_, value, e.pos_in_stream, e.line, e.column)
-            raise UnexpectedToken(t, e.allowed, state=e.state)
+            token = self.root_lexer.next_token(lexer_state)
+            raise UnexpectedToken(token, e.allowed, state=parser_state.position)
+
+
+class LexerThread:
+    "A thread that ties a lexer instance and a lexer state, to be used by the parser"
+
+    def __init__(self, lexer, text):
+        self.lexer = lexer
+        self.state = lexer.make_lexer_state(text)
+
+    def lex(self, parser_state):
+        return self.lexer.lex(self.state, parser_state)
 ###}
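
The net effect of this rewrite is that a lexer no longer owns a cursor: every mutable detail (text, LineCounter position, last token) lives in a LexerState, which LexerThread owns and passes back in on every call, so a parse can be snapshotted by copying that state. A self-contained toy with the same shape, independent of lark internals (all names below are made up for illustration):

    from copy import copy

    class ToyState:                      # plays the role of LexerState: all mutable cursor data
        def __init__(self, text, pos=0):
            self.text, self.pos = text, pos
        def __copy__(self):
            return ToyState(self.text, self.pos)

    class ToyLexer:                      # plays the role of TraditionalLexer: holds no position
        def make_lexer_state(self, text):
            return ToyState(text)
        def next_token(self, state):
            while state.pos < len(state.text) and state.text[state.pos] == ' ':
                state.pos += 1
            if state.pos >= len(state.text):
                raise EOFError(self)     # same end-of-input convention as the new next_token
            start = state.pos
            while state.pos < len(state.text) and state.text[state.pos] != ' ':
                state.pos += 1
            return state.text[start:state.pos]
        def lex(self, state, parser_state):
            try:
                while True:
                    yield self.next_token(state)
            except EOFError:
                pass

    lexer = ToyLexer()
    state = lexer.make_lexer_state("a bb  ccc")
    snapshot = copy(state)               # cheap snapshot, which is what resumable parsing relies on
    assert list(lexer.lex(state, None)) == ['a', 'bb', 'ccc']
    assert list(lexer.lex(snapshot, None)) == ['a', 'bb', 'ccc']   # resumes from the copied position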

lark/parser_frontends.py (+37 -28)

@@ -1,6 +1,6 @@
 from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
+from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
@@ -23,12 +23,22 @@ def get_frontend(parser, lexer):
         elif lexer == 'contextual':
             return LALR_ContextualLexer
         elif issubclass(lexer, Lexer):
+            class CustomLexerWrapper(Lexer):
+                def __init__(self, lexer_conf):
+                    self.lexer = lexer(lexer_conf)
+                def lex(self, lexer_state, parser_state):
+                    return self.lexer.lex(lexer_state.text)
+
             class LALR_CustomLexerWrapper(LALR_CustomLexer):
                 def __init__(self, lexer_conf, parser_conf, options=None):
                     super(LALR_CustomLexerWrapper, self).__init__(
                         lexer, lexer_conf, parser_conf, options=options)
                 def init_lexer(self):
-                    self.lexer = lexer(self.lexer_conf)
+                    future_interface = getattr(lexer, '__future_interface__', False)
+                    if future_interface:
+                        self.lexer = lexer(self.lexer_conf)
+                    else:
+                        self.lexer = CustomLexerWrapper(self.lexer_conf)

             return LALR_CustomLexerWrapper
         else:
@@ -54,7 +64,7 @@ def get_frontend(parser, lexer):


 class _ParserFrontend(Serialize):
-    def _parse(self, input, start, *args):
+    def _parse(self, start, input, *args):
         if start is None:
             start = self.start
         if len(start) > 1:
@@ -71,6 +81,18 @@ def _get_lexer_callbacks(transformer, terminals):
         result[terminal.name] = callback
     return result

+class PostLexConnector:
+    def __init__(self, lexer, postlexer):
+        self.lexer = lexer
+        self.postlexer = postlexer
+
+    def make_lexer_state(self, text):
+        return self.lexer.make_lexer_state(text)
+
+    def lex(self, lexer_state, parser_state):
+        i = self.lexer.lex(lexer_state, parser_state)
+        return self.postlexer.process(i)
+

 class WithLexer(_ParserFrontend):
     lexer = None
@@ -106,13 +128,14 @@ class WithLexer(_ParserFrontend):
     def _serialize(self, data, memo):
         data['parser'] = data['parser'].serialize(memo)

-    def lex(self, *args):
-        stream = self.lexer.lex(*args)
-        return self.postlex.process(stream) if self.postlex else stream
+    def make_lexer(self, text):
+        lexer = self.lexer
+        if self.postlex:
+            lexer = PostLexConnector(self.lexer, self.postlex)
+        return LexerThread(lexer, text)

     def parse(self, text, start=None):
-        token_stream = self.lex(text)
-        return self._parse(token_stream, start)
+        return self._parse(start, self.make_lexer(text))

     def init_traditional_lexer(self):
         self.lexer = TraditionalLexer(self.lexer_conf)
@@ -138,14 +161,6 @@ class LALR_ContextualLexer(LALR_WithLexer):
         always_accept = self.postlex.always_accept if self.postlex else ()
         self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)

-
-    def parse(self, text, start=None):
-        parser_state = [None]
-        def set_parser_state(s):
-            parser_state[0] = s
-
-        token_stream = self.lex(text, lambda: parser_state[0])
-        return self._parse(token_stream, start, set_parser_state)
 ###}


 class LALR_CustomLexer(LALR_WithLexer):
@@ -156,15 +171,6 @@ class LALR_CustomLexer(LALR_WithLexer):
         WithLexer.__init__(self, lexer_conf, parser_conf, options)


-def tokenize_text(text):
-    line = 1
-    col_start_pos = 0
-    for i, ch in enumerate(text):
-        if '\n' in ch:
-            line += ch.count('\n')
-            col_start_pos = i + ch.rindex('\n')
-        yield Token('CHAR', ch, line=line, column=i - col_start_pos)
-
 class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         WithLexer.__init__(self, lexer_conf, parser_conf, options)
@@ -175,6 +181,9 @@ class Earley(WithLexer):
         tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
         self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)

+    def make_lexer(self, text):
+        return WithLexer.make_lexer(self, text).lex(None)
+
     def match(self, term, token):
         return term.name == token.type

@@ -219,7 +228,7 @@ class XEarley(_ParserFrontend):
             self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

     def parse(self, text, start):
-        return self._parse(text, start)
+        return self._parse(start, text)

 class XEarley_CompleteLex(XEarley):
     def __init__(self, *args, **kw):
@@ -239,8 +248,8 @@ class CYK(WithLexer):
         self.callbacks = parser_conf.callbacks

     def parse(self, text, start):
-        tokens = list(self.lex(text))
-        parse = self._parse(tokens, start)
+        tokens = list(self.make_lexer(text).lex(None))
+        parse = self._parse(start, tokens)
         parse = self._transform(parse)
         return parse
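
Existing custom lexers keep working: unless the lexer class declares __future_interface__ = True, get_frontend wraps it in CustomLexerWrapper, so the old lex(text) signature is still what gets called. A hedged sketch of the old-style case (the grammar and lexer here are illustrative, not part of this diff):

    from lark import Lark, Token
    from lark.lexer import Lexer

    class OldStyleLexer(Lexer):
        # Old interface: receives the raw text; CustomLexerWrapper adapts it
        # to the new lex(lexer_state, parser_state) calls.
        def __init__(self, lexer_conf):
            pass
        def lex(self, text):
            for i, word in enumerate(text.split()):
                yield Token('WORD', word, None, 1, i + 1)

    parser = Lark(r'''
        start: WORD+
        WORD: /\w+/
    ''', parser='lalr', lexer=OldStyleLexer)

    print(parser.parse("custom lexers still work"))

    # A lexer that sets __future_interface__ = True would instead implement
    # make_lexer_state(text) and lex(lexer_state, parser_state) itself.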




lark/parsers/lalr_parser.py (+95 -56)

@@ -2,9 +2,9 @@
 """
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
-from ..exceptions import UnexpectedToken
+from copy import deepcopy, copy
+from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
 from ..lexer import Token
-from ..utils import Enumerator, Serialize

 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 from .lalr_puppet import ParserPuppet
@@ -35,84 +35,123 @@ class LALR_Parser(object):
         return self.parser.parse(*args)


-class _Parser:
-    def __init__(self, parse_table, callbacks, debug=False):
+class ParseConf:
+    __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'
+
+    def __init__(self, parse_table, callbacks, start):
         self.parse_table = parse_table
+
+        self.start_state = self.parse_table.start_states[start]
+        self.end_state = self.parse_table.end_states[start]
+        self.states = self.parse_table.states
+
         self.callbacks = callbacks
-        self.debug = debug
+        self.start = start

-    def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None):
-        token = None
-        stream = iter(seq)
-        states = self.parse_table.states
-        start_state = self.parse_table.start_states[start]
-        end_state = self.parse_table.end_states[start]

-        state_stack = state_stack or [start_state]
-        value_stack = value_stack or []
+class ParserState:
+    __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'

-        if set_state: set_state(start_state)
+    def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):
+        self.parse_conf = parse_conf
+        self.lexer = lexer
+        self.state_stack = state_stack or [self.parse_conf.start_state]
+        self.value_stack = value_stack or []

-        def get_action(token):
+    @property
+    def position(self):
+        return self.state_stack[-1]
+
+    def __copy__(self):
+        return type(self)(
+            self.parse_conf,
+            self.lexer, # XXX copy
+            copy(self.state_stack),
+            deepcopy(self.value_stack),
+        )
+
+    def copy(self):
+        return copy(self)
+
+    def feed_token(self, token, is_end=False):
+        state_stack = self.state_stack
+        value_stack = self.value_stack
+        states = self.parse_conf.states
+        end_state = self.parse_conf.end_state
+        callbacks = self.parse_conf.callbacks
+
+        while True:
             state = state_stack[-1]
             try:
-                return states[state][token.type]
+                action, arg = states[state][token.type]
             except KeyError:
                 expected = {s for s in states[state].keys() if s.isupper()}
-                try:
-                    puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
-                except NameError: # For standalone parser
-                    puppet = None
-                raise UnexpectedToken(token, expected, state=state, puppet=puppet)
-
-        def reduce(rule):
-            size = len(rule.expansion)
-            if size:
-                s = value_stack[-size:]
-                del state_stack[-size:]
-                del value_stack[-size:]
+                raise UnexpectedToken(token, expected, state=state, puppet=None)
+
+            assert arg != end_state
+
+            if action is Shift:
+                # shift once and return
+                assert not is_end
+                state_stack.append(arg)
+                value_stack.append(token)
+                return arg
             else:
-                s = []
+                # reduce+shift as many times as necessary
+                rule = arg
+                size = len(rule.expansion)
+                if size:
+                    s = value_stack[-size:]
+                    del state_stack[-size:]
+                    del value_stack[-size:]
+                else:
+                    s = []
+
+                value = callbacks[rule](s)
+
+                _action, new_state = states[state_stack[-1]][rule.origin.name]
+                assert _action is Shift
+                state_stack.append(new_state)
+                value_stack.append(value)
+
+                if is_end and state_stack[-1] == end_state:
+                    return value_stack[-1]

-            value = self.callbacks[rule](s)
+
+class _Parser:
+    def __init__(self, parse_table, callbacks, debug=False):
+        self.parse_table = parse_table
+        self.callbacks = callbacks
+        self.debug = debug

-            _action, new_state = states[state_stack[-1]][rule.origin.name]
-            assert _action is Shift
-            state_stack.append(new_state)
-            value_stack.append(value)
+    def parse(self, lexer, start, value_stack=None, state_stack=None):
+        parse_conf = ParseConf(self.parse_table, self.callbacks, start)
+        parser_state = ParserState(parse_conf, lexer, state_stack, value_stack)
+        return self.parse_from_state(parser_state)

+    def parse_from_state(self, state):
         # Main LALR-parser loop
         try:
-            for token in stream:
-                while True:
-                    action, arg = get_action(token)
-                    assert arg != end_state
-
-                    if action is Shift:
-                        state_stack.append(arg)
-                        value_stack.append(token)
-                        if set_state: set_state(arg)
-                        break # next token
-                    else:
-                        reduce(arg)
+            token = None
+            for token in state.lexer.lex(state):
+                state.feed_token(token)
+            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+            return state.feed_token(token, True)
+        except UnexpectedInput as e:
+            try:
+                e.puppet = ParserPuppet(self, state, state.lexer)
+            except NameError:
+                pass
+            raise e
         except Exception as e:
             if self.debug:
                 print("")
                 print("STATE STACK DUMP")
                 print("----------------")
-                for i, s in enumerate(state_stack):
+                for i, s in enumerate(state.state_stack):
                     print('%d)' % i , s)
                 print("")

             raise
-
-        token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
-        while True:
-            _action, arg = get_action(token)
-            assert(_action is Reduce)
-            reduce(arg)
-            if state_stack[-1] == end_state:
-                return value_stack[-1]
-
 ###}
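
The parser is now split into ParseConf (the immutable tables, callbacks and start/end states), ParserState (the copyable stacks plus the lexer handle), and _Parser (the driving loop), with feed_token() as the single-step primitive that the puppet reuses. A hedged sketch of what that buys at the public API level (grammar and repair are illustrative; RPAR is the name lark auto-assigns to the ")" terminal):

    from lark import Lark, Token
    from lark.exceptions import UnexpectedToken

    calc = Lark(r'''
        start: "(" NUMBER ")"
        %import common.NUMBER
    ''', parser='lalr')

    try:
        calc.parse("(42")                      # missing ")" -> UnexpectedToken on $END
    except UnexpectedToken as e:
        puppet = e.puppet                      # wraps the live ParserState + lexer state
        puppet.feed_token(Token('RPAR', ')'))  # ParserState.feed_token: reduce+shift one token
        tree = puppet.resume_parse()           # re-enters _Parser.parse_from_state
        print(tree.pretty())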



lark/parsers/lalr_puppet.py (+19 -73)

@@ -1,10 +1,10 @@
 # This module provide a LALR puppet, which is used to debugging and error handling

-from copy import deepcopy
+from copy import copy

 from .lalr_analysis import Shift, Reduce
 from .. import Token
-from ..exceptions import ParseError
+from ..exceptions import UnexpectedToken


 class ParserPuppet(object):
@@ -12,96 +12,45 @@ class ParserPuppet(object):

     For a simpler, more streamlined interface, see the ``on_error`` argument to ``Lark.parse()``.
     """
-    def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
+    def __init__(self, parser, parser_state, lexer_state):
         self.parser = parser
-        self._state_stack = state_stack
-        self._value_stack = value_stack
-        self._start = start
-        self._stream = stream
-        self._set_state = set_state
-
-        self.result = None
+        self.parser_state = parser_state
+        self.lexer_state = lexer_state

     def feed_token(self, token):
         """Feed the parser with a token, and advance it to the next state, as if it received it from the lexer.

         Note that ``token`` has to be an instance of ``Token``.
         """
-        end_state = self.parser.parse_table.end_states[self._start]
-        state_stack = self._state_stack
-        value_stack = self._value_stack
-
-        state = state_stack[-1]
-        action, arg = self.parser.parse_table.states[state][token.type]
-        if arg == end_state:
-            raise ParseError(arg)
-
-        while action is Reduce:
-            rule = arg
-            size = len(rule.expansion)
-            if size:
-                s = value_stack[-size:]
-                del state_stack[-size:]
-                del value_stack[-size:]
-            else:
-                s = []
-
-            value = self.parser.callbacks[rule](s)
-
-            _action, new_state = self.parser.parse_table.states[state_stack[-1]][rule.origin.name]
-            assert _action is Shift
-            state_stack.append(new_state)
-            value_stack.append(value)
-
-            if state_stack[-1] == end_state:
-                self.result = value_stack[-1]
-                return self.result
-
-            state = state_stack[-1]
-            try:
-                action, arg = self.parser.parse_table.states[state][token.type]
-            except KeyError as e:
-                raise ParseError(e)
-            assert arg != end_state
-
-        assert action is Shift
-        state_stack.append(arg)
-        value_stack.append(token)
-
-    def copy(self):
+        return self.parser_state.feed_token(token)
+
+    def __copy__(self):
         """Create a new puppet with a separate state.

         Calls to feed_token() won't affect the old puppet, and vice-versa.
         """
         return type(self)(
             self.parser,
-            list(self._state_stack),
-            deepcopy(self._value_stack),
-            self._start,
-            self._stream,
-            self._set_state,
+            copy(self.parser_state),
+            copy(self.lexer_state),
         )

     def __eq__(self, other):
         if not isinstance(other, ParserPuppet):
             return False

-        return (
-            self._state_stack == other._state_stack and
-            self._value_stack == other._value_stack and
-            self._stream == other._stream and
-            self._start == other._start
-        )
+        return self.parser_state == other.parser_state and self.lexer_state == other.lexer_state

-    def __hash__(self):
-        return hash((tuple(self._state_stack), self._start))
+    # TODO Provide with an immutable puppet instance
+    # def __hash__(self):
+    #     return hash((self.parser_state, self.lexer_state))

     def pretty(self):
         """Print the output of ``choices()`` in a way that's easier to read."""
         out = ["Puppet choices:"]
         for k, v in self.choices().items():
             out.append('\t- %s -> %s' % (k, v))
-        out.append('stack size: %s' % len(self._state_stack))
+        out.append('stack size: %s' % len(self.parser_state.state_stack))
         return '\n'.join(out)

     def choices(self):
@@ -111,16 +60,16 @@ class ParserPuppet(object):

         Updated by ``feed_token()``.
         """
-        return self.parser.parse_table.states[self._state_stack[-1]]
+        return self.parser_state.parse_table.states[self.parser_state.position]

     def accepts(self):
         accepts = set()
         for t in self.choices():
             if t.isupper(): # is terminal?
-                new_puppet = self.copy()
+                new_puppet = copy(self)
                 try:
                     new_puppet.feed_token(Token(t, ''))
-                except ParseError:
+                except UnexpectedToken:
                     pass
                 else:
                     accepts.add(t)
@@ -128,7 +77,4 @@ class ParserPuppet(object):

     def resume_parse(self):
         """Resume parsing from the current puppet state."""
-        return self.parser.parse(
-            self._stream, self._start, self._set_state,
-            self._value_stack, self._state_stack
-        )
+        return self.parser.parse_from_state(self.parser_state)
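
Because the puppet now just wraps a ParserState, copying it gives an exploration point whose stacks are independent of the original (the lexer handle is still shared, as the "XXX copy" note above admits), so feeding tokens to the copy cannot disturb the original parse. A hedged sketch of probing a repair on a throw-away copy before committing it (grammar and token names are illustrative; lark auto-names the "a"/"b"/"c" terminals A/B/C):

    from copy import copy
    from lark import Lark, Token
    from lark.exceptions import UnexpectedToken

    abc = Lark(r'''
        start: "a" "b" "c"
    ''', parser='lalr', lexer='standard', keep_all_tokens=True)

    try:
        abc.parse("ac")                          # the "b" is missing
    except UnexpectedToken as e:
        probe = copy(e.puppet)                   # its own state/value stacks; safe to mutate
        try:
            probe.feed_token(Token('B', 'b'))    # try the repair on the copy...
            probe.feed_token(e.token)            # ...and replay the token that failed
        except UnexpectedToken:
            raise                                # the repair doesn't help; give up
        e.puppet.feed_token(Token('B', 'b'))     # probe succeeded: commit for real
        e.puppet.feed_token(e.token)
        print(e.puppet.resume_parse().pretty())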

tests/test_parser.py (+36 -0)

@@ -2217,6 +2217,42 @@ def _make_parser_test(LEXER, PARSER):
             """, regex=True)
             self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

+
+        @unittest.skipIf(PARSER!='lalr', "Puppet error handling only works with LALR for now")
+        def test_error_with_puppet(self):
+            def ignore_errors(e):
+                if isinstance(e, UnexpectedCharacters):
+                    # Skip bad character
+                    return True
+
+                # Must be UnexpectedToken
+                if e.token.type == 'COMMA':
+                    # Skip comma
+                    return True
+                elif e.token.type == 'SIGNED_NUMBER':
+                    # Try to feed a comma and retry the number
+                    e.puppet.feed_token(Token('COMMA', ','))
+                    e.puppet.feed_token(e.token)
+                    return True
+
+                # Unhandled error. Will stop parse and raise exception
+                return False
+
+            g = _Lark(r'''
+                start: "[" num ("," num)* "]"
+                ?num: SIGNED_NUMBER
+                %import common.SIGNED_NUMBER
+                %ignore " "
+            ''')
+            s = "[0 1, 2,, 3,,, 4, 5 6 ]"
+            tree = g.parse(s, on_error=ignore_errors)
+            res = [int(x) for x in tree.children]
+            assert res == list(range(7))
+
+            s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
+            tree = g.parse(s, on_error=ignore_errors)
+

     _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
     _TestParser.__name__ = _NAME
     _TestParser.__qualname__ = "tests.test_parser." + _NAME

