
Completely redesigned the interface between Lexer<->Parser, and refactored LALR parser and lexers

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.0
Erez Sh, 4 years ago
parent commit 72e7926097
6 changed files with 215 additions and 185 deletions
  1. lark/lark.py (+6 -3)
  2. lark/lexer.py (+33 -24)
  3. lark/parser_frontends.py (+33 -28)
  4. lark/parsers/lalr_parser.py (+90 -58)
  5. lark/parsers/lalr_puppet.py (+17 -72)
  6. tests/test_parser.py (+36 -0)
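Before the per-file hunks, a minimal sketch of the interface this commit converges on may help: lexers now expose make_lexer_state(text) plus a lex(lexer_state, parser_state) generator, and LexerThread ties one lexer instance to one mutable LexerState. The class and method names mirror the ones added in lark/lexer.py; the whitespace-splitting token logic below is invented purely for illustration and is not lark's implementation.

# Illustrative stand-ins only: they copy the *shape* of the new interface
# (make_lexer_state / lex(lexer_state, parser_state) / LexerThread), not
# lark's actual classes. The toy tokenizer is made up.

class LexerState:
    # Mutable cursor over the input, shared between the lexer and the parser.
    def __init__(self, text, pos=0):
        self.text = text
        self.pos = pos

class ToyLexer:
    def make_lexer_state(self, text):
        return LexerState(text)

    def lex(self, lexer_state, parser_state):
        # A contextual lexer could inspect parser_state (e.g. its current
        # LALR state, exposed as parser_state.position in this commit) to
        # decide which terminals are allowed right now.
        for word in lexer_state.text.split():
            lexer_state.pos += len(word) + 1
            yield word

class LexerThread:
    # Ties a lexer instance to a lexer state, as in the new lark/lexer.py.
    def __init__(self, lexer, text):
        self.lexer = lexer
        self.state = lexer.make_lexer_state(text)

    def lex(self, parser_state):
        return self.lexer.lex(self.state, parser_state)

if __name__ == "__main__":
    thread = LexerThread(ToyLexer(), "a b c")
    print(list(thread.lex(parser_state=None)))  # ['a', 'b', 'c']

The payoff shows up in lark/parsers/lalr_parser.py below: the parser hands its live ParserState to the lexer on every pull, so the contextual lexer reads parser_state.position directly instead of going through the old get_parser_state callback, and error recovery can snapshot both states together.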

lark/lark.py (+6 -3)

@@ -1,4 +1,5 @@
from __future__ import absolute_import
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken

import sys, os, pickle, hashlib
from io import open
@@ -9,7 +10,7 @@ from .load_grammar import load_grammar
from .tree import Tree
from .common import LexerConf, ParserConf

from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken
from .lexer import Lexer, TraditionalLexer, TerminalDef
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend, _get_lexer_callbacks
from .grammar import Rule
@@ -462,7 +463,7 @@ class Lark(Serialize):

try:
return self.parser.parse(text, start=start)
except UnexpectedToken as e:
except UnexpectedInput as e:
if on_error is None:
raise

@@ -472,10 +473,12 @@ class Lark(Serialize):
try:
return e.puppet.resume_parse()
except UnexpectedToken as e2:
if e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
# Prevent infinite loop
raise e2
e = e2
except UnexpectedCharacters as e2:
e = e2


###}
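The hunk above only shows the changed lines of the on_error retry loop in Lark.parse: it now catches UnexpectedInput rather than just UnexpectedToken, and also swallows UnexpectedCharacters when resuming. A runnable toy of that retry pattern, with invented stand-in classes rather than lark's real exceptions and puppet:

# Invented stand-ins; only the control flow (call the handler, resume from
# the puppet, loop until it succeeds or the handler gives up) mirrors the
# on_error loop in Lark.parse.

class ToyUnexpectedInput(Exception):
    def __init__(self, puppet):
        self.puppet = puppet

class ToyPuppet:
    # Pretends to be a parser checkpoint that can resume after recovery.
    def __init__(self, tokens):
        self.tokens = tokens

    def resume_parse(self):
        if 'bad' in self.tokens:
            self.tokens.remove('bad')          # "skip" one bad token
            raise ToyUnexpectedInput(self)     # ...and fail again further on
        return self.tokens

def parse_with_recovery(tokens, on_error):
    try:
        if 'bad' in tokens:
            raise ToyUnexpectedInput(ToyPuppet(tokens))
        return tokens
    except ToyUnexpectedInput as e:
        while True:
            if not on_error(e):   # handler returns falsy: give up
                raise
            try:
                return e.puppet.resume_parse()
            except ToyUnexpectedInput as e2:
                e = e2            # keep the most recent error and retry

print(parse_with_recovery(['a', 'bad', 'bad', 'b'], on_error=lambda e: True))
# -> ['a', 'b']

The real loop additionally guards against an infinite loop when the old and new errors are both the same $END token from the same puppet, as the changed lines above show.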

lark/lexer.py (+33 -24)

@@ -157,6 +157,8 @@ class Token(Str):


class LineCounter:
__slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char'

def __init__(self, newline_char):
self.newline_char = newline_char
self.char_pos = 0
@@ -167,7 +169,7 @@ class LineCounter:
def feed(self, token, test_newline=True):
"""Consume a token and calculate the new line & column.

As an optional optimization, set test_newline=False is token doesn't contain a newline.
As an optional optimization, set test_newline=False if token doesn't contain a newline.
"""
if test_newline:
newlines = token.count(self.newline_char)
@@ -243,7 +245,6 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes)
except AssertionError: # Yes, this is what Python provides us.. :/
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)

# terms_from_name = {t.name: t for t in terminals[:max_size]}
mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
terminals = terminals[max_size:]
return mres
@@ -269,6 +270,10 @@ class Lexer(object):
"""
lex = NotImplemented

def make_lexer_state(self, text):
line_ctr = LineCounter(b'\n' if isinstance(text, bytes) else '\n')
return LexerState(text, line_ctr)


class TraditionalLexer(Lexer):

@@ -328,26 +333,21 @@ class TraditionalLexer(Lexer):
if m:
return m.group(0), type_from_index[m.lastindex]

def make_lexer_state(self, text):
line_ctr = LineCounter('\n' if not self.use_bytes else b'\n')
return LexerState(text, line_ctr)

def lex(self, text):
state = self.make_lexer_state(text)
def lex(self, state, parser_state):
with suppress(EOFError):
while True:
yield self.next_token(state)

def next_token(self, lex_state):
text = lex_state.text
line_ctr = lex_state.line_ctr
while line_ctr.char_pos < len(text):
res = self.match(text, line_ctr.char_pos)
while line_ctr.char_pos < len(lex_state.text):
res = self.match(lex_state.text, line_ctr.char_pos)
if not res:
allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
if not allowed:
allowed = {"<END-OF-FILE>"}
raise UnexpectedCharacters(text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])

value, type_ = res

@@ -373,11 +373,15 @@ class TraditionalLexer(Lexer):
raise EOFError(self)

class LexerState:
__slots__ = 'text', 'line_ctr', 'last_token'

def __init__(self, text, line_ctr, last_token=None):
self.text = text
self.line_ctr = line_ctr
self.last_token = last_token

def __copy__(self):
return type(self)(self.text, copy(self.line_ctr), self.last_token)

class ContextualLexer(Lexer):

@@ -410,24 +414,29 @@ class ContextualLexer(Lexer):
assert trad_conf.tokens is terminals
self.root_lexer = TraditionalLexer(trad_conf)

def lex(self, text, get_parser_state):
state = self.root_lexer.make_lexer_state(text)
def make_lexer_state(self, text):
return self.root_lexer.make_lexer_state(text)

def lex(self, lexer_state, parser_state):
try:
while True:
lexer = self.lexers[get_parser_state()]
yield lexer.next_token(state)
lexer = self.lexers[parser_state.position]
yield lexer.next_token(lexer_state)
except EOFError:
pass
except UnexpectedCharacters as e:
# In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
# but not in the current context.
# In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
# This tests the input against the global context, to provide a nicer error.
root_match = self.root_lexer.match(text, e.pos_in_stream)
if not root_match:
raise
value, type_ = root_match
t = Token(type_, value, e.pos_in_stream, e.line, e.column)
raise UnexpectedToken(t, e.allowed, state=get_parser_state())
token = self.root_lexer.next_token(lexer_state)
raise UnexpectedToken(token, e.allowed, state=parser_state.position)

class LexerThread:
"A thread that ties a lexer instance and a lexer state, to be used by the parser"

def __init__(self, lexer, text):
self.lexer = lexer
self.state = lexer.make_lexer_state(text)

def lex(self, parser_state):
return self.lexer.lex(self.state, parser_state)
###}

lark/parser_frontends.py (+33 -28)

@@ -1,6 +1,6 @@
from .utils import get_regexp_width, Serialize
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .grammar import Rule
@@ -23,12 +23,18 @@ def get_frontend(parser, lexer):
elif lexer == 'contextual':
return LALR_ContextualLexer
elif issubclass(lexer, Lexer):
class CustomLexerWrapper(Lexer):
def __init__(self, lexer_conf):
self.lexer = lexer(lexer_conf)
def lex(self, lexer_state, parser_state):
return self.lexer.lex(lexer_state.text)

class LALR_CustomLexerWrapper(LALR_CustomLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
super(LALR_CustomLexerWrapper, self).__init__(
lexer, lexer_conf, parser_conf, options=options)
def init_lexer(self):
self.lexer = lexer(self.lexer_conf)
self.lexer = CustomLexerWrapper(self.lexer_conf)

return LALR_CustomLexerWrapper
else:
@@ -54,7 +60,7 @@ def get_frontend(parser, lexer):


class _ParserFrontend(Serialize):
def _parse(self, input, start, *args):
def _parse(self, start, input, *args):
if start is None:
start = self.start
if len(start) > 1:
@@ -71,6 +77,18 @@ def _get_lexer_callbacks(transformer, terminals):
result[terminal.name] = callback
return result

class PostLexConnector:
def __init__(self, lexer, postlexer):
self.lexer = lexer
self.postlexer = postlexer

def make_lexer_state(self, text):
return self.lexer.make_lexer_state(text)

def lex(self, lexer_state, parser_state):
i = self.lexer.lex(lexer_state, parser_state)
return self.postlexer.process(i)


class WithLexer(_ParserFrontend):
lexer = None
@@ -106,13 +124,14 @@ class WithLexer(_ParserFrontend):
def _serialize(self, data, memo):
data['parser'] = data['parser'].serialize(memo)

def lex(self, *args):
stream = self.lexer.lex(*args)
return self.postlex.process(stream) if self.postlex else stream
def make_lexer(self, text):
lexer = self.lexer
if self.postlex:
lexer = PostLexConnector(self.lexer, self.postlex)
return LexerThread(lexer, text)

def parse(self, text, start=None):
token_stream = self.lex(text)
return self._parse(token_stream, start)
return self._parse(start, self.make_lexer(text))

def init_traditional_lexer(self):
self.lexer = TraditionalLexer(self.lexer_conf)
@@ -138,14 +157,6 @@ class LALR_ContextualLexer(LALR_WithLexer):
always_accept = self.postlex.always_accept if self.postlex else ()
self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)


def parse(self, text, start=None):
parser_state = [None]
def set_parser_state(s):
parser_state[0] = s

token_stream = self.lex(text, lambda: parser_state[0])
return self._parse(token_stream, start, set_parser_state)
###}

class LALR_CustomLexer(LALR_WithLexer):
@@ -156,15 +167,6 @@ class LALR_CustomLexer(LALR_WithLexer):
WithLexer.__init__(self, lexer_conf, parser_conf, options)


def tokenize_text(text):
line = 1
col_start_pos = 0
for i, ch in enumerate(text):
if '\n' in ch:
line += ch.count('\n')
col_start_pos = i + ch.rindex('\n')
yield Token('CHAR', ch, line=line, column=i - col_start_pos)

class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
@@ -175,6 +177,9 @@ class Earley(WithLexer):
tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)

def make_lexer(self, text):
return WithLexer.make_lexer(self, text).lex(None)

def match(self, term, token):
return term.name == token.type

@@ -219,7 +224,7 @@ class XEarley(_ParserFrontend):
self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

def parse(self, text, start):
return self._parse(text, start)
return self._parse(start, text)

class XEarley_CompleteLex(XEarley):
def __init__(self, *args, **kw):
@@ -239,8 +244,8 @@ class CYK(WithLexer):
self.callbacks = parser_conf.callbacks

def parse(self, text, start):
tokens = list(self.lex(text))
parse = self._parse(tokens, start)
tokens = list(self.make_lexer(text).lex(None))
parse = self._parse(start, tokens)
parse = self._transform(parse)
return parse



lark/parsers/lalr_parser.py (+90 -58)

@@ -2,9 +2,9 @@
"""
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
from ..exceptions import UnexpectedToken
from copy import deepcopy
from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
from ..lexer import Token
from ..utils import Enumerator, Serialize

from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
from .lalr_puppet import ParserPuppet
@@ -35,84 +35,116 @@ class LALR_Parser(object):
return self.parser.parse(*args)


class _Parser:
def __init__(self, parse_table, callbacks, debug=False):
self.parse_table = parse_table
self.callbacks = callbacks
self.debug = debug
class ParserState:
__slots__ = 'parse_table', 'lexer', 'callbacks', 'start', 'state_stack', 'value_stack', 'start_state', 'end_state', 'states'

def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None):
token = None
stream = iter(seq)
states = self.parse_table.states
start_state = self.parse_table.start_states[start]
end_state = self.parse_table.end_states[start]
def __init__(self, parse_table, lexer, callbacks, start, state_stack=None, value_stack=None):
self.parse_table = parse_table

state_stack = state_stack or [start_state]
value_stack = value_stack or []
self.start_state = self.parse_table.start_states[start]
self.end_state = self.parse_table.end_states[start]
self.states = self.parse_table.states

if set_state: set_state(start_state)
self.lexer = lexer
self.callbacks = callbacks
self.start = start
self.state_stack = state_stack or [self.start_state]
self.value_stack = value_stack or []

@property
def position(self):
return self.state_stack[-1]

def __copy__(self):
return type(self)(
self.parse_table,
self.lexer, # XXX copy
self.callbacks,
self.start,
list(self.state_stack),
deepcopy(self.value_stack),
)

def feed_token(self, token, is_end=False):
state_stack = self.state_stack
value_stack = self.value_stack
states = self.states

def get_action(token):
while True:
state = state_stack[-1]
try:
return states[state][token.type]
action, arg = states[state][token.type]
except KeyError:
expected = {s for s in states[state].keys() if s.isupper()}
try:
puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
except NameError: # For standalone parser
puppet = None
raise UnexpectedToken(token, expected, state=state, puppet=puppet)

def reduce(rule):
size = len(rule.expansion)
if size:
s = value_stack[-size:]
del state_stack[-size:]
del value_stack[-size:]
raise UnexpectedToken(token, expected, state=state, puppet=None)

assert arg != self.end_state

if action is Shift:
# shift once and return
assert not is_end
state_stack.append(arg)
value_stack.append(token)
return arg
else:
s = []
# reduce+shift as many times as necessary
rule = arg
size = len(rule.expansion)
if size:
s = value_stack[-size:]
del state_stack[-size:]
del value_stack[-size:]
else:
s = []

value = self.callbacks[rule](s)

_action, new_state = states[state_stack[-1]][rule.origin.name]
assert _action is Shift
state_stack.append(new_state)
value_stack.append(value)

if is_end and state_stack[-1] == self.end_state:
return value_stack[-1]

value = self.callbacks[rule](s)
class _Parser:
def __init__(self, parse_table, callbacks, debug=False):
self.parse_table = parse_table
self.callbacks = callbacks
self.debug = debug

_action, new_state = states[state_stack[-1]][rule.origin.name]
assert _action is Shift
state_stack.append(new_state)
value_stack.append(value)
def parse(self, lexer, start, value_stack=None, state_stack=None):
parser_state = ParserState(self.parse_table, lexer, self.callbacks, start, state_stack, value_stack)
return self.parse_from_state(parser_state)

def parse_from_state(self, state):
# Main LALR-parser loop
try:
for token in stream:
while True:
action, arg = get_action(token)
assert arg != end_state

if action is Shift:
state_stack.append(arg)
value_stack.append(token)
if set_state: set_state(arg)
break # next token
else:
reduce(arg)
token = None
for token in state.lexer.lex(state):
state.feed_token(token)

token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
return state.feed_token(token, True)
except UnexpectedInput as e:
try:
e.puppet = ParserPuppet(self, state, state.lexer)
except NameError:
pass
if isinstance(e, UnexpectedCharacters):
s = state.lexer.state
p = s.line_ctr.char_pos
s.line_ctr.feed(s.text[p:p+1])
raise e
except Exception as e:
if self.debug:
print("")
print("STATE STACK DUMP")
print("----------------")
for i, s in enumerate(state_stack):
for i, s in enumerate(state.state_stack):
print('%d)' % i , s)
print("")

raise

token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
while True:
_action, arg = get_action(token)
assert(_action is Reduce)
reduce(arg)
if state_stack[-1] == end_state:
return value_stack[-1]

###}
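Because the hunk above interleaves the old _Parser.parse with the new ParserState.feed_token, a compact self-contained model of what feed_token does may be easier to follow: reduce in a loop until the current token can be shifted, shift it exactly once, and, for the synthetic $END token, keep reducing until the end state is reached. The grammar, parse table, and Tok class below are invented (S -> A B); only the control flow mirrors the new code.

# Toy model of ParserState.feed_token; the table encodes S -> A B.
# 'shift'/'reduce' strings stand in for lark's Shift/Reduce markers.

class Tok:
    def __init__(self, type_, value):
        self.type = type_
        self.value = value

class MiniParserState:
    def __init__(self):
        # state 0 --A--> 1 --B--> 2; reducing S pops back to 0, then shifts to 3
        self.states = {
            0: {'A': ('shift', 1), 'S': ('shift', 3)},
            1: {'B': ('shift', 2)},
            2: {'$END': ('reduce', ('S', 2))},
        }
        self.end_state = 3
        self.state_stack = [0]
        self.value_stack = []
        self.callbacks = {'S': lambda children: tuple(children)}

    @property
    def position(self):                      # same idea as ParserState.position
        return self.state_stack[-1]

    def feed_token(self, token, is_end=False):
        while True:
            action, arg = self.states[self.position][token.type]
            if action == 'shift':
                self.state_stack.append(arg)
                self.value_stack.append(token.value)
                return                       # shift once, wait for the next token
            # reduce as many times as needed before the next shift
            origin, size = arg
            children = self.value_stack[-size:] if size else []
            if size:
                del self.state_stack[-size:]
                del self.value_stack[-size:]
            value = self.callbacks[origin](children)
            _shift, new_state = self.states[self.position][origin]
            self.state_stack.append(new_state)
            self.value_stack.append(value)
            if is_end and self.position == self.end_state:
                return self.value_stack[-1]  # parse result

if __name__ == "__main__":
    p = MiniParserState()
    p.feed_token(Tok('A', 'a'))
    p.feed_token(Tok('B', 'b'))
    print(p.feed_token(Tok('$END', ''), is_end=True))   # ('a', 'b')

With the parsing loop reduced to "pull a token, feed it", the puppet in lalr_puppet.py (the next file) no longer needs its own copy of the shift/reduce logic: feed_token is shared between normal parsing and interactive error recovery.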


lark/parsers/lalr_puppet.py (+17 -72)

@@ -1,10 +1,10 @@
# This module provide a LALR puppet, which is used to debugging and error handling

from copy import deepcopy
from copy import copy

from .lalr_analysis import Shift, Reduce
from .. import Token
from ..exceptions import ParseError
from ..exceptions import UnexpectedToken


class ParserPuppet(object):
@@ -12,96 +12,44 @@ class ParserPuppet(object):

For a simpler, more streamlined interface, see the ``on_error`` argument to ``Lark.parse()``.
"""
def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
def __init__(self, parser, parser_state, lexer_state):
self.parser = parser
self._state_stack = state_stack
self._value_stack = value_stack
self._start = start
self._stream = stream
self._set_state = set_state

self.result = None
self.parser_state = parser_state
self.lexer_state = lexer_state

def feed_token(self, token):
"""Feed the parser with a token, and advance it to the next state, as if it recieved it from the lexer.

Note that ``token`` has to be an instance of ``Token``.
"""
end_state = self.parser.parse_table.end_states[self._start]
state_stack = self._state_stack
value_stack = self._value_stack

state = state_stack[-1]
action, arg = self.parser.parse_table.states[state][token.type]
if arg == end_state:
raise ParseError(arg)

while action is Reduce:
rule = arg
size = len(rule.expansion)
if size:
s = value_stack[-size:]
del state_stack[-size:]
del value_stack[-size:]
else:
s = []

value = self.parser.callbacks[rule](s)

_action, new_state = self.parser.parse_table.states[state_stack[-1]][rule.origin.name]
assert _action is Shift
state_stack.append(new_state)
value_stack.append(value)

if state_stack[-1] == end_state:
self.result = value_stack[-1]
return self.result

state = state_stack[-1]
try:
action, arg = self.parser.parse_table.states[state][token.type]
except KeyError as e:
raise ParseError(e)
assert arg != end_state

assert action is Shift
state_stack.append(arg)
value_stack.append(token)

def copy(self):
return self.parser_state.feed_token(token)

def __copy__(self):
"""Create a new puppet with a separate state.

Calls to feed_token() won't affect the old puppet, and vice-versa.
"""
return type(self)(
self.parser,
list(self._state_stack),
deepcopy(self._value_stack),
self._start,
self._stream,
self._set_state,
copy(self.parser_state),
copy(self.lexer_state),
)

def __eq__(self, other):
if not isinstance(other, ParserPuppet):
return False

return (
self._state_stack == other._state_stack and
self._value_stack == other._value_stack and
self._stream == other._stream and
self._start == other._start
)
return self.parser_state == other.parser_state and self.lexer_state == other.lexer_state

def __hash__(self):
return hash((tuple(self._state_stack), self._start))
return hash((self.parser_state, self.lexer_state))

def pretty(self):
"""Print the output of ``choices()`` in a way that's easier to read."""
out = ["Puppet choices:"]
for k, v in self.choices().items():
out.append('\t- %s -> %s' % (k, v))
out.append('stack size: %s' % len(self._state_stack))
out.append('stack size: %s' % len(self.parser_state.state_stack))
return '\n'.join(out)

def choices(self):
@@ -111,16 +59,16 @@ class ParserPuppet(object):

Updated by ``feed_token()``.
"""
return self.parser.parse_table.states[self._state_stack[-1]]
return self.parser_state.parse_table.states[self.parser_state.position]

def accepts(self):
accepts = set()
for t in self.choices():
if t.isupper(): # is terminal?
new_puppet = self.copy()
new_puppet = copy(self)
try:
new_puppet.feed_token(Token(t, ''))
except ParseError:
except UnexpectedToken:
pass
else:
accepts.add(t)
@@ -128,7 +76,4 @@ class ParserPuppet(object):

def resume_parse(self):
"""Resume parsing from the current puppet state."""
return self.parser.parse(
self._stream, self._start, self._set_state,
self._value_stack, self._state_stack
)
return self.parser.parse_from_state(self.parser_state)

tests/test_parser.py (+36 -0)

@@ -2217,6 +2217,42 @@ def _make_parser_test(LEXER, PARSER):
""", regex=True)
self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')


@unittest.skipIf(PARSER!='lalr', "Puppet error handling only works with LALR for now")
def test_error_with_puppet(self):
def ignore_errors(e):
if isinstance(e, UnexpectedCharacters):
# Skip bad character
return True

# Must be UnexpectedToken
if e.token.type == 'COMMA':
# Skip comma
return True
elif e.token.type == 'SIGNED_NUMBER':
# Try to feed a comma and retry the number
e.puppet.feed_token(Token('COMMA', ','))
e.puppet.feed_token(e.token)
return True

# Unhandled error. Will stop parse and raise exception
return False

g = _Lark(r'''
start: "[" num ("," num)* "]"
?num: SIGNED_NUMBER
%import common.SIGNED_NUMBER
%ignore " "
''')
s = "[0 1, 2,, 3,,, 4, 5 6 ]"
tree = g.parse(s, on_error=ignore_errors)
res = [int(x) for x in tree.children]
assert res == list(range(7))

s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
tree = g.parse(s, on_error=ignore_errors)


_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
_TestParser.__name__ = _NAME
_TestParser.__qualname__ = "tests.test_parser." + _NAME

