
Merge pull request #877 from lark-parser/interactive_parser

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.3
Erez Shinan authored 3 years ago · committed by GitHub
Commit 672dfed7ce
No GPG key could be found for this signature. GPG key ID: 4AEE18F83AFDEB23
14 changed files with 254 additions and 169 deletions
  1. docs/classes.rst  +6 -5
  2. docs/features.md  +1 -1
  3. examples/advanced/error_handling.py  +6 -6
  4. lark-stubs/exceptions.pyi  +2 -2
  5. lark-stubs/lark.pyi  +8 -1
  6. lark-stubs/parsers/lalr_puppet.pyi  +0 -22
  7. lark/exceptions.py  +17 -7
  8. lark/lark.py  +4 -1
  9. lark/load_grammar.py  +6 -6
  10. lark/parser_frontends.py  +14 -3
  11. lark/parsers/lalr_interactive_parser.py  +132 -0
  12. lark/parsers/lalr_parser.py  +15 -7
  13. lark/parsers/lalr_puppet.py  +3 -96
  14. tests/test_parser.py  +40 -12

docs/classes.rst  +6 -5

@@ -66,10 +66,11 @@ UnexpectedInput

.. autoclass:: lark.exceptions.UnexpectedCharacters

.. _parserpuppet:
InteractiveParser
-----------------

ParserPuppet
------------
.. autoclass:: lark.parsers.lalr_interactive_parser.InteractiveParser
:members: choices, feed_token, copy, pretty, resume_parse, exhaust_lexer, accepts

.. autoclass:: lark.parsers.lalr_puppet.ParserPuppet
:members: choices, feed_token, copy, pretty, resume_parse
.. autoclass:: lark.parsers.lalr_interactive_parser.ImmutableInteractiveParser
:members: choices, feed_token, copy, pretty, resume_parse, exhaust_lexer, accepts

docs/features.md  +1 -1

@@ -8,7 +8,7 @@
- EBNF-inspired grammar, with extra features (See: [Grammar Reference](grammar.md))
- Builds a parse-tree (AST) automagically based on the grammar
- Stand-alone parser generator - create a small independent parser to embed in your project.
- Flexible error handling by using a "puppet parser" mechanism (LALR only)
- Flexible error handling by using an interactive parser interface (LALR only)
- Automatic line & column tracking (for both tokens and matched rules)
- Automatic terminal collision resolution
- Standard library of terminals (strings, numbers, names, etc.)


examples/advanced/error_puppet.py → examples/advanced/error_handling.py  +6 -6 (renamed)

@@ -1,11 +1,11 @@
"""
Error handling with a puppet
==================================
Error handling using an interactive parser
==========================================

This example demonstrates error handling using a parsing puppet in LALR
This example demonstrates error handling using an interactive parser in LALR

When the parser encounters an UnexpectedToken exception, it creates a
parsing puppet with the current parse-state, and lets you control how
an interactive parser with the current parse-state, and lets you control how
to proceed step-by-step. When you've achieved the correct parse-state,
you can resume the run by returning True.
"""
@@ -20,8 +20,8 @@ def ignore_errors(e):
return True
elif e.token.type == 'SIGNED_NUMBER':
# Try to feed a comma and retry the number
e.puppet.feed_token(Token('COMMA', ','))
e.puppet.feed_token(e.token)
e.interactive_parser.feed_token(Token('COMMA', ','))
e.interactive_parser.feed_token(e.token)
return True

# Unhandled error. Will stop parse and raise exception
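
To make the recovery flow concrete, here is a minimal, self-contained sketch in the same spirit as the example above; the grammar, the input and the recovery rule are illustrative, not the shipped example:

from lark import Lark, Token
from lark.exceptions import UnexpectedToken

parser = Lark(r"""
    start: NUMBER ("," NUMBER)*
    %import common.SIGNED_NUMBER -> NUMBER
    %ignore " "
""", parser='lalr')

def on_error(e):
    if isinstance(e, UnexpectedToken) and e.token.type == 'NUMBER':
        # The grammar expected a comma first: feed one, then retry the number.
        e.interactive_parser.feed_token(Token('COMMA', ','))
        e.interactive_parser.feed_token(e.token)
        return True      # True = resume parsing from the repaired state
    return False         # False = re-raise the original error

print(parser.parse("1, 2 3", on_error=on_error))   # recovers as if the input were "1, 2, 3"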

lark-stubs/exceptions.pyi  +2 -2

@@ -3,7 +3,7 @@
from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set
from .tree import Tree
from .lexer import Token
from .parsers.lalr_puppet import ParserPuppet
from .parsers.lalr_interactive_parser import InteractiveParser

class LarkError(Exception):
pass
@@ -52,7 +52,7 @@ class UnexpectedInput(LarkError):
class UnexpectedToken(ParseError, UnexpectedInput):
expected: Set[str]
considered_rules: Set[str]
puppet: ParserPuppet
interactive_parser: InteractiveParser
accepts: Set[str]

class UnexpectedCharacters(LexError, UnexpectedInput):


lark-stubs/lark.pyi  +8 -1

@@ -2,8 +2,10 @@

from typing import (
TypeVar, Type, List, Dict, IO, Iterator, Callable, Union, Optional,
Literal, Protocol, Tuple, Iterable,
Literal, Protocol, Tuple, Iterable,
)

from .parsers.lalr_interactive_parser import InteractiveParser
from .visitors import Transformer
from .lexer import Token, Lexer, TerminalDef
from .tree import Tree
@@ -12,6 +14,7 @@ from .load_grammar import Grammar

_T = TypeVar('_T')


class PostLex(Protocol):

def process(self, stream: Iterator[Token]) -> Iterator[Token]:
@@ -46,6 +49,7 @@ class PackageResource(object):

def __init__(self, pkg_name: str, path: str): ...


class FromPackageLoader:
def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ...

@@ -87,6 +91,9 @@ class Lark:
def parse(self, text: str, start: Optional[str] = None, on_error: Callable[[UnexpectedInput], bool] = None) -> Tree:
...

def parse_interactive(self, text: str = None, start: Optional[str] = None) -> InteractiveParser:
...

@classmethod
def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T:
...


lark-stubs/parsers/lalr_puppet.pyi  +0 -22 (deleted)

@@ -1,22 +0,0 @@
from typing import Set, Dict, Any

from lark import Token, Tree


class ParserPuppet(object):
"""
Provides an interface to interactively step through the parser (LALR(1) only for now)

Accessible via `UnexpectedToken.puppet` (raised by the parser on token error)
"""
def feed_token(self, token: Token) -> Any: ...

def copy(self) -> ParserPuppet: ...

def pretty(self) -> str: ...

def choices(self) -> Dict[str, Any]: ...

def accepts(self) -> Set[str]: ...

def resume_parse(self) -> Tree: ...

lark/exceptions.py  +17 -7

@@ -1,3 +1,5 @@
from warnings import warn

from .utils import STRING_TYPE, logger, NO_VALUE


@@ -177,14 +179,16 @@ class UnexpectedCharacters(LexError, UnexpectedInput):


class UnexpectedToken(ParseError, UnexpectedInput):
"""When the parser throws UnexpectedToken, it instantiates a puppet
with its internal state. Users can then interactively set the puppet to
the desired puppet state, and resume regular parsing.
"""An exception that is raised by the parser, when the token it received
doesn't match any valid step forward.

The parser provides an interactive instance through `interactive_parser`,
which is initialized to the point of failure, and can be used for debugging and error handling.

see: :ref:`ParserPuppet`.
see: ``InteractiveParser``.
"""

def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, terminals_by_name=None, token_history=None):
def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None):
# TODO considered_rules and expected can be figured out using state
self.line = getattr(token, 'line', '?')
self.column = getattr(token, 'column', '?')
@@ -195,7 +199,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):
self.expected = expected # XXX deprecate? `accepts` is better
self._accepts = NO_VALUE
self.considered_rules = considered_rules
self.puppet = puppet
self.interactive_parser = interactive_parser
self._terminals_by_name = terminals_by_name
self.token_history = token_history

@@ -204,7 +208,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):
@property
def accepts(self):
if self._accepts is NO_VALUE:
self._accepts = self.puppet and self.puppet.accepts()
self._accepts = self.interactive_parser and self.interactive_parser.accepts()
return self._accepts

def __str__(self):
@@ -215,6 +219,12 @@ class UnexpectedToken(ParseError, UnexpectedInput):

return message

@property
def puppet(self):
warn("UnexpectedToken.puppet attribute has been renamed to interactive_parser", DeprecationWarning)
return self.interactive_parser


class VisitError(LarkError):
"""VisitError is raised when visitors are interrupted by an exception

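The ``puppet`` property above is a backward-compatibility shim. A quick sketch of what it means for existing error handlers (grammar and input are illustrative):

import warnings
from lark import Lark
from lark.exceptions import UnexpectedToken

parser = Lark(r'''
    start: "a" "b"
''', parser='lalr')

try:
    parser.parse("a")                                # input ends too early -> UnexpectedToken at $END
except UnexpectedToken as e:
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        assert e.puppet is e.interactive_parser      # old name still resolves to the same object
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
    print(e.accepts)                                 # token types that could continue the parse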

lark/lark.py  +4 -1

@@ -531,6 +531,9 @@ class Lark(Serialize):
def get_terminal(self, name):
"Get information about a terminal"
return self._terminals_dict[name]
def parse_interactive(self, text=None, start=None):
return self.parser.parse_interactive(text, start=start)

def parse(self, text, start=None, on_error=None):
"""Parse the given text, according to the options provided.
@@ -539,7 +542,7 @@ class Lark(Serialize):
text (str): Text to be parsed.
start (str, optional): Required if Lark was given multiple possible start symbols (using the start option).
on_error (function, optional): if provided, will be called on UnexpectedToken error. Return true to resume parsing.
LALR only. See examples/advanced/error_puppet.py for an example of how to use on_error.
LALR only. See examples/advanced/error_handling.py for an example of how to use on_error.

Returns:
If a transformer is supplied to ``__init__``, returns whatever is the

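A minimal sketch of the new entry point, mirroring the new test added below (same toy grammar as the test): stepping through the whole input interactively is equivalent to a plain ``parse()``.

from lark import Lark

parser = Lark(r'''
    start: A+ B*
    A: "a"
    B: "b"
''', parser='lalr')

ip = parser.parse_interactive("aab")
ip.exhaust_lexer()                 # feed every token the lexer produces, but not '$END'
tree = ip.feed_eof()               # finish the parse and get the tree
assert tree == parser.parse("aab")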

lark/load_grammar.py  +6 -6

@@ -866,7 +866,7 @@ def _error_repr(error):
else:
return str(error)

def _search_puppet(puppet, predicate):
def _search_interactive_parser(interactive_parser, predicate):
def expand(node):
path, p = node
for choice in p.choices():
@@ -878,7 +878,7 @@ def _search_puppet(puppet, predicate):
else:
yield path + (choice,), new_p

for path, p in bfs_all_unique([((), puppet)], expand):
for path, p in bfs_all_unique([((), interactive_parser)], expand):
if predicate(p):
return path, p

@@ -888,10 +888,10 @@ def find_grammar_errors(text, start='start'):
errors.append((e, _error_repr(e)))

# recover to a new line
token_path, _ = _search_puppet(e.puppet.as_immutable(), lambda p: '_NL' in p.choices())
token_path, _ = _search_interactive_parser(e.interactive_parser.as_immutable(), lambda p: '_NL' in p.choices())
for token_type in token_path:
e.puppet.feed_token(Token(token_type, ''))
e.puppet.feed_token(Token('_NL', '\n'))
e.interactive_parser.feed_token(Token(token_type, ''))
e.interactive_parser.feed_token(Token('_NL', '\n'))
return True

_tree = _get_parser().parse(text + '\n', start, on_error=on_error)
@@ -900,7 +900,7 @@ def find_grammar_errors(text, start='start'):
errors = [el[0] for el in errors_by_line.values()] # already sorted

for e in errors:
e[0].puppet = None
e[0].interactive_parser = None
return errors
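
``bfs_all_unique`` is an internal lark helper; for readers following along, the recovery search above roughly amounts to the stand-alone sketch below. ``shortest_path_to`` is a hypothetical name used only for illustration, and the depth bound is an extra safeguard the real code does not use:

from collections import deque
from lark import Token
from lark.exceptions import UnexpectedToken

def shortest_path_to(interactive_parser, terminal, max_depth=10):
    """Breadth-first search for the shortest sequence of token types that leads
    the (immutable) parser into a state where `terminal` is a valid choice."""
    queue = deque([((), interactive_parser.as_immutable())])
    while queue:
        path, p = queue.popleft()
        if terminal in p.choices():
            return path, p
        if len(path) >= max_depth:
            continue
        for choice in p.choices():
            if choice == '$END' or not choice.isupper():
                continue                                   # feed terminals only, skip rule names
            try:
                new_p = p.feed_token(Token(choice, ''))    # immutable: returns a new parser
            except UnexpectedToken:
                continue
            queue.append((path + (choice,), new_p))
    return None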




lark/parser_frontends.py  +14 -3

@@ -89,18 +89,29 @@ class ParsingFrontend(Serialize):

if lexer_conf.postlex:
self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)


def parse(self, text, start=None, on_error=None):
def _verify_start(self, start=None):
if start is None:
start = self.parser_conf.start
if len(start) > 1:
raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
start ,= start
elif start not in self.parser_conf.start:
raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
return start

def parse(self, text, start=None, on_error=None):
start = self._verify_start(start)
stream = text if self.skip_lexer else LexerThread(self.lexer, text)
kw = {} if on_error is None else {'on_error': on_error}
return self.parser.parse(stream, start, **kw)
def parse_interactive(self, text=None, start=None):
start = self._verify_start(start)
if self.parser_conf.parser_type != 'lalr':
raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ")
stream = text if self.skip_lexer else LexerThread(self.lexer, text)
return self.parser.parse_interactive(stream, start)


def get_frontend(parser, lexer):


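A short sketch of what the shared ``_verify_start`` check enforces for both entry points (grammar is illustrative; the error is caught generically here):

from lark import Lark

parser = Lark(r'''
    a: "x"
    b: "y"
''', parser='lalr', start=['a', 'b'])

parser.parse("x", start='a')                 # ok: start rule given explicitly
parser.parse_interactive("y", start='b')     # ok: same check applies here
try:
    parser.parse("x")                        # no start rule given -> rejected
except Exception as err:                     # lark raises a ConfigurationError
    print(err)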
lark/parsers/lalr_interactive_parser.py  +132 -0 (new file)

@@ -0,0 +1,132 @@
# This module provides a LALR interactive parser, which is used for debugging and error handling

from copy import copy

from .. import Token
from ..exceptions import UnexpectedToken


class InteractiveParser(object):
    """InteractiveParser gives you advanced control over parsing and error handling when parsing with LALR.

    For a simpler interface, see the ``on_error`` argument to ``Lark.parse()``.
    """
    def __init__(self, parser, parser_state, lexer_state):
        self.parser = parser
        self.parser_state = parser_state
        self.lexer_state = lexer_state

    def feed_token(self, token):
        """Feed the parser with a token, and advance it to the next state, as if it received it from the lexer.

        Note that ``token`` has to be an instance of ``Token``.
        """
        return self.parser_state.feed_token(token, token.type == '$END')

    def exhaust_lexer(self):
        """Try to feed the rest of the lexer state into the interactive parser.
        Note that this modifies the instance in place and does not feed an '$END' Token"""
        for token in self.lexer_state.lex(self.parser_state):
            self.parser_state.feed_token(token)

    def feed_eof(self, last_token=None):
        """Feed a '$END' Token. Borrows from 'last_token' if given."""
        eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else Token('$END', '', 0, 1, 1)
        return self.feed_token(eof)

    def __copy__(self):
        """Create a new interactive parser with a separate state.

        Calls to feed_token() won't affect the old instance, and vice-versa.
        """
        return type(self)(
            self.parser,
            copy(self.parser_state),
            copy(self.lexer_state),
        )

    def copy(self):
        return copy(self)

    def __eq__(self, other):
        if not isinstance(other, InteractiveParser):
            return False

        return self.parser_state == other.parser_state and self.lexer_state == other.lexer_state

    def as_immutable(self):
        """Convert to an ``ImmutableInteractiveParser``."""
        p = copy(self)
        return ImmutableInteractiveParser(p.parser, p.parser_state, p.lexer_state)

    def pretty(self):
        """Print the output of ``choices()`` in a way that's easier to read."""
        out = ["Parser choices:"]
        for k, v in self.choices().items():
            out.append('\t- %s -> %s' % (k, v))
        out.append('stack size: %s' % len(self.parser_state.state_stack))
        return '\n'.join(out)

    def choices(self):
        """Returns a dictionary of token types, matched to their action in the parser.

        Only returns token types that are accepted by the current state.

        Updated by ``feed_token()``.
        """
        return self.parser_state.parse_conf.parse_table.states[self.parser_state.position]

    def accepts(self):
        """Returns the set of possible tokens that will advance the parser into a new valid state."""
        accepts = set()
        for t in self.choices():
            if t.isupper():  # is terminal?
                new_cursor = copy(self)
                try:
                    new_cursor.feed_token(Token(t, ''))
                except UnexpectedToken:
                    pass
                else:
                    accepts.add(t)
        return accepts

    def resume_parse(self):
        """Resume automated parsing from the current state."""
        return self.parser.parse_from_state(self.parser_state)


class ImmutableInteractiveParser(InteractiveParser):
    """Same as ``InteractiveParser``, but operations create a new instance instead
    of changing it in-place.
    """

    result = None

    def __hash__(self):
        return hash((self.parser_state, self.lexer_state))

    def feed_token(self, token):
        c = copy(self)
        c.result = InteractiveParser.feed_token(c, token)
        return c

    def exhaust_lexer(self):
        """Try to feed the rest of the lexer state into the parser.

        Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token"""
        cursor = self.as_mutable()
        cursor.exhaust_lexer()
        return cursor.as_immutable()

    def as_mutable(self):
        """Convert to an ``InteractiveParser``."""
        p = copy(self)
        return InteractiveParser(p.parser, p.parser_state, p.lexer_state)


# Deprecated class names for the interactive parser
ParserPuppet = InteractiveParser
ImmutableParserPuppet = ImmutableInteractiveParser

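A short usage sketch of the two classes defined above (grammar and tokens are illustrative); it combines the manual feeding, inspection, and copying facilities that the tests below exercise:

from lark import Lark, Token

parser = Lark(r'''
    start: A+ B*
    A: "a"
    B: "b"
''', parser='lalr')

ip = parser.parse_interactive("ab")   # nothing is lexed until we ask for it
ip.feed_token(Token('A', 'a'))        # drive the parser by hand...
print(ip.accepts())                   # ...and ask which terminals could come next, e.g. {'A', 'B', '$END'}
print(ip.pretty())                    # readable dump of choices() and the stack size

branch = ip.copy()                    # independent state: feeding one won't move the other
branch.feed_token(Token('B', 'b'))
print(ip.feed_eof())                  # Tree('start', ['a'])
print(branch.feed_eof())              # Tree('start', ['a', 'b'])

imm = parser.parse_interactive("ab").as_immutable()
imm2 = imm.feed_token(Token('A', 'a'))   # immutable: returns a new parser instead of mutating
assert imm is not imm2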
lark/parsers/lalr_parser.py  +15 -7

@@ -8,7 +8,7 @@ from ..lexer import Token
from ..utils import Serialize

from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
from .lalr_puppet import ParserPuppet
from .lalr_interactive_parser import InteractiveParser
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken

###{standalone
@@ -32,6 +32,9 @@ class LALR_Parser(Serialize):

def serialize(self, memo):
return self._parse_table.serialize(memo)
def parse_interactive(self, lexer, start):
return self.parser.parse(lexer, start, start_interactive=True)

def parse(self, lexer, start, on_error=None):
try:
@@ -42,7 +45,7 @@ class LALR_Parser(Serialize):

while True:
if isinstance(e, UnexpectedCharacters):
s = e.puppet.lexer_state.state
s = e.interactive_parser.lexer_state.state
p = s.line_ctr.char_pos

if not on_error(e):
@@ -54,9 +57,11 @@ class LALR_Parser(Serialize):
s.line_ctr.feed(s.text[p:p+1])

try:
return e.puppet.resume_parse()
return e.interactive_parser.resume_parse()
except UnexpectedToken as e2:
if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
if (isinstance(e, UnexpectedToken)
and e.token.type == e2.token.type == '$END'
and e.interactive_parser == e2.interactive_parser):
# Prevent infinite loop
raise e2
e = e2
@@ -121,7 +126,7 @@ class ParserState(object):
action, arg = states[state][token.type]
except KeyError:
expected = {s for s in states[state].keys() if s.isupper()}
raise UnexpectedToken(token, expected, state=self, puppet=None)
raise UnexpectedToken(token, expected, state=self, interactive_parser=None)

assert arg != end_state

@@ -158,10 +163,13 @@ class _Parser(object):
self.callbacks = callbacks
self.debug = debug

def parse(self, lexer, start, value_stack=None, state_stack=None):
def parse(self, lexer, start, value_stack=None, state_stack=None, start_interactive=False):
parse_conf = ParseConf(self.parse_table, self.callbacks, start)
parser_state = ParserState(parse_conf, lexer, state_stack, value_stack)
if start_interactive:
return InteractiveParser(self, parser_state, parser_state.lexer)
return self.parse_from_state(parser_state)

def parse_from_state(self, state):
# Main LALR-parser loop
@@ -174,7 +182,7 @@ class _Parser(object):
return state.feed_token(token, True)
except UnexpectedInput as e:
try:
e.puppet = ParserPuppet(self, state, state.lexer)
e.interactive_parser = InteractiveParser(self, state, state.lexer)
except NameError:
pass
raise e


lark/parsers/lalr_puppet.py  +3 -96

@@ -1,96 +1,3 @@
# This module provide a LALR puppet, which is used to debugging and error handling

from copy import copy

from .lalr_analysis import Shift, Reduce
from .. import Token
from ..exceptions import UnexpectedToken


class ParserPuppet(object):
"""ParserPuppet gives you advanced control over error handling when parsing with LALR.

For a simpler, more streamlined interface, see the ``on_error`` argument to ``Lark.parse()``.
"""
def __init__(self, parser, parser_state, lexer_state):
self.parser = parser
self.parser_state = parser_state
self.lexer_state = lexer_state

def feed_token(self, token):
"""Feed the parser with a token, and advance it to the next state, as if it received it from the lexer.

Note that ``token`` has to be an instance of ``Token``.
"""
return self.parser_state.feed_token(token, token.type == '$END')

def __copy__(self):
"""Create a new puppet with a separate state.

Calls to feed_token() won't affect the old puppet, and vice-versa.
"""
return type(self)(
self.parser,
copy(self.parser_state),
copy(self.lexer_state),
)

def copy(self):
return copy(self)

def __eq__(self, other):
if not isinstance(other, ParserPuppet):
return False

return self.parser_state == other.parser_state and self.lexer_state == other.lexer_state

def as_immutable(self):
p = copy(self)
return ImmutableParserPuppet(p.parser, p.parser_state, p.lexer_state)

def pretty(self):
"""Print the output of ``choices()`` in a way that's easier to read."""
out = ["Puppet choices:"]
for k, v in self.choices().items():
out.append('\t- %s -> %s' % (k, v))
out.append('stack size: %s' % len(self.parser_state.state_stack))
return '\n'.join(out)

def choices(self):
"""Returns a dictionary of token types, matched to their action in the parser.

Only returns token types that are accepted by the current state.

Updated by ``feed_token()``.
"""
return self.parser_state.parse_conf.parse_table.states[self.parser_state.position]

def accepts(self):
accepts = set()
for t in self.choices():
if t.isupper(): # is terminal?
new_puppet = copy(self)
try:
new_puppet.feed_token(Token(t, ''))
except UnexpectedToken:
pass
else:
accepts.add(t)
return accepts

def resume_parse(self):
"""Resume parsing from the current puppet state."""
return self.parser.parse_from_state(self.parser_state)



class ImmutableParserPuppet(ParserPuppet):
result = None

def __hash__(self):
return hash((self.parser_state, self.lexer_state))

def feed_token(self, token):
c = copy(self)
c.result = ParserPuppet.feed_token(c, token)
return c
# Deprecated
from .lalr_interactive_parser import ParserPuppet, ImmutableParserPuppet

tests/test_parser.py  +40 -12

@@ -2395,9 +2395,45 @@ def _make_parser_test(LEXER, PARSER):
""", regex=True)
self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

@unittest.skipIf(PARSER!='lalr', "interactive_parser is only implemented for LALR at the moment")
def test_parser_interactive_parser(self):

@unittest.skipIf(PARSER!='lalr', "Puppet error handling only works with LALR for now")
def test_error_with_puppet(self):
g = _Lark(r'''
start: A+ B*
A: "a"
B: "b"
''')
ip = g.parse_interactive()

self.assertRaises(UnexpectedToken, ip.feed_eof)
self.assertRaises(TypeError, ip.exhaust_lexer)
ip.feed_token(Token('A', 'a'))
res = ip.feed_eof()
self.assertEqual(res, Tree('start', ['a']))

ip = g.parse_interactive("ab")

ip.exhaust_lexer()

ip_copy = ip.copy()
self.assertEqual(ip_copy.parser_state, ip.parser_state)
self.assertEqual(ip_copy.lexer_state.state, ip.lexer_state.state)
self.assertIsNot(ip_copy.parser_state, ip.parser_state)
self.assertIsNot(ip_copy.lexer_state.state, ip.lexer_state.state)
self.assertIsNot(ip_copy.lexer_state.state.line_ctr, ip.lexer_state.state.line_ctr)

res = ip.feed_eof(ip.lexer_state.state.last_token)
self.assertEqual(res, Tree('start', ['a', 'b']))
self.assertRaises(UnexpectedToken ,ip.feed_eof)
self.assertRaises(UnexpectedToken, ip_copy.feed_token, Token('A', 'a'))
ip_copy.feed_token(Token('B', 'b'))
res = ip_copy.feed_eof()
self.assertEqual(res, Tree('start', ['a', 'b', 'b']))

@unittest.skipIf(PARSER!='lalr', "interactive_parser error handling only works with LALR for now")
def test_error_with_interactive_parser(self):
def ignore_errors(e):
if isinstance(e, UnexpectedCharacters):
# Skip bad character
@@ -2408,17 +2444,9 @@ def _make_parser_test(LEXER, PARSER):
# Skip comma
return True
elif e.token.type == 'SIGNED_NUMBER':
# Make a copy and ensure it is properly made
puppet_copy = e.puppet.copy()
assert puppet_copy.parser_state == e.puppet.parser_state
assert puppet_copy.lexer_state.state == e.puppet.lexer_state.state
assert puppet_copy.parser_state is not e.puppet.parser_state
assert puppet_copy.lexer_state.state is not e.puppet.lexer_state.state
assert puppet_copy.lexer_state.state.line_ctr is not e.puppet.lexer_state.state.line_ctr

# Try to feed a comma and retry the number
e.puppet.feed_token(Token('COMMA', ','))
e.puppet.feed_token(e.token)
e.interactive_parser.feed_token(Token('COMMA', ','))
e.interactive_parser.feed_token(e.token)

return True


