Browse Source

Merge branch 'MegaIng-error-handling'

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.10.0
Erez Sh 4 years ago
parent
commit
c9ca287e9e
9 changed files with 144 additions and 42 deletions
  1. +1
    -1
      examples/error_reporting_lalr.py
  2. +20
    -10
      lark-stubs/exceptions.pyi
  3. +0
    -0
      lark-stubs/parsers/__init__.pyi
  4. +22
    -0
      lark-stubs/parsers/lalr_puppet.pyi
  5. +36
    -16
      lark/exceptions.py
  6. +15
    -6
      lark/load_grammar.py
  7. +2
    -2
      lark/parsers/lalr_parser.py
  8. +22
    -7
      lark/parsers/lalr_puppet.py
  9. +26
    -0
      tests/test_parser.py

+ 1
- 1
examples/error_reporting_lalr.py View File

@@ -52,7 +52,7 @@ def parse(json_text):
'[1,2,]',
'{"foo":1,}',
'{"foo":false,"bar":true,}']
})
}, use_accepts=True)
if not exc_class:
raise
raise exc_class(u.get_context(json_text), u.line, u.column)


+ 20
- 10
lark-stubs/exceptions.pyi View File

@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-

from typing import Dict, Iterable, Callable, Union
from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set
from .tree import Tree
from .lexer import Token
from .parsers.lalr_puppet import ParserPuppet

class LarkError(Exception):
pass
@@ -21,27 +21,37 @@ class LexError(LarkError):
pass


T = TypeVar('T')


class UnexpectedInput(LarkError):
line: int
column: int
pos_in_stream: int
state: Any

def get_context(self, text: str, span: int = ...):
...

def match_examples(
self,
parse_fn: Callable[[str], Tree],
examples: Dict[str, Iterable[str]]
):
self,
parse_fn: Callable[[str], Tree],
examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]],
token_type_match_fallback: bool = False,
use_accepts: bool = False,
) -> T:
...


class UnexpectedToken(ParseError, UnexpectedInput):
pass

expected: Set[str]
considered_rules: Set[str]
puppet: ParserPuppet
accepts: Set[str]

class UnexpectedCharacters(LexError, UnexpectedInput):
line: int
column: int
allowed: Set[str]
considered_tokens: Set[Any]


class VisitError(LarkError):


+ 0
- 0
lark-stubs/parsers/__init__.pyi View File


+ 22
- 0
lark-stubs/parsers/lalr_puppet.pyi View File

@@ -0,0 +1,22 @@
from typing import Set, Dict, Any

from lark import Token, Tree


class ParserPuppet(object):
    """
    Provides an interface to interactively step through the parser (LALR(1) only for now)

    Accessible via `UnexpectedToken.puppet` (raised by the parser on token error)
    """
    # Advance the parser state as if `token` had just been received from the lexer.
    def feed_token(self, token: Token): ...

    # Return a copy of the puppet, so tokens can be fed speculatively.
    def copy(self) -> ParserPuppet: ...

    # Human-readable summary of the current parser choices and stack size.
    def pretty(self) -> str: ...

    # Parse-table actions available from the current state, keyed by token type.
    def choices(self) -> Dict[str, Any]: ...

    # Set of token-type names the parser would accept next.
    def accepts(self) -> Set[str]: ...

    # Resume normal parsing from the current state; returns the parse result.
    def resume_parse(self) -> Tree: ...

+ 36
- 16
lark/exceptions.py View File

@@ -1,3 +1,5 @@
import logging

from .utils import STRING_TYPE

###{standalone
@@ -37,34 +39,46 @@ class UnexpectedInput(LarkError):
after = text[pos:end].split(b'\n', 1)[0]
return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")

def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False):
""" Given a parser instance and a dictionary mapping some label with
some malformed syntax examples, it'll return the label for the
example that bests matches the current error.

It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility.
"""
assert self.state is not None, "Not supported for this exception"

if isinstance(examples, dict):
examples = examples.items()

candidate = (None, False)
for label, example in examples.items():
for i, (label, example) in enumerate(examples):
assert not isinstance(example, STRING_TYPE)

for malformed in example:
for j, malformed in enumerate(example):
try:
parse_fn(malformed)
except UnexpectedInput as ut:
if ut.state == self.state:
if use_accepts and ut.accepts != self.accepts:
logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
(self.state, self.accepts, ut.accepts, i, j))
continue
try:
if ut.token == self.token: # Try exact match first
logging.debug("Exact Match at example [%s][%s]" % (i, j))
return label

if token_type_match_fallback:
# Fallback to token types match
if (ut.token.type == self.token.type) and not candidate[-1]:
logging.debug("Token Type Fallback at example [%s][%s]" % (i, j))
candidate = label, True

except AttributeError:
pass
if not candidate[0]:
logging.debug("Same State match at example [%s][%s]" % (i, j))
candidate = label, False

return candidate[0]
@@ -72,19 +86,20 @@ class UnexpectedInput(LarkError):

class UnexpectedCharacters(LexError, UnexpectedInput):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):

if isinstance(seq, bytes):
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column)
else:
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)

self.line = line
self.column = column
self.allowed = allowed
self.considered_tokens = considered_tokens
self.pos_in_stream = lex_pos
self.state = state

self.allowed = allowed
self.considered_tokens = considered_tokens

if isinstance(seq, bytes):
_s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
else:
_s = seq[lex_pos]

message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column)
message += '\n\n' + self.get_context(seq)
if allowed:
message += '\nExpecting: %s\n' % allowed
@@ -97,18 +112,23 @@ class UnexpectedCharacters(LexError, UnexpectedInput):

class UnexpectedToken(ParseError, UnexpectedInput):
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
self.token = token
self.expected = expected # XXX str shouldn't necessary
self.line = getattr(token, 'line', '?')
self.column = getattr(token, 'column', '?')
self.considered_rules = considered_rules
self.state = state
self.pos_in_stream = getattr(token, 'pos_in_stream', None)
self.state = state

self.token = token
self.expected = expected # XXX deprecate? `accepts` is better
self.considered_rules = considered_rules
self.puppet = puppet

# TODO Only calculate `accepts()` when we need to display it to the user
# This will improve performance when doing automatic error handling
self.accepts = puppet and puppet.accepts()

message = ("Unexpected token %r at line %s, column %s.\n"
"Expected one of: \n\t* %s\n"
% (token, self.line, self.column, '\n\t* '.join(self.expected)))
% (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))

super(UnexpectedToken, self).__init__(message)



+ 15
- 6
lark/load_grammar.py View File

@@ -85,7 +85,7 @@ TERMINALS = {
'RULE': '!?[_?]?[a-z][_a-z0-9]*',
'TERMINAL': '_?[A-Z][_A-Z0-9]*',
'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS,
'_NL': r'(\r?\n)+\s*',
'WS': r'[ \t]+',
'COMMENT': r'\s*//[^\n]*',
@@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
term_name = None

elif isinstance(p, PatternRE):
if p in self.term_reverse: # Kind of a wierd placement.name
if p in self.term_reverse: # Kind of a weird placement.name
term_name = self.term_reverse[p].name
else:
assert False, p
@@ -409,6 +409,13 @@ def _literal_to_pattern(literal):
flags = v[flag_start:]
assert all(f in _RE_FLAGS for f in flags), flags

if literal.type == 'STRING' and '\n' in v:
raise GrammarError('You cannot put newlines in string literals')

if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags:
raise GrammarError('You can only use newlines in regular expressions '
'with the `x` (verbose) flag')

v = v[:flag_start]
assert v[0] == v[-1] and v[0] in '"/'
x = v[1:-1]
@@ -417,9 +424,11 @@ def _literal_to_pattern(literal):

if literal.type == 'STRING':
s = s.replace('\\\\', '\\')

return { 'STRING': PatternStr,
'REGEXP': PatternRE }[literal.type](s, flags)
return PatternStr(s, flags)
elif literal.type == 'REGEXP':
return PatternRE(s, flags)
else:
assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'


@inline_args
@@ -841,7 +850,7 @@ class GrammarLoader:
if len(stmt.children) > 1:
path_node, arg1 = stmt.children
else:
path_node, = stmt.children
path_node ,= stmt.children
arg1 = None

if isinstance(arg1, Tree): # Multi import


+ 2
- 2
lark/parsers/lalr_parser.py View File

@@ -59,10 +59,10 @@ class _Parser:
try:
return states[state][token.type]
except KeyError:
expected = [s for s in states[state].keys() if s.isupper()]
expected = {s for s in states[state].keys() if s.isupper()}
try:
puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
except NameError:
except NameError: # For standalone parser
puppet = None
raise UnexpectedToken(token, expected, state=state, puppet=puppet)



+ 22
- 7
lark/parsers/lalr_puppet.py View File

@@ -3,8 +3,10 @@
from copy import deepcopy

from .lalr_analysis import Shift, Reduce
from .. import Token

class ParserPuppet:

class ParserPuppet(object):
def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
self.parser = parser
self._state_stack = state_stack
@@ -16,7 +18,7 @@ class ParserPuppet:
self.result = None

def feed_token(self, token):
"""Advance the parser state, as if it just recieved `token` from the lexer
"""Advance the parser state, as if it just received `token` from the lexer

"""
end_state = self.parser.parse_table.end_states[self._start]
@@ -66,14 +68,27 @@ class ParserPuppet:
self._set_state,
)

def pretty():
print("Puppet choices:")
for k, v in self.choices.items():
print('\t-', k, '->', v)
print('stack size:', len(self._state_stack))
def pretty(self):
    """Return a human-readable summary of the current choices and stack size."""
    choice_lines = ['\t- %s -> %s' % (name, action)
                    for name, action in self.choices().items()]
    header = ["Puppet choices:"]
    footer = ['stack size: %s' % len(self._state_stack)]
    return '\n'.join(header + choice_lines + footer)

def choices(self):
    """Return the parse-table actions available from the current (top-of-stack) state."""
    current_state = self._state_stack[-1]
    return self.parser.parse_table.states[current_state]

def accepts(self):
    """Return the set of token types the parser would accept next.

    Each candidate is fed speculatively to a copy of the puppet, so the
    real parser state is left untouched.
    """
    accepted = set()
    for token_type in self.choices():
        trial = self.copy()
        try:
            trial.feed_token(Token(token_type, ''))
        except KeyError:
            # No valid action for this token from the current state chain.
            continue
        accepted.add(token_type)
    return accepted

def resume_parse(self):
    """Resume normal (non-interactive) parsing from the puppet's current state."""
    parse = self.parser.parse
    return parse(self._stream, self._start, self._set_state,
                 self._value_stack, self._state_stack)

+ 26
- 0
tests/test_parser.py View File

@@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER):
tree = l.parse('aA')
self.assertEqual(tree.children, ['a', 'A'])

def test_token_flags_verbose(self):
    """A terminal regexp with the `x` (verbose) flag ignores literal whitespace."""
    parser = _Lark(r"""start: NL | ABC
                       ABC: / [a-z] /x
                       NL: /\n/
                       """)
    tree = parser.parse('a')
    self.assertEqual(tree.children, ['a'])

def test_token_flags_verbose_multiline(self):
    """With the `x` flag, a terminal regexp may span several lines."""
    parser = _Lark(r"""start: ABC
                       ABC: / a b c
                                  d
                              e f
                          /x
                       """)
    tree = parser.parse('abcdef')
    self.assertEqual(tree.children, ['abcdef'])

def test_token_multiline_only_works_with_x_flag(self):
    """A multi-line terminal regexp without the `x` flag must raise GrammarError."""
    bad_grammar = r"""start: ABC
                      ABC: / a b c
                                 d
                             e f
                         /i
                      """
    with self.assertRaises(GrammarError):
        _Lark(bad_grammar)

@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_twice_empty(self):


Loading…
Cancel
Save