Browse Source

Merge branch 'MegaIng-error-handling'

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.10.0
Erez Sh 4 years ago
parent
commit
c9ca287e9e
9 changed files with 144 additions and 42 deletions
  1. +1
    -1
      examples/error_reporting_lalr.py
  2. +20
    -10
      lark-stubs/exceptions.pyi
  3. +0
    -0
      lark-stubs/parsers/__init__.pyi
  4. +22
    -0
      lark-stubs/parsers/lalr_puppet.pyi
  5. +36
    -16
      lark/exceptions.py
  6. +15
    -6
      lark/load_grammar.py
  7. +2
    -2
      lark/parsers/lalr_parser.py
  8. +22
    -7
      lark/parsers/lalr_puppet.py
  9. +26
    -0
      tests/test_parser.py

+ 1
- 1
examples/error_reporting_lalr.py View File

@@ -52,7 +52,7 @@ def parse(json_text):
'[1,2,]', '[1,2,]',
'{"foo":1,}', '{"foo":1,}',
'{"foo":false,"bar":true,}'] '{"foo":false,"bar":true,}']
})
}, use_accepts=True)
if not exc_class: if not exc_class:
raise raise
raise exc_class(u.get_context(json_text), u.line, u.column) raise exc_class(u.get_context(json_text), u.line, u.column)


+ 20
- 10
lark-stubs/exceptions.pyi View File

@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-


from typing import Dict, Iterable, Callable, Union
from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set
from .tree import Tree from .tree import Tree
from .lexer import Token from .lexer import Token
from .parsers.lalr_puppet import ParserPuppet


class LarkError(Exception): class LarkError(Exception):
pass pass
@@ -21,27 +21,37 @@ class LexError(LarkError):
pass pass




T = TypeVar('T')


class UnexpectedInput(LarkError): class UnexpectedInput(LarkError):
line: int
column: int
pos_in_stream: int pos_in_stream: int
state: Any


def get_context(self, text: str, span: int = ...): def get_context(self, text: str, span: int = ...):
... ...


def match_examples( def match_examples(
self,
parse_fn: Callable[[str], Tree],
examples: Dict[str, Iterable[str]]
):
self,
parse_fn: Callable[[str], Tree],
examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]],
token_type_match_fallback: bool = False,
use_accepts: bool = False,
) -> T:
... ...




class UnexpectedToken(ParseError, UnexpectedInput): class UnexpectedToken(ParseError, UnexpectedInput):
pass

expected: Set[str]
considered_rules: Set[str]
puppet: ParserPuppet
accepts: Set[str]


class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedCharacters(LexError, UnexpectedInput):
line: int
column: int
allowed: Set[str]
considered_tokens: Set[Any]




class VisitError(LarkError): class VisitError(LarkError):


+ 0
- 0
lark-stubs/parsers/__init__.pyi View File


+ 22
- 0
lark-stubs/parsers/lalr_puppet.pyi View File

@@ -0,0 +1,22 @@
from typing import Set, Dict, Any

from lark import Token, Tree


class ParserPuppet(object):
    """
    Provides an interface to interactively step through the parser (LALR(1) only for now)

    Accessible via `UnexpectedToken.puppet` (raised by the parser on token error)
    """
    # Advance the parser state as if `token` had just been received from the lexer.
    def feed_token(self, token: Token): ...

    # Return another puppet.
    # NOTE(review): presumably an independent snapshot of the current parser
    # state -- confirm against the implementation.
    def copy(self) -> ParserPuppet: ...

    # Return (not print) a multi-line summary: the available choices followed
    # by the size of the state stack.
    def pretty(self) -> str: ...

    # Return the parse-table entry of the state on top of the state stack,
    # keyed by symbol name.
    def choices(self) -> Dict[str, Any]: ...

    # Return the names from `choices()` that a copy of this puppet can be fed
    # (as a token) without the attempt being rejected.
    def accepts(self) -> Set[str]: ...

    # Hand the stored stream and stacks back to the parser and continue parsing.
    def resume_parse(self) -> Tree: ...

+ 36
- 16
lark/exceptions.py View File

@@ -1,3 +1,5 @@
import logging

from .utils import STRING_TYPE from .utils import STRING_TYPE


###{standalone ###{standalone
@@ -37,34 +39,46 @@ class UnexpectedInput(LarkError):
after = text[pos:end].split(b'\n', 1)[0] after = text[pos:end].split(b'\n', 1)[0]
return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")


def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False):
""" Given a parser instance and a dictionary mapping some label with """ Given a parser instance and a dictionary mapping some label with
some malformed syntax examples, it'll return the label for the some malformed syntax examples, it'll return the label for the
example that bests matches the current error. example that bests matches the current error.

It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility.
""" """
assert self.state is not None, "Not supported for this exception" assert self.state is not None, "Not supported for this exception"


if isinstance(examples, dict):
examples = examples.items()

candidate = (None, False) candidate = (None, False)
for label, example in examples.items():
for i, (label, example) in enumerate(examples):
assert not isinstance(example, STRING_TYPE) assert not isinstance(example, STRING_TYPE)


for malformed in example:
for j, malformed in enumerate(example):
try: try:
parse_fn(malformed) parse_fn(malformed)
except UnexpectedInput as ut: except UnexpectedInput as ut:
if ut.state == self.state: if ut.state == self.state:
if use_accepts and ut.accepts != self.accepts:
logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
(self.state, self.accepts, ut.accepts, i, j))
continue
try: try:
if ut.token == self.token: # Try exact match first if ut.token == self.token: # Try exact match first
logging.debug("Exact Match at example [%s][%s]" % (i, j))
return label return label


if token_type_match_fallback: if token_type_match_fallback:
# Fallback to token types match # Fallback to token types match
if (ut.token.type == self.token.type) and not candidate[-1]: if (ut.token.type == self.token.type) and not candidate[-1]:
logging.debug("Token Type Fallback at example [%s][%s]" % (i, j))
candidate = label, True candidate = label, True


except AttributeError: except AttributeError:
pass pass
if not candidate[0]: if not candidate[0]:
logging.debug("Same State match at example [%s][%s]" % (i, j))
candidate = label, False candidate = label, False


return candidate[0] return candidate[0]
@@ -72,19 +86,20 @@ class UnexpectedInput(LarkError):


class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedCharacters(LexError, UnexpectedInput):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):

if isinstance(seq, bytes):
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column)
else:
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)

self.line = line self.line = line
self.column = column self.column = column
self.allowed = allowed
self.considered_tokens = considered_tokens
self.pos_in_stream = lex_pos self.pos_in_stream = lex_pos
self.state = state self.state = state


self.allowed = allowed
self.considered_tokens = considered_tokens

if isinstance(seq, bytes):
_s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
else:
_s = seq[lex_pos]

message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column)
message += '\n\n' + self.get_context(seq) message += '\n\n' + self.get_context(seq)
if allowed: if allowed:
message += '\nExpecting: %s\n' % allowed message += '\nExpecting: %s\n' % allowed
@@ -97,18 +112,23 @@ class UnexpectedCharacters(LexError, UnexpectedInput):


class UnexpectedToken(ParseError, UnexpectedInput): class UnexpectedToken(ParseError, UnexpectedInput):
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
self.token = token
self.expected = expected # XXX str shouldn't necessary
self.line = getattr(token, 'line', '?') self.line = getattr(token, 'line', '?')
self.column = getattr(token, 'column', '?') self.column = getattr(token, 'column', '?')
self.considered_rules = considered_rules
self.state = state
self.pos_in_stream = getattr(token, 'pos_in_stream', None) self.pos_in_stream = getattr(token, 'pos_in_stream', None)
self.state = state

self.token = token
self.expected = expected # XXX deprecate? `accepts` is better
self.considered_rules = considered_rules
self.puppet = puppet self.puppet = puppet


# TODO Only calculate `accepts()` when we need to display it to the user
# This will improve performance when doing automatic error handling
self.accepts = puppet and puppet.accepts()

message = ("Unexpected token %r at line %s, column %s.\n" message = ("Unexpected token %r at line %s, column %s.\n"
"Expected one of: \n\t* %s\n" "Expected one of: \n\t* %s\n"
% (token, self.line, self.column, '\n\t* '.join(self.expected)))
% (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))


super(UnexpectedToken, self).__init__(message) super(UnexpectedToken, self).__init__(message)




+ 15
- 6
lark/load_grammar.py View File

@@ -85,7 +85,7 @@ TERMINALS = {
'RULE': '!?[_?]?[a-z][_a-z0-9]*', 'RULE': '!?[_?]?[a-z][_a-z0-9]*',
'TERMINAL': '_?[A-Z][_A-Z0-9]*', 'TERMINAL': '_?[A-Z][_A-Z0-9]*',
'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS,
'_NL': r'(\r?\n)+\s*', '_NL': r'(\r?\n)+\s*',
'WS': r'[ \t]+', 'WS': r'[ \t]+',
'COMMENT': r'\s*//[^\n]*', 'COMMENT': r'\s*//[^\n]*',
@@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
term_name = None term_name = None


elif isinstance(p, PatternRE): elif isinstance(p, PatternRE):
if p in self.term_reverse: # Kind of a wierd placement.name
if p in self.term_reverse: # Kind of a weird placement.name
term_name = self.term_reverse[p].name term_name = self.term_reverse[p].name
else: else:
assert False, p assert False, p
@@ -409,6 +409,13 @@ def _literal_to_pattern(literal):
flags = v[flag_start:] flags = v[flag_start:]
assert all(f in _RE_FLAGS for f in flags), flags assert all(f in _RE_FLAGS for f in flags), flags


if literal.type == 'STRING' and '\n' in v:
raise GrammarError('You cannot put newlines in string literals')

if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags:
raise GrammarError('You can only use newlines in regular expressions '
'with the `x` (verbose) flag')

v = v[:flag_start] v = v[:flag_start]
assert v[0] == v[-1] and v[0] in '"/' assert v[0] == v[-1] and v[0] in '"/'
x = v[1:-1] x = v[1:-1]
@@ -417,9 +424,11 @@ def _literal_to_pattern(literal):


if literal.type == 'STRING': if literal.type == 'STRING':
s = s.replace('\\\\', '\\') s = s.replace('\\\\', '\\')

return { 'STRING': PatternStr,
'REGEXP': PatternRE }[literal.type](s, flags)
return PatternStr(s, flags)
elif literal.type == 'REGEXP':
return PatternRE(s, flags)
else:
assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'




@inline_args @inline_args
@@ -841,7 +850,7 @@ class GrammarLoader:
if len(stmt.children) > 1: if len(stmt.children) > 1:
path_node, arg1 = stmt.children path_node, arg1 = stmt.children
else: else:
path_node, = stmt.children
path_node ,= stmt.children
arg1 = None arg1 = None


if isinstance(arg1, Tree): # Multi import if isinstance(arg1, Tree): # Multi import


+ 2
- 2
lark/parsers/lalr_parser.py View File

@@ -59,10 +59,10 @@ class _Parser:
try: try:
return states[state][token.type] return states[state][token.type]
except KeyError: except KeyError:
expected = [s for s in states[state].keys() if s.isupper()]
expected = {s for s in states[state].keys() if s.isupper()}
try: try:
puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
except NameError:
except NameError: # For standalone parser
puppet = None puppet = None
raise UnexpectedToken(token, expected, state=state, puppet=puppet) raise UnexpectedToken(token, expected, state=state, puppet=puppet)




+ 22
- 7
lark/parsers/lalr_puppet.py View File

@@ -3,8 +3,10 @@
from copy import deepcopy from copy import deepcopy


from .lalr_analysis import Shift, Reduce from .lalr_analysis import Shift, Reduce
from .. import Token


class ParserPuppet:

class ParserPuppet(object):
def __init__(self, parser, state_stack, value_stack, start, stream, set_state): def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
self.parser = parser self.parser = parser
self._state_stack = state_stack self._state_stack = state_stack
@@ -16,7 +18,7 @@ class ParserPuppet:
self.result = None self.result = None


def feed_token(self, token): def feed_token(self, token):
"""Advance the parser state, as if it just recieved `token` from the lexer
"""Advance the parser state, as if it just received `token` from the lexer


""" """
end_state = self.parser.parse_table.end_states[self._start] end_state = self.parser.parse_table.end_states[self._start]
@@ -66,14 +68,27 @@ class ParserPuppet:
self._set_state, self._set_state,
) )


def pretty():
print("Puppet choices:")
for k, v in self.choices.items():
print('\t-', k, '->', v)
print('stack size:', len(self._state_stack))
def pretty(self):
    """Return a printable, multi-line summary of this puppet.

    The first line is a fixed header, followed by one line per entry of
    ``self.choices()`` and a final line giving the current size of the
    state stack.  Nothing is printed; the text is returned to the caller.
    """
    header = "Puppet choices:"
    choice_lines = ['\t- %s -> %s' % (symbol, action)
                    for symbol, action in self.choices().items()]
    footer = 'stack size: %s' % len(self._state_stack)
    return '\n'.join([header] + choice_lines + [footer])


def choices(self): def choices(self):
return self.parser.parse_table.states[self._state_stack[-1]] return self.parser.parse_table.states[self._state_stack[-1]]


def accepts(self):
    """Return the set of terminal names the parser can accept right now.

    Each terminal listed by ``self.choices()`` is wrapped in an
    empty-valued ``Token`` and fed to a throwaway copy of this puppet.
    A terminal counts as accepted when ``feed_token`` does not raise
    ``KeyError``.  The puppet itself is never advanced.
    """
    accepted = set()
    for name in self.choices():
        # The keys of a parse-table state may include rule names as well as
        # terminals.  Only terminals (upper-case names -- the same test the
        # LALR parser uses when it collects `expected`) can come from the
        # lexer, so anything else is skipped instead of being probed.
        if not name.isupper():
            continue

        trial = self.copy()  # probe on a copy so that `self` stays untouched
        try:
            trial.feed_token(Token(name, ''))
        except KeyError:
            # NOTE(review): presumably raised when the parse table has no
            # action for this token further down the line -- confirm
            # against feed_token.
            continue
        accepted.add(name)
    return accepted

def resume_parse(self): def resume_parse(self):
return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack) return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack)

+ 26
- 0
tests/test_parser.py View File

@@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER):
tree = l.parse('aA') tree = l.parse('aA')
self.assertEqual(tree.children, ['a', 'A']) self.assertEqual(tree.children, ['a', 'A'])


def test_token_flags_verbose(self):
    # A regexp terminal with the `x` (verbose) flag: the test expects the
    # blanks inside `/ [a-z] /x` to be ignored, so a bare 'a' matches ABC.
    g = _Lark(r"""start: NL | ABC
ABC: / [a-z] /x
NL: /\n/
""")
    x = g.parse('a')
    self.assertEqual(x.children, ['a'])

def test_token_flags_verbose_multiline(self):
    # A regexp literal may span several lines when it carries the `x`
    # (verbose) flag; the test expects the blanks and newlines inside it to
    # be ignored, so the terminal matches the plain text 'abcdef'.
    g = _Lark(r"""start: ABC
ABC: / a b c
d
e f
/x
""")
    x = g.parse('abcdef')
    self.assertEqual(x.children, ['abcdef'])

def test_token_multiline_only_works_with_x_flag(self):
    # Same multi-line regexp as in the verbose test, but flagged `i` instead
    # of `x`: building the parser must fail with GrammarError, because
    # newlines in a regexp literal are only allowed together with `x`.
    g = r"""start: ABC
ABC: / a b c
d
e f
/i
"""
    self.assertRaises( GrammarError, _Lark, g)


@unittest.skipIf(PARSER == 'cyk', "No empty rules") @unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_twice_empty(self): def test_twice_empty(self):


Loading…
Cancel
Save