Browse Source

Merge branch 'error-handling' of https://github.com/MegaIng/lark into MegaIng-error-handling

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.10.0
Erez Sh 4 years ago
parent
commit
c19c803340
8 changed files with 126 additions and 33 deletions
  1. +20
    -10
      lark-stubs/exceptions.pyi
  2. +0
    -0
      lark-stubs/parsers/__init__.pyi
  3. +21
    -0
      lark-stubs/parsers/lalr_puppet.pyi
  4. +18
    -7
      lark/exceptions.py
  5. +15
    -6
      lark/load_grammar.py
  6. +4
    -3
      lark/parsers/lalr_parser.py
  7. +22
    -7
      lark/parsers/lalr_puppet.py
  8. +26
    -0
      tests/test_parser.py

+ 20
- 10
lark-stubs/exceptions.pyi View File

@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-

from typing import Dict, Iterable, Callable, Union
from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set
from .tree import Tree
from .lexer import Token
from .parsers.lalr_puppet import ParserPuppet

class LarkError(Exception):
pass
@@ -21,27 +21,37 @@ class LexError(LarkError):
pass


T = TypeVar('T')


class UnexpectedInput(LarkError):
line: int
column: int
pos_in_stream: int
state: Any

def get_context(self, text: str, span: int = ...):
...

def match_examples(
self,
parse_fn: Callable[[str], Tree],
examples: Dict[str, Iterable[str]]
):
self,
parse_fn: Callable[[str], Tree],
examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]],
token_type_match_fallback: bool = False,
use_accepts: bool = False,
) -> T:
...


class UnexpectedToken(ParseError, UnexpectedInput):
pass

expected: Set[str]
considered_rules: Set[str]
puppet: ParserPuppet
accepts: Set[str]

class UnexpectedCharacters(LexError, UnexpectedInput):
line: int
column: int
allowed: Set[str]
considered_tokens: Set[Any]


class VisitError(LarkError):


+ 0
- 0
lark-stubs/parsers/__init__.pyi View File


+ 21
- 0
lark-stubs/parsers/lalr_puppet.pyi View File

@@ -0,0 +1,21 @@
from typing import Set, Dict, Any

from lark import Token, Tree


class ParserPuppet(object):
"""
Represents a LalrParser that can be stepped through.
Shouldn't be instantiated by hand, but is accessible as `UnexpectedToken.puppet`
"""
def feed_token(self, token: Token): ...

def copy(self) -> ParserPuppet: ...

def pretty(self) -> str: ...

def choices(self) -> Dict[str, Any]: ...

def accepts(self) -> Set[str]: ...

def resume_parse(self) -> Tree: ...

+ 18
- 7
lark/exceptions.py View File

@@ -1,3 +1,5 @@
import logging

from .utils import STRING_TYPE

###{standalone
@@ -37,36 +39,44 @@ class UnexpectedInput(LarkError):
after = text[pos:end].split(b'\n', 1)[0]
return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")

def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False):
""" Given a parser instance and a dictionary mapping some label to
some malformed syntax examples, it'll return the label for the
example that best matches the current error.
"""
assert self.state is not None, "Not supported for this exception"
if isinstance(examples, dict):
examples = examples.items()

candidate = (None, False)
for label, example in examples.items():
for i, (label, example) in enumerate(examples):
assert not isinstance(example, STRING_TYPE)

for malformed in example:
for j, malformed in enumerate(example):
try:
parse_fn(malformed)
except UnexpectedInput as ut:
if ut.state == self.state:
if ut.state == self.state and (not use_accepts or ut.accepts == self.accepts):
try:
if ut.token == self.token: # Try exact match first
logging.debug("Exact Match at example [%s][%s]" % (i, j))
return label

if token_type_match_fallback:
# Fallback to token types match
if (ut.token.type == self.token.type) and not candidate[-1]:
logging.debug("Token Type Fallback at example [%s][%s]" % (i, j))
candidate = label, True

except AttributeError:
pass
if not candidate[0]:
logging.debug("Same State match at example [%s][%s]" % (i, j))
candidate = label, False

elif ut.state == self.state:
logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
(self.state, self.accepts, ut.accepts, i, j))
return candidate[0]


@@ -96,7 +106,7 @@ class UnexpectedCharacters(LexError, UnexpectedInput):


class UnexpectedToken(ParseError, UnexpectedInput):
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, accepts=None):
self.token = token
self.expected = expected # XXX str shouldn't necessary
self.line = getattr(token, 'line', '?')
@@ -105,10 +115,11 @@ class UnexpectedToken(ParseError, UnexpectedInput):
self.state = state
self.pos_in_stream = getattr(token, 'pos_in_stream', None)
self.puppet = puppet
self.accepts = accepts

message = ("Unexpected token %r at line %s, column %s.\n"
"Expected one of: \n\t* %s\n"
% (token, self.line, self.column, '\n\t* '.join(self.expected)))
% (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))

super(UnexpectedToken, self).__init__(message)



+ 15
- 6
lark/load_grammar.py View File

@@ -85,7 +85,7 @@ TERMINALS = {
'RULE': '!?[_?]?[a-z][_a-z0-9]*',
'TERMINAL': '_?[A-Z][_A-Z0-9]*',
'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS,
'_NL': r'(\r?\n)+\s*',
'WS': r'[ \t]+',
'COMMENT': r'\s*//[^\n]*',
@@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
term_name = None

elif isinstance(p, PatternRE):
if p in self.term_reverse: # Kind of a wierd placement.name
if p in self.term_reverse: # Kind of a weird placement.name
term_name = self.term_reverse[p].name
else:
assert False, p
@@ -409,6 +409,13 @@ def _literal_to_pattern(literal):
flags = v[flag_start:]
assert all(f in _RE_FLAGS for f in flags), flags

if literal.type == 'STRING' and '\n' in v:
raise GrammarError('You cannot put newlines in string literals')

if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags:
raise GrammarError('You can only use newlines in regular expressions '
'with the `x` (verbose) flag')

v = v[:flag_start]
assert v[0] == v[-1] and v[0] in '"/'
x = v[1:-1]
@@ -417,9 +424,11 @@ def _literal_to_pattern(literal):

if literal.type == 'STRING':
s = s.replace('\\\\', '\\')

return { 'STRING': PatternStr,
'REGEXP': PatternRE }[literal.type](s, flags)
return PatternStr(s, flags)
elif literal.type == 'REGEXP':
return PatternRE(s, flags)
else:
assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'


@inline_args
@@ -841,7 +850,7 @@ class GrammarLoader:
if len(stmt.children) > 1:
path_node, arg1 = stmt.children
else:
path_node, = stmt.children
path_node ,= stmt.children
arg1 = None

if isinstance(arg1, Tree): # Multi import


+ 4
- 3
lark/parsers/lalr_parser.py View File

@@ -59,12 +59,13 @@ class _Parser:
try:
return states[state][token.type]
except KeyError:
expected = [s for s in states[state].keys() if s.isupper()]
expected = {s for s in states[state].keys() if s.isupper()}
try:
puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
accepts = puppet.accepts()
except NameError:
puppet = None
raise UnexpectedToken(token, expected, state=state, puppet=puppet)
puppet = accepts = None
raise UnexpectedToken(token, expected, state=state, puppet=puppet, accepts=accepts)

def reduce(rule):
size = len(rule.expansion)


+ 22
- 7
lark/parsers/lalr_puppet.py View File

@@ -3,8 +3,10 @@
from copy import deepcopy

from .lalr_analysis import Shift, Reduce
from .. import Token

class ParserPuppet:

class ParserPuppet(object):
def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
self.parser = parser
self._state_stack = state_stack
@@ -16,7 +18,7 @@ class ParserPuppet:
self.result = None

def feed_token(self, token):
"""Advance the parser state, as if it just recieved `token` from the lexer
"""Advance the parser state, as if it just received `token` from the lexer

"""
end_state = self.parser.parse_table.end_states[self._start]
@@ -66,14 +68,27 @@ class ParserPuppet:
self._set_state,
)

def pretty():
print("Puppet choices:")
for k, v in self.choices.items():
print('\t-', k, '->', v)
print('stack size:', len(self._state_stack))
def pretty(self):
out = ["Puppet choices:"]
for k, v in self.choices().items():
out.append('\t- %s -> %s' % (k, v))
out.append('stack size: %s' % len(self._state_stack))
return '\n'.join(out)

def choices(self):
return self.parser.parse_table.states[self._state_stack[-1]]

def accepts(self):
accepts = set()
for t in self.choices():
new_puppet = self.copy()
try:
new_puppet.feed_token(Token(t, ''))
except KeyError:
pass
else:
accepts.add(t)
return accepts

def resume_parse(self):
return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack)

+ 26
- 0
tests/test_parser.py View File

@@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER):
tree = l.parse('aA')
self.assertEqual(tree.children, ['a', 'A'])

def test_token_flags_verbose(self):
g = _Lark(r"""start: NL | ABC
ABC: / [a-z] /x
NL: /\n/
""")
x = g.parse('a')
self.assertEqual(x.children, ['a'])

def test_token_flags_verbose_multiline(self):
g = _Lark(r"""start: ABC
ABC: / a b c
d
e f
/x
""")
x = g.parse('abcdef')
self.assertEqual(x.children, ['abcdef'])

def test_token_multiline_only_works_with_x_flag(self):
g = r"""start: ABC
ABC: / a b c
d
e f
/i
"""
self.assertRaises( GrammarError, _Lark, g)

@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_twice_empty(self):


Loading…
Cancel
Save