@@ -52,7 +52,7 @@ def parse(json_text): | |||
'[1,2,]', | |||
'{"foo":1,}', | |||
'{"foo":false,"bar":true,}'] | |||
}) | |||
}, use_accepts=True) | |||
if not exc_class: | |||
raise | |||
raise exc_class(u.get_context(json_text), u.line, u.column) | |||
@@ -1,9 +1,9 @@ | |||
# -*- coding: utf-8 -*- | |||
from typing import Dict, Iterable, Callable, Union | |||
from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set | |||
from .tree import Tree | |||
from .lexer import Token | |||
from .parsers.lalr_puppet import ParserPuppet | |||
class LarkError(Exception): | |||
pass | |||
@@ -21,27 +21,37 @@ class LexError(LarkError): | |||
pass | |||
T = TypeVar('T') | |||
class UnexpectedInput(LarkError): | |||
line: int | |||
column: int | |||
pos_in_stream: int | |||
state: Any | |||
def get_context(self, text: str, span: int = ...): | |||
... | |||
def match_examples( | |||
self, | |||
parse_fn: Callable[[str], Tree], | |||
examples: Dict[str, Iterable[str]] | |||
): | |||
self, | |||
parse_fn: Callable[[str], Tree], | |||
examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], | |||
token_type_match_fallback: bool = False, | |||
use_accepts: bool = False, | |||
) -> T: | |||
... | |||
class UnexpectedToken(ParseError, UnexpectedInput): | |||
pass | |||
expected: Set[str] | |||
considered_rules: Set[str] | |||
puppet: ParserPuppet | |||
accepts: Set[str] | |||
class UnexpectedCharacters(LexError, UnexpectedInput): | |||
line: int | |||
column: int | |||
allowed: Set[str] | |||
considered_tokens: Set[Any] | |||
class VisitError(LarkError): | |||
@@ -0,0 +1,22 @@ | |||
from typing import Set, Dict, Any | |||
from lark import Token, Tree | |||
class ParserPuppet(object): | |||
""" | |||
Provides an interface to interactively step through the parser (LALR(1) only for now) | |||
Accessible via `UnexpectedToken.puppet` (raised by the parser on token error) | |||
""" | |||
def feed_token(self, token: Token): ... | |||
def copy(self) -> ParserPuppet: ... | |||
def pretty(self) -> str: ... | |||
def choices(self) -> Dict[str, Any]: ... | |||
def accepts(self) -> Set[str]: ... | |||
def resume_parse(self) -> Tree: ... |
@@ -1,3 +1,5 @@ | |||
import logging | |||
from .utils import STRING_TYPE | |||
###{standalone | |||
@@ -37,34 +39,46 @@ class UnexpectedInput(LarkError): | |||
after = text[pos:end].split(b'\n', 1)[0] | |||
return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") | |||
def match_examples(self, parse_fn, examples, token_type_match_fallback=False): | |||
def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): | |||
""" Given a parser instance and a dictionary mapping some label with | |||
some malformed syntax examples, it'll return the label for the | |||
example that bests matches the current error. | |||
It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility. | |||
""" | |||
assert self.state is not None, "Not supported for this exception" | |||
if isinstance(examples, dict): | |||
examples = examples.items() | |||
candidate = (None, False) | |||
for label, example in examples.items(): | |||
for i, (label, example) in enumerate(examples): | |||
assert not isinstance(example, STRING_TYPE) | |||
for malformed in example: | |||
for j, malformed in enumerate(example): | |||
try: | |||
parse_fn(malformed) | |||
except UnexpectedInput as ut: | |||
if ut.state == self.state: | |||
if use_accepts and ut.accepts != self.accepts: | |||
logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % | |||
(self.state, self.accepts, ut.accepts, i, j)) | |||
continue | |||
try: | |||
if ut.token == self.token: # Try exact match first | |||
logging.debug("Exact Match at example [%s][%s]" % (i, j)) | |||
return label | |||
if token_type_match_fallback: | |||
# Fallback to token types match | |||
if (ut.token.type == self.token.type) and not candidate[-1]: | |||
logging.debug("Token Type Fallback at example [%s][%s]" % (i, j)) | |||
candidate = label, True | |||
except AttributeError: | |||
pass | |||
if not candidate[0]: | |||
logging.debug("Same State match at example [%s][%s]" % (i, j)) | |||
candidate = label, False | |||
return candidate[0] | |||
@@ -72,19 +86,20 @@ class UnexpectedInput(LarkError): | |||
class UnexpectedCharacters(LexError, UnexpectedInput): | |||
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): | |||
if isinstance(seq, bytes): | |||
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column) | |||
else: | |||
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) | |||
self.line = line | |||
self.column = column | |||
self.allowed = allowed | |||
self.considered_tokens = considered_tokens | |||
self.pos_in_stream = lex_pos | |||
self.state = state | |||
self.allowed = allowed | |||
self.considered_tokens = considered_tokens | |||
if isinstance(seq, bytes): | |||
_s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") | |||
else: | |||
_s = seq[lex_pos] | |||
message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column) | |||
message += '\n\n' + self.get_context(seq) | |||
if allowed: | |||
message += '\nExpecting: %s\n' % allowed | |||
@@ -97,18 +112,23 @@ class UnexpectedCharacters(LexError, UnexpectedInput): | |||
class UnexpectedToken(ParseError, UnexpectedInput): | |||
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): | |||
self.token = token | |||
self.expected = expected # XXX str shouldn't necessary | |||
self.line = getattr(token, 'line', '?') | |||
self.column = getattr(token, 'column', '?') | |||
self.considered_rules = considered_rules | |||
self.state = state | |||
self.pos_in_stream = getattr(token, 'pos_in_stream', None) | |||
self.state = state | |||
self.token = token | |||
self.expected = expected # XXX deprecate? `accepts` is better | |||
self.considered_rules = considered_rules | |||
self.puppet = puppet | |||
# TODO Only calculate `accepts()` when we need to display it to the user | |||
# This will improve performance when doing automatic error handling | |||
self.accepts = puppet and puppet.accepts() | |||
message = ("Unexpected token %r at line %s, column %s.\n" | |||
"Expected one of: \n\t* %s\n" | |||
% (token, self.line, self.column, '\n\t* '.join(self.expected))) | |||
% (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) | |||
super(UnexpectedToken, self).__init__(message) | |||
@@ -85,7 +85,7 @@ TERMINALS = { | |||
'RULE': '!?[_?]?[a-z][_a-z0-9]*', | |||
'TERMINAL': '_?[A-Z][_A-Z0-9]*', | |||
'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', | |||
'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, | |||
'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, | |||
'_NL': r'(\r?\n)+\s*', | |||
'WS': r'[ \t]+', | |||
'COMMENT': r'\s*//[^\n]*', | |||
@@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace): | |||
term_name = None | |||
elif isinstance(p, PatternRE): | |||
            if p in self.term_reverse: # Kind of a wierd placement
            if p in self.term_reverse: # Kind of a weird placement
term_name = self.term_reverse[p].name | |||
else: | |||
assert False, p | |||
@@ -409,6 +409,13 @@ def _literal_to_pattern(literal): | |||
flags = v[flag_start:] | |||
assert all(f in _RE_FLAGS for f in flags), flags | |||
if literal.type == 'STRING' and '\n' in v: | |||
raise GrammarError('You cannot put newlines in string literals') | |||
if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags: | |||
raise GrammarError('You can only use newlines in regular expressions ' | |||
'with the `x` (verbose) flag') | |||
v = v[:flag_start] | |||
assert v[0] == v[-1] and v[0] in '"/' | |||
x = v[1:-1] | |||
@@ -417,9 +424,11 @@ def _literal_to_pattern(literal): | |||
if literal.type == 'STRING': | |||
s = s.replace('\\\\', '\\') | |||
return { 'STRING': PatternStr, | |||
'REGEXP': PatternRE }[literal.type](s, flags) | |||
return PatternStr(s, flags) | |||
elif literal.type == 'REGEXP': | |||
return PatternRE(s, flags) | |||
else: | |||
assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' | |||
@inline_args | |||
@@ -841,7 +850,7 @@ class GrammarLoader: | |||
if len(stmt.children) > 1: | |||
path_node, arg1 = stmt.children | |||
else: | |||
path_node, = stmt.children | |||
path_node ,= stmt.children | |||
arg1 = None | |||
if isinstance(arg1, Tree): # Multi import | |||
@@ -59,10 +59,10 @@ class _Parser: | |||
try: | |||
return states[state][token.type] | |||
except KeyError: | |||
expected = [s for s in states[state].keys() if s.isupper()] | |||
expected = {s for s in states[state].keys() if s.isupper()} | |||
try: | |||
puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) | |||
except NameError: | |||
except NameError: # For standalone parser | |||
puppet = None | |||
raise UnexpectedToken(token, expected, state=state, puppet=puppet) | |||
@@ -3,8 +3,10 @@ | |||
from copy import deepcopy | |||
from .lalr_analysis import Shift, Reduce | |||
from .. import Token | |||
class ParserPuppet: | |||
class ParserPuppet(object): | |||
def __init__(self, parser, state_stack, value_stack, start, stream, set_state): | |||
self.parser = parser | |||
self._state_stack = state_stack | |||
@@ -16,7 +18,7 @@ class ParserPuppet: | |||
self.result = None | |||
def feed_token(self, token): | |||
"""Advance the parser state, as if it just recieved `token` from the lexer | |||
"""Advance the parser state, as if it just received `token` from the lexer | |||
""" | |||
end_state = self.parser.parse_table.end_states[self._start] | |||
@@ -66,14 +68,27 @@ class ParserPuppet: | |||
self._set_state, | |||
) | |||
def pretty(): | |||
print("Puppet choices:") | |||
for k, v in self.choices.items(): | |||
print('\t-', k, '->', v) | |||
print('stack size:', len(self._state_stack)) | |||
def pretty(self): | |||
out = ["Puppet choices:"] | |||
for k, v in self.choices().items(): | |||
out.append('\t- %s -> %s' % (k, v)) | |||
out.append('stack size: %s' % len(self._state_stack)) | |||
return '\n'.join(out) | |||
def choices(self): | |||
return self.parser.parse_table.states[self._state_stack[-1]] | |||
def accepts(self): | |||
accepts = set() | |||
for t in self.choices(): | |||
new_puppet = self.copy() | |||
try: | |||
new_puppet.feed_token(Token(t, '')) | |||
except KeyError: | |||
pass | |||
else: | |||
accepts.add(t) | |||
return accepts | |||
def resume_parse(self): | |||
return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack) |
@@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER): | |||
tree = l.parse('aA') | |||
self.assertEqual(tree.children, ['a', 'A']) | |||
def test_token_flags_verbose(self): | |||
g = _Lark(r"""start: NL | ABC | |||
ABC: / [a-z] /x | |||
NL: /\n/ | |||
""") | |||
x = g.parse('a') | |||
self.assertEqual(x.children, ['a']) | |||
def test_token_flags_verbose_multiline(self): | |||
g = _Lark(r"""start: ABC | |||
ABC: / a b c | |||
d | |||
e f | |||
/x | |||
""") | |||
x = g.parse('abcdef') | |||
self.assertEqual(x.children, ['abcdef']) | |||
def test_token_multiline_only_works_with_x_flag(self): | |||
g = r"""start: ABC | |||
ABC: / a b c | |||
d | |||
e f | |||
/i | |||
""" | |||
self.assertRaises( GrammarError, _Lark, g) | |||
@unittest.skipIf(PARSER == 'cyk', "No empty rules") | |||
def test_twice_empty(self): | |||