@@ -52,7 +52,7 @@ def parse(json_text):
                 '[1,2,]',
                 '{"foo":1,}',
                 '{"foo":false,"bar":true,}']
-        })
+        }, use_accepts=True)
         if not exc_class:
             raise
         raise exc_class(u.get_context(json_text), u.line, u.column)
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-

-from typing import Dict, Iterable, Callable, Union
+from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set
 from .tree import Tree
 from .lexer import Token
+from .parsers.lalr_puppet import ParserPuppet


 class LarkError(Exception):
     pass
@@ -21,27 +21,37 @@ class LexError(LarkError):
     pass


+T = TypeVar('T')
+
+
 class UnexpectedInput(LarkError):
+    line: int
+    column: int
     pos_in_stream: int
+    state: Any

     def get_context(self, text: str, span: int = ...):
         ...

     def match_examples(
-        self,
-        parse_fn: Callable[[str], Tree],
-        examples: Dict[str, Iterable[str]]
-    ):
+        self,
+        parse_fn: Callable[[str], Tree],
+        examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]],
+        token_type_match_fallback: bool = False,
+        use_accepts: bool = False,
+    ) -> T:
         ...


 class UnexpectedToken(ParseError, UnexpectedInput):
-    pass
+    expected: Set[str]
+    considered_rules: Set[str]
+    puppet: ParserPuppet
+    accepts: Set[str]


 class UnexpectedCharacters(LexError, UnexpectedInput):
+    line: int
+    column: int
+    allowed: Set[str]
+    considered_tokens: Set[Any]


 class VisitError(LarkError):
@@ -0,0 +1,22 @@
+from typing import Set, Dict, Any
+
+from lark import Token, Tree
+
+
+class ParserPuppet(object):
+    """
+    Provides an interface to interactively step through the parser (LALR(1) only for now)
+
+    Accessible via `UnexpectedToken.puppet` (raised by the parser on token error)
+    """
+    def feed_token(self, token: Token): ...
+
+    def copy(self) -> ParserPuppet: ...
+
+    def pretty(self) -> str: ...
+
+    def choices(self) -> Dict[str, Any]: ...
+
+    def accepts(self) -> Set[str]: ...
+
+    def resume_parse(self) -> Tree: ...
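
The stub above mirrors the runtime class added in `lalr_puppet.py` further down. A minimal usage sketch of the flow its docstring advertises, assuming an illustrative LALR grammar and input (neither is part of this diff):

```python
from lark import Lark
from lark.exceptions import UnexpectedToken

# Hypothetical grammar, for illustration only
parser = Lark('start: "a" "b" "c"', parser='lalr')

try:
    parser.parse('ac')  # lexes fine, but 'c' arrives where 'b' is expected
except UnexpectedToken as e:
    puppet = e.puppet        # parser state at the moment of the error
    print(puppet.pretty())   # human-readable dump of the current parse choices
    print(puppet.accepts())  # terminal names that could be fed next
```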
@@ -1,3 +1,5 @@
+import logging
+
 from .utils import STRING_TYPE

 ###{standalone
@@ -37,34 +39,46 @@ class UnexpectedInput(LarkError):
         after = text[pos:end].split(b'\n', 1)[0]
         return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")

-    def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
+    def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False):
         """ Given a parser instance and a dictionary mapping some label with
             some malformed syntax examples, it'll return the label for the
             example that best matches the current error.
+
+            It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility.
         """
         assert self.state is not None, "Not supported for this exception"

+        if isinstance(examples, dict):
+            examples = examples.items()
+
         candidate = (None, False)
-        for label, example in examples.items():
+        for i, (label, example) in enumerate(examples):
             assert not isinstance(example, STRING_TYPE)

-            for malformed in example:
+            for j, malformed in enumerate(example):
                 try:
                     parse_fn(malformed)
                 except UnexpectedInput as ut:
                     if ut.state == self.state:
+                        if use_accepts and ut.accepts != self.accepts:
+                            logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
+                                          (self.state, self.accepts, ut.accepts, i, j))
+                            continue
                         try:
                             if ut.token == self.token:  # Try exact match first
+                                logging.debug("Exact Match at example [%s][%s]" % (i, j))
                                 return label

                             if token_type_match_fallback:
                                 # Fallback to token types match
                                 if (ut.token.type == self.token.type) and not candidate[-1]:
+                                    logging.debug("Token Type Fallback at example [%s][%s]" % (i, j))
                                     candidate = label, True

                         except AttributeError:
                             pass
                         if not candidate[0]:
+                            logging.debug("Same State match at example [%s][%s]" % (i, j))
                             candidate = label, False

         return candidate[0]
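
A sketch of the updated call, modeled on the `error_reporting_lalr` example at the top of this diff (`json_parser` is assumed to be a LALR JSON parser and `JsonSyntaxError` is borrowed from that example; both are illustrative). `examples` may now be either a dict or an iterable of `(label, examples)` pairs, and `use_accepts=True` opts into the stricter comparison:

```python
from lark import UnexpectedInput

class JsonSyntaxError(SyntaxError):
    pass

def parse(json_text):
    try:
        return json_parser.parse(json_text)
    except UnexpectedInput as u:
        # Each label maps to inputs known to fail; the label whose example
        # fails in the same parser state (and, with use_accepts=True, with
        # the same set of acceptable terminals) wins.
        exc_class = u.match_examples(json_parser.parse, {
            JsonSyntaxError: ['[1,2,]', '{"foo":1,}'],
        }, use_accepts=True)
        if not exc_class:
            raise
        raise exc_class(u.get_context(json_text), u.line, u.column)
```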
@@ -72,19 +86,20 @@ class UnexpectedInput(LarkError):

 class UnexpectedCharacters(LexError, UnexpectedInput):
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
-        if isinstance(seq, bytes):
-            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column)
-        else:
-            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)
-
         self.line = line
         self.column = column
-        self.allowed = allowed
-        self.considered_tokens = considered_tokens
         self.pos_in_stream = lex_pos
         self.state = state
+        self.allowed = allowed
+        self.considered_tokens = considered_tokens
+
+        if isinstance(seq, bytes):
+            _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
+        else:
+            _s = seq[lex_pos]
+        message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column)

         message += '\n\n' + self.get_context(seq)
         if allowed:
             message += '\nExpecting: %s\n' % allowed
@@ -97,18 +112,23 @@ class UnexpectedCharacters(LexError, UnexpectedInput):

 class UnexpectedToken(ParseError, UnexpectedInput):
     def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
-        self.token = token
-        self.expected = expected    # XXX str shouldn't necessary
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
-        self.considered_rules = considered_rules
-        self.state = state
         self.pos_in_stream = getattr(token, 'pos_in_stream', None)
+        self.state = state
+
+        self.token = token
+        self.expected = expected    # XXX deprecate? `accepts` is better
+        self.considered_rules = considered_rules
         self.puppet = puppet

+        # TODO Only calculate `accepts()` when we need to display it to the user
+        # This will improve performance when doing automatic error handling
+        self.accepts = puppet and puppet.accepts()
+
         message = ("Unexpected token %r at line %s, column %s.\n"
                    "Expected one of: \n\t* %s\n"
-                   % (token, self.line, self.column, '\n\t* '.join(self.expected)))
+                   % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))

         super(UnexpectedToken, self).__init__(message)
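
Callers can now prefer the computed set; a small sketch, reusing `parser` and the imports from the puppet example above. `accepts` is falsy when no puppet exists (e.g. in the standalone parser), so falling back to `expected` preserves the old behaviour:

```python
try:
    parser.parse('ac')
except UnexpectedToken as e:
    # `accepts` comes from actually probing the parse table via the puppet,
    # so it is usually tighter than `expected`.
    print(e.accepts or e.expected)
```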
@@ -85,7 +85,7 @@ TERMINALS = {
     'RULE': '!?[_?]?[a-z][_a-z0-9]*',
     'TERMINAL': '_?[A-Z][_A-Z0-9]*',
     'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
-    'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
+    'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS,
     '_NL': r'(\r?\n)+\s*',
     'WS': r'[ \t]+',
     'COMMENT': r'\s*//[^\n]*',
@@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
             term_name = None

         elif isinstance(p, PatternRE):
-            if p in self.term_reverse:  # Kind of a wierd placement.name
+            if p in self.term_reverse:  # Kind of a weird placement.name
                 term_name = self.term_reverse[p].name
             else:
                 assert False, p
@@ -409,6 +409,13 @@ def _literal_to_pattern(literal):
     flags = v[flag_start:]
     assert all(f in _RE_FLAGS for f in flags), flags

+    if literal.type == 'STRING' and '\n' in v:
+        raise GrammarError('You cannot put newlines in string literals')
+
+    if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags:
+        raise GrammarError('You can only use newlines in regular expressions '
+                           'with the `x` (verbose) flag')
+
     v = v[:flag_start]
     assert v[0] == v[-1] and v[0] in '"/'
     x = v[1:-1]
@@ -417,9 +424,11 @@ def _literal_to_pattern(literal):

     if literal.type == 'STRING':
         s = s.replace('\\\\', '\\')
-
-    return { 'STRING': PatternStr,
-             'REGEXP': PatternRE }[literal.type](s, flags)
+        return PatternStr(s, flags)
+    elif literal.type == 'REGEXP':
+        return PatternRE(s, flags)
+    else:
+        assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'

 @inline_args
@@ -841,7 +850,7 @@ class GrammarLoader:
             if len(stmt.children) > 1:
                 path_node, arg1 = stmt.children
             else:
-                path_node, = stmt.children
+                path_node ,= stmt.children
                 arg1 = None

             if isinstance(arg1, Tree):  # Multi import
@@ -59,10 +59,10 @@ class _Parser:
         try:
             return states[state][token.type]
         except KeyError:
-            expected = [s for s in states[state].keys() if s.isupper()]
+            expected = {s for s in states[state].keys() if s.isupper()}
             try:
                 puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
-            except NameError:
+            except NameError:  # For standalone parser
                 puppet = None
             raise UnexpectedToken(token, expected, state=state, puppet=puppet)
@@ -3,8 +3,10 @@
 from copy import deepcopy

 from .lalr_analysis import Shift, Reduce
+from .. import Token
+

-class ParserPuppet:
+class ParserPuppet(object):
     def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
         self.parser = parser
         self._state_stack = state_stack
@@ -16,7 +18,7 @@ class ParserPuppet:
         self.result = None

     def feed_token(self, token):
-        """Advance the parser state, as if it just recieved `token` from the lexer
+        """Advance the parser state, as if it just received `token` from the lexer
         """
         end_state = self.parser.parse_table.end_states[self._start]
@@ -66,14 +68,27 @@ class ParserPuppet:
             self._set_state,
         )

-    def pretty():
-        print("Puppet choices:")
-        for k, v in self.choices.items():
-            print('\t-', k, '->', v)
-        print('stack size:', len(self._state_stack))
+    def pretty(self):
+        out = ["Puppet choices:"]
+        for k, v in self.choices().items():
+            out.append('\t- %s -> %s' % (k, v))
+        out.append('stack size: %s' % len(self._state_stack))
+        return '\n'.join(out)

     def choices(self):
         return self.parser.parse_table.states[self._state_stack[-1]]

+    def accepts(self):
+        accepts = set()
+        for t in self.choices():
+            new_puppet = self.copy()
+            try:
+                new_puppet.feed_token(Token(t, ''))
+            except KeyError:
+                pass
+            else:
+                accepts.add(t)
+        return accepts
+
     def resume_parse(self):
         return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack)
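
One design note on `accepts()`: it brute-forces the answer by copying the puppet once per candidate terminal and feeding it a dummy token; a `KeyError` from the parse table means that terminal is rejected. That per-terminal copy is the cost the `TODO` in `UnexpectedToken.__init__` refers to when it suggests computing the set lazily rather than on every error.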
@@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER):
             tree = l.parse('aA')
             self.assertEqual(tree.children, ['a', 'A'])

+        def test_token_flags_verbose(self):
+            g = _Lark(r"""start: NL | ABC
+                          ABC: / [a-z] /x
+                          NL: /\n/
+                      """)
+            x = g.parse('a')
+            self.assertEqual(x.children, ['a'])
+
+        def test_token_flags_verbose_multiline(self):
+            g = _Lark(r"""start: ABC
+                          ABC: / a b c
+                               d
+                               e f
+                          /x
+                      """)
+            x = g.parse('abcdef')
+            self.assertEqual(x.children, ['abcdef'])
+
+        def test_token_multiline_only_works_with_x_flag(self):
+            g = r"""start: ABC
+                    ABC: / a b c
+                         d
+                         e f
+                    /i
+                    """
+            self.assertRaises( GrammarError, _Lark, g)
+
         @unittest.skipIf(PARSER == 'cyk', "No empty rules")
         def test_twice_empty(self):