@@ -52,7 +52,7 @@ def parse(json_text):
                '[1,2,]',
                '{"foo":1,}',
                '{"foo":false,"bar":true,}']
        })
        }, use_accepts=True)
        if not exc_class:
            raise
        raise exc_class(u.get_context(json_text), u.line, u.column)
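# A runnable sketch of the error-reporting pattern shown in the hunk above.
# The grammar and the exception labels here are illustrative assumptions, not
# part of this diff; the calling convention (`match_examples(..., use_accepts=True)`)
# is the API the diff adds.
from lark import Lark, UnexpectedInput

json_parser = Lark(r"""
    ?start: value
    ?value: object | array | "true" | "false" | "null" | NUMBER | STRING
    array: "[" [value ("," value)*] "]"
    object: "{" [pair ("," pair)*] "}"
    pair: STRING ":" value
    %import common.ESCAPED_STRING -> STRING
    %import common.SIGNED_NUMBER -> NUMBER
    %import common.WS
    %ignore WS
""", parser='lalr')

class JsonSyntaxError(SyntaxError):
    pass

class JsonTrailingComma(JsonSyntaxError):
    pass

def parse(json_text):
    try:
        return json_parser.parse(json_text)
    except UnexpectedInput as u:
        # use_accepts=True distinguishes parser states that compare equal
        # but accept different terminals (the behaviour this diff adds).
        exc_class = u.match_examples(json_parser.parse, {
            JsonTrailingComma: ['[1,2,]', '{"foo":1,}'],
        }, use_accepts=True)
        if not exc_class:
            raise
        raise exc_class(u.get_context(json_text), u.line, u.column)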
@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-

from typing import Dict, Iterable, Callable, Union
from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set
from .tree import Tree
from .lexer import Token
from .parsers.lalr_puppet import ParserPuppet

class LarkError(Exception):
    pass
@@ -21,27 +21,37 @@ class LexError(LarkError):
    pass

T = TypeVar('T')

class UnexpectedInput(LarkError):
    line: int
    column: int
    pos_in_stream: int
    state: Any

    def get_context(self, text: str, span: int = ...):
        ...

    def match_examples(
        self,
        parse_fn: Callable[[str], Tree],
        examples: Dict[str, Iterable[str]]
    ):
        self,
        parse_fn: Callable[[str], Tree],
        examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]],
        token_type_match_fallback: bool = False,
        use_accepts: bool = False,
    ) -> T:
        ...

class UnexpectedToken(ParseError, UnexpectedInput):
    pass
    expected: Set[str]
    considered_rules: Set[str]
    puppet: ParserPuppet
    accepts: Set[str]

class UnexpectedCharacters(LexError, UnexpectedInput):
    line: int
    column: int
    allowed: Set[str]
    considered_tokens: Set[Any]

class VisitError(LarkError):
@@ -0,0 +1,22 @@
from typing import Set, Dict, Any
from lark import Token, Tree


class ParserPuppet(object):
    """
    Provides an interface to interactively step through the parser (LALR(1) only for now)

    Accessible via `UnexpectedToken.puppet` (raised by the parser on token error)
    """
    def feed_token(self, token: Token): ...

    def copy(self) -> ParserPuppet: ...

    def pretty(self) -> str: ...

    def choices(self) -> Dict[str, Any]: ...

    def accepts(self) -> Set[str]: ...

    def resume_parse(self) -> Tree: ...
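# A hypothetical error-recovery sketch against the stub above: inspect the
# puppet attached to an UnexpectedToken, inject the missing token, and resume.
# The grammar and the injected SEMICOLON terminal are assumptions made for
# illustration, not part of this diff.
from lark import Lark, Token, UnexpectedToken

_parser = Lark('start: "a" ";"', parser='lalr')

def parse_with_recovery(text):
    try:
        return _parser.parse(text)
    except UnexpectedToken as e:
        puppet = e.puppet.copy()              # work on a copy, keep the original intact
        if 'SEMICOLON' in puppet.accepts():   # is ';' a valid terminal at this point?
            puppet.feed_token(Token('SEMICOLON', ';'))
            return puppet.resume_parse()      # continue parsing from the repaired state
        raise

print(parse_with_recovery('a'))   # parses as if the input had been 'a;'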
@@ -1,3 +1,5 @@
import logging

from .utils import STRING_TYPE

###{standalone
@@ -37,34 +39,46 @@ class UnexpectedInput(LarkError):
        after = text[pos:end].split(b'\n', 1)[0]
        return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")

    def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
    def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False):
        """ Given a parser instance and a dictionary mapping some label with
            some malformed syntax examples, it'll return the label for the
            example that best matches the current error.

            It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility.
        """
        assert self.state is not None, "Not supported for this exception"

        if isinstance(examples, dict):
            examples = examples.items()

        candidate = (None, False)
        for label, example in examples.items():
        for i, (label, example) in enumerate(examples):
            assert not isinstance(example, STRING_TYPE)

            for malformed in example:
            for j, malformed in enumerate(example):
                try:
                    parse_fn(malformed)
                except UnexpectedInput as ut:
                    if ut.state == self.state:
                        if use_accepts and ut.accepts != self.accepts:
                            logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
                                          (self.state, self.accepts, ut.accepts, i, j))
                            continue

                        try:
                            if ut.token == self.token:  # Try exact match first
                                logging.debug("Exact Match at example [%s][%s]" % (i, j))
                                return label

                            if token_type_match_fallback:
                                # Fallback to token types match
                                if (ut.token.type == self.token.type) and not candidate[-1]:
                                    logging.debug("Token Type Fallback at example [%s][%s]" % (i, j))
                                    candidate = label, True

                        except AttributeError:
                            pass
                        if not candidate[0]:
                            logging.debug("Same State match at example [%s][%s]" % (i, j))
                            candidate = label, False

        return candidate[0]
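# Note on the reworked loop: `examples` may now be a dict or any iterable of
# (label, examples) pairs, and labels can be arbitrary objects (exception
# classes, enums, strings). A small sketch with hypothetical labels, as it
# would appear inside an `except UnexpectedInput as u:` handler:
labeled_examples = [
    ('missing-value',  ['[1,,2]', '{"foo":}']),
    ('trailing-comma', ['[1,2,]', '{"foo":1,}']),
]
# label = u.match_examples(parser.parse, labeled_examples,
#                          token_type_match_fallback=True, use_accepts=True)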
@@ -72,19 +86,20 @@ class UnexpectedInput(LarkError):

class UnexpectedCharacters(LexError, UnexpectedInput):
    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
        if isinstance(seq, bytes):
            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column)
        else:
            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)

        self.line = line
        self.column = column
        self.allowed = allowed
        self.considered_tokens = considered_tokens
        self.pos_in_stream = lex_pos
        self.state = state

        self.allowed = allowed
        self.considered_tokens = considered_tokens

        if isinstance(seq, bytes):
            _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
        else:
            _s = seq[lex_pos]

        message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column)
        message += '\n\n' + self.get_context(seq)
        if allowed:
            message += '\nExpecting: %s\n' % allowed
@@ -97,18 +112,23 @@ class UnexpectedCharacters(LexError, UnexpectedInput):

class UnexpectedToken(ParseError, UnexpectedInput):
    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
        self.token = token
        self.expected = expected    # XXX str shouldn't necessary
        self.line = getattr(token, 'line', '?')
        self.column = getattr(token, 'column', '?')
        self.considered_rules = considered_rules
        self.state = state
        self.pos_in_stream = getattr(token, 'pos_in_stream', None)
        self.state = state

        self.token = token
        self.expected = expected    # XXX deprecate? `accepts` is better
        self.considered_rules = considered_rules
        self.puppet = puppet

        # TODO Only calculate `accepts()` when we need to display it to the user
        # This will improve performance when doing automatic error handling
        self.accepts = puppet and puppet.accepts()

        message = ("Unexpected token %r at line %s, column %s.\n"
                   "Expected one of: \n\t* %s\n"
                   % (token, self.line, self.column, '\n\t* '.join(self.expected)))
                   % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))

        super(UnexpectedToken, self).__init__(message)
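# Sketch of what the constructor above exposes: `accepts` (computed from the
# puppet, when one is available) is preferred over the raw `expected` set in
# the message, and both stay accessible on the exception. The grammar here is
# an illustrative assumption:
from lark import Lark, UnexpectedToken

_p = Lark('start: "a" "b"', parser='lalr')
try:
    _p.parse('aa')
except UnexpectedToken as e:
    print(e.line, e.column)   # position of the offending token
    print(e.expected)         # terminal names straight from the LALR table
    print(e.accepts)          # terminals the puppet can actually consume here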
@@ -85,7 +85,7 @@ TERMINALS = {
    'RULE': '!?[_?]?[a-z][_a-z0-9]*',
    'TERMINAL': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
    'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
    'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS,
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'\s*//[^\n]*',
@@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
            term_name = None

        elif isinstance(p, PatternRE):
            if p in self.term_reverse: # Kind of a wierd placement.name
            if p in self.term_reverse: # Kind of a weird placement.name
                term_name = self.term_reverse[p].name
            else:
                assert False, p
@@ -409,6 +409,13 @@ def _literal_to_pattern(literal):
    flags = v[flag_start:]
    assert all(f in _RE_FLAGS for f in flags), flags

    if literal.type == 'STRING' and '\n' in v:
        raise GrammarError('You cannot put newlines in string literals')

    if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags:
        raise GrammarError('You can only use newlines in regular expressions '
                           'with the `x` (verbose) flag')

    v = v[:flag_start]
    assert v[0] == v[-1] and v[0] in '"/'
    x = v[1:-1]
@@ -417,9 +424,11 @@ def _literal_to_pattern(literal):

    if literal.type == 'STRING':
        s = s.replace('\\\\', '\\')
        return { 'STRING': PatternStr,
                 'REGEXP': PatternRE }[literal.type](s, flags)
        return PatternStr(s, flags)
    elif literal.type == 'REGEXP':
        return PatternRE(s, flags)
    else:
        assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'

@inline_args
@@ -841,7 +850,7 @@ class GrammarLoader:
            if len(stmt.children) > 1:
                path_node, arg1 = stmt.children
            else:
                path_node, = stmt.children
                path_node ,= stmt.children
                arg1 = None

            if isinstance(arg1, Tree):  # Multi import
@@ -59,10 +59,10 @@ class _Parser:
            try:
                return states[state][token.type]
            except KeyError:
                expected = [s for s in states[state].keys() if s.isupper()]
                expected = {s for s in states[state].keys() if s.isupper()}
                try:
                    puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
                except NameError:
                except NameError:   # For standalone parser
                    puppet = None
                raise UnexpectedToken(token, expected, state=state, puppet=puppet)
@@ -3,8 +3,10 @@

from copy import deepcopy

from .lalr_analysis import Shift, Reduce
from .. import Token


class ParserPuppet:
class ParserPuppet(object):
    def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
        self.parser = parser

        self._state_stack = state_stack
@@ -16,7 +18,7 @@ class ParserPuppet:
        self.result = None

    def feed_token(self, token):
        """Advance the parser state, as if it just recieved `token` from the lexer
        """Advance the parser state, as if it just received `token` from the lexer

        """
        end_state = self.parser.parse_table.end_states[self._start]
@@ -66,14 +68,27 @@ class ParserPuppet:
            self._set_state,
        )

    def pretty():
        print("Puppet choices:")
        for k, v in self.choices.items():
            print('\t-', k, '->', v)
        print('stack size:', len(self._state_stack))

    def pretty(self):
        out = ["Puppet choices:"]
        for k, v in self.choices().items():
            out.append('\t- %s -> %s' % (k, v))
        out.append('stack size: %s' % len(self._state_stack))
        return '\n'.join(out)

    def choices(self):
        return self.parser.parse_table.states[self._state_stack[-1]]

    def accepts(self):
        accepts = set()
        for t in self.choices():
            new_puppet = self.copy()
            try:
                new_puppet.feed_token(Token(t, ''))
            except KeyError:
                pass
            else:
                accepts.add(t)
        return accepts

    def resume_parse(self):
        return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack)
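# Design note on `accepts()` above: `choices()` returns the raw LALR table row
# for the current state, which can include entries that a later reduction
# would still reject; `accepts()` trial-feeds each candidate into a throwaway
# copy and keeps only the ones the parser genuinely consumes. A short sketch
# (the grammar is an assumption for illustration):
from lark import Lark, UnexpectedToken

_p = Lark('start: "a" "b"', parser='lalr')
try:
    _p.parse('aa')
except UnexpectedToken as e:
    raw = set(e.puppet.choices())   # every table entry for this state
    ok = e.puppet.accepts()         # filtered by actually feeding a copy
    assert ok <= raw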
@@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER):
            tree = l.parse('aA')
            self.assertEqual(tree.children, ['a', 'A'])

        def test_token_flags_verbose(self):
            g = _Lark(r"""start: NL | ABC
                          ABC: / [a-z] /x
                          NL: /\n/
                      """)
            x = g.parse('a')
            self.assertEqual(x.children, ['a'])

        def test_token_flags_verbose_multiline(self):
            g = _Lark(r"""start: ABC
                          ABC: / a b c
                                 d
                                 e f
                             /x
                       """)
            x = g.parse('abcdef')
            self.assertEqual(x.children, ['abcdef'])

        def test_token_multiline_only_works_with_x_flag(self):
            g = r"""start: ABC
                    ABC: / a b c
                           d
                           e f
                       /i
                    """
            self.assertRaises( GrammarError, _Lark, g)

        @unittest.skipIf(PARSER == 'cyk', "No empty rules")
        def test_twice_empty(self):