@@ -52,7 +52,7 @@ def parse(json_text):
                                 '[1,2,]',
                                 '{"foo":1,}',
                                 '{"foo":false,"bar":true,}']
-        })
+        }, use_accepts=True)
         if not exc_class:
             raise
         raise exc_class(u.get_context(json_text), u.line, u.column)
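The labels passed to `match_examples` above are the example script's own exception classes, defined earlier in the same file. A hedged reconstruction of their shape, for context only (not part of this diff):

    class JsonSyntaxError(SyntaxError):
        # Subclasses carry a human-readable `label`; parse() re-raises the
        # matched class with (context, line, column) as its args.
        def __str__(self):
            context, line, column = self.args
            return '%s at line %s, column %s.\n\n%s' % (self.label, line, column, context)

    class JsonTrailingComma(JsonSyntaxError):
        label = 'Trailing comma'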
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
-from typing import Dict, Iterable, Callable, Union
+from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set
 from .tree import Tree
 from .lexer import Token
+from .parsers.lalr_puppet import ParserPuppet

 class LarkError(Exception):
     pass

@@ -21,27 +21,37 @@ class LexError(LarkError):
     pass

+T = TypeVar('T')
+

 class UnexpectedInput(LarkError):
+    line: int
+    column: int
     pos_in_stream: int
+    state: Any

     def get_context(self, text: str, span: int = ...):
         ...

     def match_examples(
-        self,
-        parse_fn: Callable[[str], Tree],
-        examples: Dict[str, Iterable[str]]
-    ):
+        self,
+        parse_fn: Callable[[str], Tree],
+        examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]],
+        token_type_match_fallback: bool = False,
+        use_accepts: bool = False,
+    ) -> T:
         ...


 class UnexpectedToken(ParseError, UnexpectedInput):
-    pass
+    expected: Set[str]
+    considered_rules: Set[str]
+    puppet: ParserPuppet
+    accepts: Set[str]


 class UnexpectedCharacters(LexError, UnexpectedInput):
+    line: int
+    column: int
+    allowed: Set[str]
+    considered_tokens: Set[Any]


 class VisitError(LarkError):
@@ -0,0 +1,22 @@
+from typing import Set, Dict, Any
+
+from lark import Token, Tree
+
+
+class ParserPuppet(object):
+    """
+    Provides an interface to interactively step through the parser (LALR(1) only for now)
+
+    Accessible via `UnexpectedToken.puppet` (raised by the parser on token error)
+    """
+    def feed_token(self, token: Token): ...
+
+    def copy(self) -> ParserPuppet: ...
+
+    def pretty(self) -> str: ...
+
+    def choices(self) -> Dict[str, Any]: ...
+
+    def accepts(self) -> Set[str]: ...
+
+    def resume_parse(self) -> Tree: ...
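The stub above is the puppet's whole public surface. A minimal runnable sketch of the intended recovery workflow, assuming a LALR parser and Lark's convention of naming the anonymous terminal for "b" as `B` (the grammar and the recovery step are illustrative, not taken from this diff):

    from lark import Lark, Token
    from lark.exceptions import UnexpectedToken

    parser = Lark('start: "a" "b" "c"', parser='lalr')

    try:
        parser.parse('ac')                 # the 'b' is missing
    except UnexpectedToken as e:
        print(e.puppet.pretty())           # shift/reduce choices at the error point
        fixed = e.puppet.copy()            # work on a copy, keep the original intact
        fixed.feed_token(Token('B', 'b'))  # inject the missing token
        fixed.feed_token(e.token)          # replay the token that triggered the error
        print(fixed.resume_parse())        # finishes the parse and returns the tree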
@@ -1,3 +1,5 @@
+import logging
+
 from .utils import STRING_TYPE

 ###{standalone

@@ -37,34 +39,46 @@ class UnexpectedInput(LarkError):
         after = text[pos:end].split(b'\n', 1)[0]
         return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")

-    def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
+    def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False):
         """ Given a parser instance and a dictionary mapping some label with
             some malformed syntax examples, it'll return the label for the
             example that bests matches the current error.
+
+            It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility.
         """
         assert self.state is not None, "Not supported for this exception"

+        if isinstance(examples, dict):
+            examples = examples.items()
+
         candidate = (None, False)
-        for label, example in examples.items():
+        for i, (label, example) in enumerate(examples):
             assert not isinstance(example, STRING_TYPE)

-            for malformed in example:
+            for j, malformed in enumerate(example):
                 try:
                     parse_fn(malformed)
                 except UnexpectedInput as ut:
                     if ut.state == self.state:
+                        if use_accepts and ut.accepts != self.accepts:
+                            logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
+                                          (self.state, self.accepts, ut.accepts, i, j))
+                            continue
                         try:
                             if ut.token == self.token:  # Try exact match first
+                                logging.debug("Exact Match at example [%s][%s]" % (i, j))
                                 return label

                             if token_type_match_fallback:
                                 # Fallback to token types match
                                 if (ut.token.type == self.token.type) and not candidate[-1]:
+                                    logging.debug("Token Type Fallback at example [%s][%s]" % (i, j))
                                     candidate = label, True

                         except AttributeError:
                             pass
                         if not candidate[0]:
+                            logging.debug("Same State match at example [%s][%s]" % (i, j))
                             candidate = label, False

         return candidate[0]
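To make the new control flow concrete, here is a small self-contained sketch. The grammar and labels are hypothetical, and whether two inputs share an LALR state depends on the grammar, so treat the printed results as illustrative:

    from lark import Lark
    from lark.exceptions import UnexpectedInput

    parser = Lark(r'''
        start: "[" [NUMBER ("," NUMBER)*] "]"
        NUMBER: /[0-9]+/
        %ignore " "
    ''', parser='lalr')

    def classify(text):
        try:
            parser.parse(text)
        except UnexpectedInput as u:
            # Ordered (label, examples) pairs now work as well as dicts.
            # With use_accepts=True an example only matches if its set of
            # acceptable terminals agrees too, not just the state number.
            return u.match_examples(parser.parse, [
                ('trailing comma', ['[1,]', '[1,2,]']),
                ('unclosed list', ['[1', '[1,2']),
            ], use_accepts=True)

    print(classify('[5,6,]'))  # expected: 'trailing comma'
    print(classify('[5,6'))    # expected: 'unclosed list'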
@@ -72,19 +86,20 @@ class UnexpectedInput(LarkError):

 class UnexpectedCharacters(LexError, UnexpectedInput):
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
-        if isinstance(seq, bytes):
-            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column)
-        else:
-            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)
         self.line = line
         self.column = column
-        self.allowed = allowed
-        self.considered_tokens = considered_tokens
         self.pos_in_stream = lex_pos
         self.state = state

+        self.allowed = allowed
+        self.considered_tokens = considered_tokens
+
+        if isinstance(seq, bytes):
+            _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
+        else:
+            _s = seq[lex_pos]
+        message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column)
         message += '\n\n' + self.get_context(seq)
         if allowed:
             message += '\nExpecting: %s\n' % allowed
@@ -97,18 +112,23 @@ class UnexpectedCharacters(LexError, UnexpectedInput):

 class UnexpectedToken(ParseError, UnexpectedInput):
     def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
-        self.token = token
-        self.expected = expected    # XXX str shouldn't necessary
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
-        self.considered_rules = considered_rules
-        self.state = state
         self.pos_in_stream = getattr(token, 'pos_in_stream', None)
+        self.state = state
+
+        self.token = token
+        self.expected = expected    # XXX deprecate? `accepts` is better
+        self.considered_rules = considered_rules
         self.puppet = puppet
+
+        # TODO Only calculate `accepts()` when we need to display it to the user
+        # This will improve performance when doing automatic error handling
+        self.accepts = puppet and puppet.accepts()

         message = ("Unexpected token %r at line %s, column %s.\n"
                    "Expected one of: \n\t* %s\n"
-                   % (token, self.line, self.column, '\n\t* '.join(self.expected)))
+                   % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))

         super(UnexpectedToken, self).__init__(message)
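Under the same assumptions as the sketches above (a LALR parser, so a puppet exists; with the standalone parser `puppet` is None and `accepts` is falsy, so the message falls back to `expected`), the difference between the two attributes can be inspected directly:

    from lark import Lark
    from lark.exceptions import UnexpectedToken

    parser = Lark('start: "a" "b"', parser='lalr')

    try:
        parser.parse('a')          # input ends where "b" is required
    except UnexpectedToken as e:
        print(e.expected)          # raw uppercase keys of the parse-table row
        print(e.accepts)           # terminals verified by probing the puppet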
@@ -85,7 +85,7 @@ TERMINALS = {
     'RULE': '!?[_?]?[a-z][_a-z0-9]*',
     'TERMINAL': '_?[A-Z][_A-Z0-9]*',
     'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
-    'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
+    'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS,
     '_NL': r'(\r?\n)+\s*',
     'WS': r'[ \t]+',
     'COMMENT': r'\s*//[^\n]*',

@@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
             term_name = None

         elif isinstance(p, PatternRE):
-            if p in self.term_reverse: # Kind of a wierd placement.name
+            if p in self.term_reverse: # Kind of a weird placement.name
                 term_name = self.term_reverse[p].name
             else:
                 assert False, p

@@ -409,6 +409,13 @@ def _literal_to_pattern(literal):
     flags = v[flag_start:]
     assert all(f in _RE_FLAGS for f in flags), flags

+    if literal.type == 'STRING' and '\n' in v:
+        raise GrammarError('You cannot put newlines in string literals')
+
+    if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags:
+        raise GrammarError('You can only use newlines in regular expressions '
+                           'with the `x` (verbose) flag')
+
     v = v[:flag_start]
     assert v[0] == v[-1] and v[0] in '"/'
     x = v[1:-1]

@@ -417,9 +424,11 @@ def _literal_to_pattern(literal):

     if literal.type == 'STRING':
         s = s.replace('\\\\', '\\')
-
-    return { 'STRING': PatternStr,
-             'REGEXP': PatternRE }[literal.type](s, flags)
+        return PatternStr(s, flags)
+    elif literal.type == 'REGEXP':
+        return PatternRE(s, flags)
+    else:
+        assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'

 @inline_args
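At the grammar level, the combined effect of the relaxed REGEXP terminal and the new checks is that newlines are now lexable inside /.../ literals, but only accepted when the `x` (verbose) flag is present. A small sketch with a hypothetical grammar:

    from lark import Lark
    from lark.exceptions import GrammarError

    # Newlines in a regexp literal require the `x` flag, which also makes
    # the regex engine ignore unescaped whitespace inside the pattern.
    g = Lark(r'''
        start: ABC
        ABC: / a b
               c /x
    ''', parser='lalr')
    print(g.parse('abc'))   # whitespace and newlines in the pattern are ignored

    try:
        Lark(r'''
            start: ABC
            ABC: / a b
                   c /i
        ''', parser='lalr')
    except GrammarError as e:
        print(e)            # rejected at load time: newlines need the `x` flag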
@@ -841,7 +850,7 @@ class GrammarLoader:
             if len(stmt.children) > 1:
                 path_node, arg1 = stmt.children
             else:
-                path_node, = stmt.children
+                path_node ,= stmt.children
                 arg1 = None

             if isinstance(arg1, Tree):  # Multi import

@@ -59,10 +59,10 @@ class _Parser:
         try:
             return states[state][token.type]
         except KeyError:
-            expected = [s for s in states[state].keys() if s.isupper()]
+            expected = {s for s in states[state].keys() if s.isupper()}
             try:
                 puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
-            except NameError:
+            except NameError:   # For standalone parser
                 puppet = None
             raise UnexpectedToken(token, expected, state=state, puppet=puppet)
@@ -3,8 +3,10 @@
 from copy import deepcopy

 from .lalr_analysis import Shift, Reduce
+from .. import Token
+

-class ParserPuppet:
+class ParserPuppet(object):
     def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
         self.parser = parser
         self._state_stack = state_stack

@@ -16,7 +18,7 @@ class ParserPuppet:
         self.result = None

     def feed_token(self, token):
-        """Advance the parser state, as if it just recieved `token` from the lexer
+        """Advance the parser state, as if it just received `token` from the lexer

         """
         end_state = self.parser.parse_table.end_states[self._start]

@@ -66,14 +68,27 @@ class ParserPuppet:
             self._set_state,
         )

-    def pretty():
-        print("Puppet choices:")
-        for k, v in self.choices.items():
-            print('\t-', k, '->', v)
-        print('stack size:', len(self._state_stack))
+    def pretty(self):
+        out = ["Puppet choices:"]
+        for k, v in self.choices().items():
+            out.append('\t- %s -> %s' % (k, v))
+        out.append('stack size: %s' % len(self._state_stack))
+        return '\n'.join(out)

     def choices(self):
         return self.parser.parse_table.states[self._state_stack[-1]]

+    def accepts(self):
+        accepts = set()
+        for t in self.choices():
+            new_puppet = self.copy()
+            try:
+                new_puppet.feed_token(Token(t, ''))
+            except KeyError:
+                pass
+            else:
+                accepts.add(t)
+        return accepts
+
     def resume_parse(self):
         return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack)
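`accepts()` works by copying the puppet once per candidate terminal and tentatively feeding a blank token of that type; candidates that raise KeyError are filtered out. Because only copies are fed, the original puppet survives the probe, which the following sketch relies on (grammar is hypothetical; terminal names assume Lark's anonymous-terminal naming):

    from lark import Lark, Token
    from lark.exceptions import UnexpectedToken

    parser = Lark('start: "x" "y" | "x" "z"', parser='lalr')

    try:
        parser.parse('x')            # stops where "y" or "z" must follow
    except UnexpectedToken as e:
        p = e.puppet
        print(p.accepts())           # e.g. {'Y', 'Z'} (anonymous terminal names)
        print(p.accepts())           # unchanged: probing consumed only copies
        p.feed_token(Token('Y', 'y'))
        print(p.resume_parse())      # the original puppet still completes the parse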
@@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER):
             tree = l.parse('aA')
             self.assertEqual(tree.children, ['a', 'A'])

+        def test_token_flags_verbose(self):
+            g = _Lark(r"""start: NL | ABC
+                          ABC: / [a-z] /x
+                          NL: /\n/
+                          """)
+            x = g.parse('a')
+            self.assertEqual(x.children, ['a'])
+
+        def test_token_flags_verbose_multiline(self):
+            g = _Lark(r"""start: ABC
+                          ABC: / a b c
+                                 d
+                                e f
+                              /x
+                          """)
+            x = g.parse('abcdef')
+            self.assertEqual(x.children, ['abcdef'])
+
+        def test_token_multiline_only_works_with_x_flag(self):
+            g = r"""start: ABC
+                    ABC: / a b c
+                           d
+                          e f
+                        /i
+                    """
+            self.assertRaises( GrammarError, _Lark, g)
+
         @unittest.skipIf(PARSER == 'cyk', "No empty rules")
         def test_twice_empty(self):