rename LexerConf.tokens to LexerConf.terminals
Make Exception message generation lazy
Made a few classes new-style
@@ -1,14 +1,16 @@
+from warnings import warn
 from .utils import Serialize
 from .lexer import TerminalDef
 ###{standalone
 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
+    __serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes'
     __serialize_namespace__ = TerminalDef,
-    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
-        self.tokens = tokens  # TODO should be terminals
+    def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
+        self.terminals = terminals
         self.ignore = ignore
         self.postlex = postlex
         self.callbacks = callbacks or {}
@@ -16,6 +18,11 @@ class LexerConf(Serialize):
         self.re_module = re_module
         self.skip_validation = skip_validation
         self.use_bytes = use_bytes
+    @property
+    def tokens(self):
+        warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning)
+        return self.terminals
 ###}
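
The deprecation shim keeps old call sites working while steering users toward the new name. A minimal, self-contained sketch of the same pattern (the class and attribute names here are illustrative, not lark's API):

    from warnings import warn

    class Config(object):
        def __init__(self, terminals):
            self.terminals = terminals

        @property
        def tokens(self):
            # Deprecated alias: warn, then delegate to the new attribute.
            warn("Config.tokens is deprecated. Use Config.terminals instead", DeprecationWarning)
            return self.terminals

    c = Config(['NUMBER', 'PLUS'])
    assert c.tokens == c.terminals  # emits a DeprecationWarning on access
                                    # (run with `python -W default` to see it)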
@@ -34,6 +34,7 @@ class UnexpectedInput(LarkError):
     After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
     """
     pos_in_stream = None
+    _all_terminals = None
     def get_context(self, text, span=40):
         """Returns a pretty string pinpointing the error in the text,
@@ -109,32 +110,54 @@ class UnexpectedInput(LarkError):
                 candidate = label, False
         return candidate[0]
+    def _format_terminals(self, names):
+        if self._all_terminals:
+            ts = []
+            for name in names:
+                try:
+                    ts.append(next(t.nice_print for t in self._all_terminals if t.name == name))
+                except StopIteration:
+                    # If we don't find the corresponding Terminal (which *should* never happen), don't error.
+                    # A broken __str__ on an Exception is one of the worst bugs, so just fall back to the raw name.
+                    ts.append(name)
+        else:
+            ts = names
+        return "Expected one of: \n\t* %s\n" % '\n\t* '.join(ts)
 class UnexpectedCharacters(LexError, UnexpectedInput):
-    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
+    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, _all_terminals=None):
         self.line = line
         self.column = column
         self.pos_in_stream = lex_pos
         self.state = state
+        self._all_terminals = _all_terminals
         self.allowed = allowed
         self.considered_tokens = considered_tokens
         self.token_history = token_history
         if isinstance(seq, bytes):
-            _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
+            self._s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
         else:
-            _s = seq[lex_pos]
-        message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column)
-        message += '\n\n' + self.get_context(seq)
-        if allowed:
-            message += '\nExpecting: %s\n' % allowed
-        if token_history:
-            message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history)
-        super(UnexpectedCharacters, self).__init__(message)
+            self._s = seq[lex_pos]
+        self._context = self.get_context(seq)
+        super(UnexpectedCharacters, self).__init__()
+    def __str__(self):
+        # Be aware: a broken __str__ for an Exception is terrible to debug. Leave as little room for errors as possible;
+        # otherwise you will get just `UnexpectedCharacters: <str() failed>` or something like that.
+        # If you run into this, add an `except Exception as e: print(e); raise e` or similar.
+        message = "No terminal defined for '%s' at line %d col %d" % (self._s, self.line, self.column)
+        message += '\n\n' + self._context
+        if self.allowed:
+            message += self._format_terminals(self.allowed)
+        if self.token_history:
+            message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history)
+        return message
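
This is the payoff of the lazy pattern: constructing the exception stays cheap, and the (potentially expensive) message is only built if someone actually renders it. A minimal sketch of the idea, independent of lark:

    class LazyError(Exception):
        def __init__(self, token, expected):
            self.token = token
            self.expected = expected
            super(LazyError, self).__init__()

        def __str__(self):
            # Built only when the exception is displayed, e.g. in a traceback.
            # Keep this simple: if __str__ itself raises, Python just shows "<str() failed>".
            return "Unexpected %r, expected one of:\n\t* %s" % (
                self.token, '\n\t* '.join(sorted(self.expected)))

    err = LazyError(';', {'COMMA', 'RPAR'})  # cheap: no string formatting happens here
    print(str(err))                          # the message is produced on demand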
 class UnexpectedToken(ParseError, UnexpectedInput):
     """When the parser throws UnexpectedToken, it instantiates a puppet
@@ -143,7 +166,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):
     see: :ref:`ParserPuppet`.
     """
-    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
+    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, all_terminals=None):
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
         self.pos_in_stream = getattr(token, 'pos_in_stream', None)
@@ -153,16 +176,20 @@ class UnexpectedToken(ParseError, UnexpectedInput):
         self.expected = expected  # XXX deprecate? `accepts` is better
         self.considered_rules = considered_rules
         self.puppet = puppet
-        # TODO Only calculate `accepts()` when we need to display it to the user
-        # This will improve performance when doing automatic error handling
-        self.accepts = puppet and puppet.accepts()
-        message = ("Unexpected token %r at line %s, column %s.\n"
-                   "Expected one of: \n\t* %s\n"
-                   % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))
-        super(UnexpectedToken, self).__init__(message)
+        self._all_terminals = all_terminals
+        super(UnexpectedToken, self).__init__()
+    @property
+    def accepts(self):
+        return self.puppet and self.puppet.accepts()
+    def __str__(self):
+        # Be aware: a broken __str__ for an Exception is terrible to debug. Leave as little room for errors as possible.
+        message = ("Unexpected token %r at line %s, column %s.\n%s"
+                   % (self.token, self.line, self.column, self._format_terminals(self.accepts or self.expected)))
+        return message
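
One subtlety of turning `accepts` into a property: it is recomputed on every access. If that ever shows up in profiles, a memoized variant (a hypothetical alternative, not part of this diff) would compute it once:

    class UnexpectedTokenSketch(Exception):
        def __init__(self, puppet):
            self.puppet = puppet
            self._accepts = None
            super(UnexpectedTokenSketch, self).__init__()

        @property
        def accepts(self):
            # Compute on first access instead of in __init__, then cache.
            if self._accepts is None and self.puppet is not None:
                self._accepts = self.puppet.accepts()
            return self._accepts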
 class VisitError(LarkError):
@@ -402,7 +402,7 @@ class Lark(Serialize):
             self._callbacks,
             self.options,  # Not all, but multiple attributes are used
         )
-        self.terminals = self.parser.lexer_conf.tokens
+        self.terminals = self.parser.lexer_conf.terminals
         self._terminals_dict = {t.name: t for t in self.terminals}
         return self
@@ -76,14 +76,15 @@ class PatternRE(Pattern):
 class TerminalDef(Serialize):
-    __serialize_fields__ = 'name', 'pattern', 'priority'
+    __serialize_fields__ = 'name', 'pattern', 'priority', 'nice_print'
     __serialize_namespace__ = PatternStr, PatternRE
-    def __init__(self, name, pattern, priority=1):
+    def __init__(self, name, pattern, priority=1, nice_print=None):
         assert isinstance(pattern, Pattern), pattern
         self.name = name
         self.pattern = pattern
         self.priority = priority
+        self.nice_print = nice_print or name
     def __repr__(self):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
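
With the new field, a terminal carries a human-friendly label that falls back to its name. A hedged usage sketch (assuming lark 0.11-era imports, where `PatternStr` takes the literal string):

    from lark.lexer import TerminalDef, PatternStr

    comma = TerminalDef('COMMA', PatternStr(','), nice_print="','")
    number = TerminalDef('NUMBER', PatternStr('0'))  # no nice_print: falls back to the name

    print(comma.nice_print)   # ',' -- what error messages will show
    print(number.nice_print)  # NUMBER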
@@ -278,7 +279,7 @@ class Lexer(object):
 class TraditionalLexer(Lexer):
     def __init__(self, conf):
-        terminals = list(conf.tokens)
+        terminals = list(conf.terminals)
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals
         self.re = conf.re_module
@@ -347,7 +348,8 @@ class TraditionalLexer(Lexer):
                 if not allowed:
                     allowed = {"<END-OF-FILE>"}
                 raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
-                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
+                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
+                                           _all_terminals=self.terminals)
             value, type_ = res
@@ -386,14 +388,14 @@ class LexerState:
 class ContextualLexer(Lexer):
     def __init__(self, conf, states, always_accept=()):
-        terminals = list(conf.tokens)
+        terminals = list(conf.terminals)
         tokens_by_name = {}
         for t in terminals:
             assert t.name not in tokens_by_name, t
             tokens_by_name[t.name] = t
         trad_conf = copy(conf)
-        trad_conf.tokens = terminals
+        trad_conf.terminals = terminals
         lexer_by_tokens = {}
         self.lexers = {}
@@ -405,13 +407,13 @@ class ContextualLexer(Lexer):
             accepts = set(accepts) | set(conf.ignore) | set(always_accept)
             state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
             lexer_conf = copy(trad_conf)
-            lexer_conf.tokens = state_tokens
+            lexer_conf.terminals = state_tokens
             lexer = TraditionalLexer(lexer_conf)
             lexer_by_tokens[key] = lexer
             self.lexers[state] = lexer
-        assert trad_conf.tokens is terminals
+        assert trad_conf.terminals is terminals
         self.root_lexer = TraditionalLexer(trad_conf)
     def make_lexer_state(self, text):
| def make_lexer_state(self, text): | |||
| @@ -428,7 +430,7 @@ class ContextualLexer(Lexer): | |||
| # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. | |||
| # This tests the input against the global context, to provide a nicer error. | |||
| token = self.root_lexer.next_token(lexer_state) | |||
| raise UnexpectedToken(token, e.allowed, state=parser_state.position) | |||
| raise UnexpectedToken(token, e.allowed, state=parser_state.position, all_terminals=self.root_lexer.terminals) | |||
| class LexerThread: | |||
| "A thread that ties a lexer instance and a lexer state, to be used by the parser" | |||
@@ -5,7 +5,7 @@ import sys
 from copy import copy, deepcopy
 from io import open
-from .utils import bfs, eval_escaping, Py36, logger, classify_bool
+from .utils import bfs, eval_escaping, Py36, logger, classify_bool, isascii
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
 from .parse_tree_builder import ParseTreeBuilder
@@ -317,8 +317,11 @@ class PrepareAnonTerminals(Transformer_InPlace):
             raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)
         term_name = None
+        nice_print = None
         if isinstance(p, PatternStr):
+            nice_print = repr(value)  # This will always be ok, independent of what term_name we end up using
+            # TODO: potentially try to get the actual source code, and not the repr
             try:
                 # If already defined, use the user-defined terminal name
                 term_name = self.term_reverse[p].name
@@ -327,15 +330,14 @@ class PrepareAnonTerminals(Transformer_InPlace):
                 try:
                     term_name = _TERMINAL_NAMES[value]
                 except KeyError:
-                    if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
-                        with suppress(UnicodeEncodeError):
-                            value.upper().encode('ascii')  # Make sure we don't have unicode in our terminal names
-                            term_name = value.upper()
+                    if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set and isascii(value):
+                        term_name = value.upper()
                 if term_name in self.term_set:
                     term_name = None
         elif isinstance(p, PatternRE):
+            # TODO: generate nice_print
             if p in self.term_reverse:  # Kind of a weird placement
                 term_name = self.term_reverse[p].name
             else:
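
`isascii` replaces the encode-and-suppress dance with a single predicate. The diff doesn't show the helper's body; a plausible version-portable implementation (an assumption, not necessarily lark's exact code) is:

    import sys

    def isascii(s):
        # str.isascii exists on Python 3.7+; fall back to encoding on older versions.
        if sys.version_info >= (3, 7):
            return s.isascii()
        try:
            s.encode('ascii')
            return True
        except (UnicodeEncodeError, UnicodeDecodeError):
            return False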
@@ -348,7 +350,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
         if term_name not in self.term_set:
             assert p not in self.term_reverse
             self.term_set.add(term_name)
-            termdef = TerminalDef(term_name, p)
+            termdef = TerminalDef(term_name, p, nice_print=nice_print)
             self.term_reverse[p] = termdef
             self.terminals.append(termdef)
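
End to end, this is what the change buys in error messages. A hedged usage example (the exact output text depends on the lark version):

    from lark import Lark
    from lark.exceptions import UnexpectedInput

    parser = Lark('start: "a" "," "b"', parser='lalr')
    try:
        parser.parse('a;b')
    except UnexpectedInput as e:
        # With nice_print, the expected-terminals list can show the literal ','
        # rather than an internal name like COMMA or __ANON_0.
        print(e)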
@@ -6,6 +6,7 @@ from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
 from .tree import Tree
 from .common import LexerConf
+from .exceptions import UnexpectedInput
 try:
     import regex
 except ImportError:
@@ -135,7 +136,12 @@ class WithLexer(_ParserFrontend):
         return LexerThread(lexer, text)
     def parse(self, text, start=None):
-        return self._parse(start, self.make_lexer(text))
+        try:
+            return self._parse(start, self.make_lexer(text))
+        except UnexpectedInput as e:
+            if e._all_terminals is None:
+                e._all_terminals = self.lexer_conf.terminals
+            raise e
     def init_traditional_lexer(self):
         self.lexer = TraditionalLexer(self.lexer_conf)
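
Catching, annotating, and re-raising at the frontend is a standard way to attach context the inner layers don't have. A minimal sketch of the pattern (all names here are illustrative):

    class AppError(Exception):
        context = None  # class-level default, filled in lazily by outer layers

    def inner():
        raise AppError("low-level failure")

    def boundary():
        try:
            return inner()
        except AppError as e:
            if e.context is None:        # don't clobber context set deeper down
                e.context = "parsing stage"
            raise                        # re-raise the same exception object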
@@ -190,7 +196,7 @@ class Earley(WithLexer):
 class XEarley(_ParserFrontend):
     def __init__(self, lexer_conf, parser_conf, options=None, **kw):
-        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
+        self.token_by_name = {t.name:t for t in lexer_conf.terminals}
         self.start = parser_conf.start
         self._prepare_match(lexer_conf)
@@ -211,7 +217,7 @@ class XEarley(_ParserFrontend):
     def _prepare_match(self, lexer_conf):
         self.regexps = {}
-        for t in lexer_conf.tokens:
+        for t in lexer_conf.terminals:
             if t.priority != 1:
                 raise ValueError("Dynamic Earley doesn't support weights on terminals", t, t.priority)
             regexp = t.pattern.to_regexp()
@@ -228,7 +234,12 @@ class XEarley(_ParserFrontend):
             self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
     def parse(self, text, start):
-        return self._parse(start, text)
+        try:
+            return self._parse(start, text)
+        except UnexpectedInput as e:
+            if e._all_terminals is None:
+                e._all_terminals = self.token_by_name.values()
+            raise e
 class XEarley_CompleteLex(XEarley):
     def __init__(self, *args, **kw):
@@ -35,7 +35,7 @@ class LALR_Parser(object):
         return self.parser.parse(*args)
-class ParseConf:
+class ParseConf(object):
     __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'
     def __init__(self, parse_table, callbacks, start):
@@ -49,7 +49,7 @@ class ParseConf:
         self.start = start
-class ParserState:
+class ParserState(object):
     __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'
     def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):
@@ -117,7 +117,7 @@ class ParserState:
             if is_end and state_stack[-1] == end_state:
                 return value_stack[-1]
-class _Parser:
+class _Parser(object):
     def __init__(self, parse_table, callbacks, debug=False):
         self.parse_table = parse_table
         self.callbacks = callbacks
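
The `(object)` bases matter only under Python 2, where a plain `class C:` is an old-style class and `__slots__` is silently ignored. A small demonstration of the difference (runnable on both Python 2 and 3):

    class Old:                 # old-style under Python 2; new-style under Python 3
        __slots__ = ('x',)

    class New(object):         # new-style under both
        __slots__ = ('x',)

    o = Old()
    try:
        o.y = 1                # Python 2: allowed, __slots__ silently ignored
    except AttributeError:     # Python 3: raises, since __slots__ is enforced
        pass

    n = New()
    try:
        n.y = 1                # raises on both: no __dict__, only the declared slots
    except AttributeError:
        pass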
@@ -60,7 +60,7 @@ class ParserPuppet(object):
         Updated by ``feed_token()``.
         """
-        return self.parser_state.parse_table.states[self.parser_state.position]
+        return self.parser_state.parse_conf.parse_table.states[self.parser_state.position]
     def accepts(self):
         accepts = set()