From 605b91e4be761b6107f36f4c9f01f987aa07bd0e Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Tue, 20 Oct 2020 22:29:56 +0200 Subject: [PATCH 1/4] improve error message with token source. rename LexerConf.tokens to LexerConf.terminals Make Exception message generation lazy Made a few classes new-style --- lark/common.py | 13 +++++-- lark/exceptions.py | 75 +++++++++++++++++++++++++------------ lark/lark.py | 2 +- lark/lexer.py | 20 +++++----- lark/load_grammar.py | 14 ++++--- lark/parser_frontends.py | 19 ++++++++-- lark/parsers/lalr_parser.py | 6 +-- lark/parsers/lalr_puppet.py | 2 +- 8 files changed, 100 insertions(+), 51 deletions(-) diff --git a/lark/common.py b/lark/common.py index 714399a..ad6dbc2 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,14 +1,16 @@ +from warnings import warn + from .utils import Serialize from .lexer import TerminalDef ###{standalone class LexerConf(Serialize): - __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes' + __serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes' __serialize_namespace__ = TerminalDef, - def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): - self.tokens = tokens # TODO should be terminals + def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): + self.terminals = terminals self.ignore = ignore self.postlex = postlex self.callbacks = callbacks or {} @@ -16,6 +18,11 @@ class LexerConf(Serialize): self.re_module = re_module self.skip_validation = skip_validation self.use_bytes = use_bytes + + @property + def tokens(self): + warn("LexerConf.tokens is deprecated. 
Use LexerConf.terminals instead", DeprecationWarning) + return self.terminals ###} diff --git a/lark/exceptions.py b/lark/exceptions.py index 79629e6..ec7e729 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -34,6 +34,7 @@ class UnexpectedInput(LarkError): After catching one of these exceptions, you may call the following helper methods to create a nicer error message. """ pos_in_stream = None + _all_terminals = None def get_context(self, text, span=40): """Returns a pretty string pinpointing the error in the text, @@ -109,32 +110,54 @@ class UnexpectedInput(LarkError): candidate = label, False return candidate[0] + + def _format_terminals(self, names): + if self._all_terminals: + t = [] + for name in names: + try: + t.append(next(t.nice_print for t in self._all_terminals if t.name == name)) + except StopIteration: + # If we don't find the corresponding Terminal (which *should* never happen), don't error. + # Broken __str__ for Exception are some of the worst bugs + t.append(t.display_name) + else: + t = names + return "Expected one of: \n\t* %s\n" % '\n\t* '.join(t) + class UnexpectedCharacters(LexError, UnexpectedInput): - def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, _all_terminals=None): self.line = line self.column = column self.pos_in_stream = lex_pos self.state = state + self._all_terminals = _all_terminals self.allowed = allowed self.considered_tokens = considered_tokens + self.token_history = token_history if isinstance(seq, bytes): - _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") + self._s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") else: - _s = seq[lex_pos] - - message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column) - message += '\n\n' + self.get_context(seq) - if allowed: - message += '\nExpecting: 
%s\n' % allowed - if token_history: - message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history) - - super(UnexpectedCharacters, self).__init__(message) - + self._s = seq[lex_pos] + self._context = self.get_context(seq) + + super(UnexpectedCharacters, self).__init__() + + def __str__(self): + # Be aware: Broken __str__ for Exceptions are terrible to debug. Make sure there is as little room as possible for errors + # You will get just `UnexpectedCharacters: ` or something like that + # If you run into this, add an `except Exception as e: print(e); raise e` or similar. + message = "No terminal defined for '%s' at line %d col %d" % (self._s, self.line, self.column) + message += '\n\n' + self._context + if self.allowed: + message += self._format_terminals(self.allowed) + if self.token_history: + message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history) + return message class UnexpectedToken(ParseError, UnexpectedInput): """When the parser throws UnexpectedToken, it instantiates a puppet @@ -143,7 +166,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): see: :ref:`ParserPuppet`. """ - def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, all_terminals=None): self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') self.pos_in_stream = getattr(token, 'pos_in_stream', None) @@ -153,16 +176,20 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.expected = expected # XXX deprecate? 
`accepts` is better self.considered_rules = considered_rules self.puppet = puppet - - # TODO Only calculate `accepts()` when we need to display it to the user - # This will improve performance when doing automatic error handling - self.accepts = puppet and puppet.accepts() - - message = ("Unexpected token %r at line %s, column %s.\n" - "Expected one of: \n\t* %s\n" - % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) - - super(UnexpectedToken, self).__init__(message) + self._all_terminals = all_terminals + + + super(UnexpectedToken, self).__init__() + + @property + def accepts(self): + return self.puppet and self.puppet.accepts() + + def __str__(self): + # Be aware: Broken __str__ for Exceptions are terrible to debug. Make sure there is as little room as possible for errors + message = ("Unexpected token %r at line %s, column %s.\n%s" + % (self.token, self.line, self.column, self._format_terminals(self.accepts or self.expected))) + return message class VisitError(LarkError): diff --git a/lark/lark.py b/lark/lark.py index bc34eb4..f3fa8dc 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -402,7 +402,7 @@ class Lark(Serialize): self._callbacks, self.options, # Not all, but multiple attributes are used ) - self.terminals = self.parser.lexer_conf.tokens + self.terminals = self.parser.lexer_conf.terminals self._terminals_dict = {t.name: t for t in self.terminals} return self diff --git a/lark/lexer.py b/lark/lexer.py index b080921..e379021 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -76,14 +76,15 @@ class PatternRE(Pattern): class TerminalDef(Serialize): - __serialize_fields__ = 'name', 'pattern', 'priority' + __serialize_fields__ = 'name', 'pattern', 'priority', 'nice_print' __serialize_namespace__ = PatternStr, PatternRE - def __init__(self, name, pattern, priority=1): + def __init__(self, name, pattern, priority=1, nice_print=None): assert isinstance(pattern, Pattern), pattern self.name = name self.pattern = pattern self.priority = 
priority + self.nice_print = nice_print or name def __repr__(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) @@ -278,7 +279,7 @@ class Lexer(object): class TraditionalLexer(Lexer): def __init__(self, conf): - terminals = list(conf.tokens) + terminals = list(conf.terminals) assert all(isinstance(t, TerminalDef) for t in terminals), terminals self.re = conf.re_module @@ -347,7 +348,8 @@ class TraditionalLexer(Lexer): if not allowed: allowed = {""} raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, - allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token]) + allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], + _all_terminals=self.terminals) value, type_ = res @@ -386,14 +388,14 @@ class LexerState: class ContextualLexer(Lexer): def __init__(self, conf, states, always_accept=()): - terminals = list(conf.tokens) + terminals = list(conf.terminals) tokens_by_name = {} for t in terminals: assert t.name not in tokens_by_name, t tokens_by_name[t.name] = t trad_conf = copy(conf) - trad_conf.tokens = terminals + trad_conf.terminals = terminals lexer_by_tokens = {} self.lexers = {} @@ -405,13 +407,13 @@ class ContextualLexer(Lexer): accepts = set(accepts) | set(conf.ignore) | set(always_accept) state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] lexer_conf = copy(trad_conf) - lexer_conf.tokens = state_tokens + lexer_conf.terminals = state_tokens lexer = TraditionalLexer(lexer_conf) lexer_by_tokens[key] = lexer self.lexers[state] = lexer - assert trad_conf.tokens is terminals + assert trad_conf.terminals is terminals self.root_lexer = TraditionalLexer(trad_conf) def make_lexer_state(self, text): @@ -428,7 +430,7 @@ class ContextualLexer(Lexer): # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. 
# This tests the input against the global context, to provide a nicer error. token = self.root_lexer.next_token(lexer_state) - raise UnexpectedToken(token, e.allowed, state=parser_state.position) + raise UnexpectedToken(token, e.allowed, state=parser_state.position, all_terminals=self.root_lexer.terminals) class LexerThread: "A thread that ties a lexer instance and a lexer state, to be used by the parser" diff --git a/lark/load_grammar.py b/lark/load_grammar.py index d039638..8bb4198 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -5,7 +5,7 @@ import sys from copy import copy, deepcopy from io import open -from .utils import bfs, eval_escaping, Py36, logger, classify_bool +from .utils import bfs, eval_escaping, Py36, logger, classify_bool, isascii from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -317,8 +317,11 @@ class PrepareAnonTerminals(Transformer_InPlace): raise GrammarError(u'Conflicting flags for the same terminal: %s' % p) term_name = None + nice_print = None if isinstance(p, PatternStr): + nice_print = repr(value) # This will always be ok, independent of what term_name we end up using + # TODO: potentially try to get the actual source code, and not the repr try: # If already defined, use the user-defined terminal name term_name = self.term_reverse[p].name @@ -327,15 +330,14 @@ class PrepareAnonTerminals(Transformer_InPlace): try: term_name = _TERMINAL_NAMES[value] except KeyError: - if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set: - with suppress(UnicodeEncodeError): - value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names - term_name = value.upper() + if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set and isascii(value): + term_name = value.upper() if term_name in self.term_set: term_name = None elif isinstance(p, PatternRE): + #TODO: generate nice_print if p in self.term_reverse: # 
Kind of a weird placement.name term_name = self.term_reverse[p].name else: @@ -348,7 +350,7 @@ class PrepareAnonTerminals(Transformer_InPlace): if term_name not in self.term_set: assert p not in self.term_reverse self.term_set.add(term_name) - termdef = TerminalDef(term_name, p) + termdef = TerminalDef(term_name, p, nice_print=nice_print) self.term_reverse[p] = termdef self.terminals.append(termdef) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 202382b..5bcacf0 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -6,6 +6,7 @@ from .parsers.lalr_parser import LALR_Parser from .grammar import Rule from .tree import Tree from .common import LexerConf +from .exceptions import UnexpectedInput try: import regex except ImportError: @@ -135,7 +136,12 @@ class WithLexer(_ParserFrontend): return LexerThread(lexer, text) def parse(self, text, start=None): - return self._parse(start, self.make_lexer(text)) + try: + return self._parse(start, self.make_lexer(text)) + except UnexpectedInput as e: + if e._all_terminals is None: + e._all_terminals = self.lexer_conf.terminals + raise e def init_traditional_lexer(self): self.lexer = TraditionalLexer(self.lexer_conf) @@ -190,7 +196,7 @@ class Earley(WithLexer): class XEarley(_ParserFrontend): def __init__(self, lexer_conf, parser_conf, options=None, **kw): - self.token_by_name = {t.name:t for t in lexer_conf.tokens} + self.token_by_name = {t.name:t for t in lexer_conf.terminals} self.start = parser_conf.start self._prepare_match(lexer_conf) @@ -211,7 +217,7 @@ class XEarley(_ParserFrontend): def _prepare_match(self, lexer_conf): self.regexps = {} - for t in lexer_conf.tokens: + for t in lexer_conf.terminals: if t.priority != 1: raise ValueError("Dynamic Earley doesn't support weights on terminals", t, t.priority) regexp = t.pattern.to_regexp() @@ -228,7 +234,12 @@ class XEarley(_ParserFrontend): self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags) def 
parse(self, text, start): - return self._parse(start, text) + try: + return self._parse(start, text) + except UnexpectedInput as e: + if e._all_terminals is None: + e._all_terminals = self.token_by_name.values() + raise e class XEarley_CompleteLex(XEarley): def __init__(self, *args, **kw): diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 4fa911f..99b3672 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -35,7 +35,7 @@ class LALR_Parser(object): return self.parser.parse(*args) -class ParseConf: +class ParseConf(object): __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states' def __init__(self, parse_table, callbacks, start): @@ -49,7 +49,7 @@ class ParseConf: self.start = start -class ParserState: +class ParserState(object): __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack' def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None): @@ -117,7 +117,7 @@ class ParserState: if is_end and state_stack[-1] == end_state: return value_stack[-1] -class _Parser: +class _Parser(object): def __init__(self, parse_table, callbacks, debug=False): self.parse_table = parse_table self.callbacks = callbacks diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 95ee3a3..e1496f9 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -60,7 +60,7 @@ class ParserPuppet(object): Updated by ``feed_token()``. 
""" - return self.parser_state.parse_table.states[self.parser_state.position] + return self.parser_state.parse_conf.parse_table.states[self.parser_state.position] def accepts(self): accepts = set() From bc9ed5376db56b178d7586b7739141ac38fa3040 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Thu, 22 Oct 2020 15:27:32 +0200 Subject: [PATCH 2/4] made error message contain actual source code make _all_terminals a dict added raw attribute to Pattern rename nice_print -> user_repr --- lark/exceptions.py | 12 +++++++----- lark/lexer.py | 9 +++++---- lark/load_grammar.py | 11 ++++------- lark/parser_frontends.py | 4 ++-- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index ec7e729..8bcc855 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -113,17 +113,19 @@ class UnexpectedInput(LarkError): def _format_terminals(self, names): if self._all_terminals: - t = [] + if isinstance(self._all_terminals, list): + self._all_terminals = {t.name: t for t in self._all_terminals} + ts = [] for name in names: try: - t.append(next(t.nice_print for t in self._all_terminals if t.name == name)) + ts.append(self._all_terminals[name].user_repr) except StopIteration: # If we don't find the corresponding Terminal (which *should* never happen), don't error. 
# Broken __str__ for Exception are some of the worst bugs - t.append(t.display_name) + ts.append(name) else: - t = names - return "Expected one of: \n\t* %s\n" % '\n\t* '.join(t) + ts = names + return "Expected one of: \n\t* %s\n" % '\n\t* '.join(ts) diff --git a/lark/lexer.py b/lark/lexer.py index e379021..3a3a42e 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -10,9 +10,10 @@ from copy import copy class Pattern(Serialize): - def __init__(self, value, flags=()): + def __init__(self, value, flags=(), raw=None): self.value = value self.flags = frozenset(flags) + self.raw = raw def __repr__(self): return repr(self.to_regexp()) @@ -76,15 +77,15 @@ class PatternRE(Pattern): class TerminalDef(Serialize): - __serialize_fields__ = 'name', 'pattern', 'priority', 'nice_print' + __serialize_fields__ = 'name', 'pattern', 'priority', 'user_repr' __serialize_namespace__ = PatternStr, PatternRE - def __init__(self, name, pattern, priority=1, nice_print=None): + def __init__(self, name, pattern, priority=1, user_repr=None): assert isinstance(pattern, Pattern), pattern self.name = name self.pattern = pattern self.priority = priority - self.nice_print = nice_print or name + self.user_repr = user_repr or name def __repr__(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 8bb4198..413f921 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -317,11 +317,9 @@ class PrepareAnonTerminals(Transformer_InPlace): raise GrammarError(u'Conflicting flags for the same terminal: %s' % p) term_name = None - nice_print = None + user_repr = p.raw # This will always be ok, independent of what term_name we end up using if isinstance(p, PatternStr): - nice_print = repr(value) # This will always be ok, independent of what term_name we end up using - # TODO: potentially try to get the actual source code, and not the repr try: # If already defined, use the user-defined terminal name term_name = 
self.term_reverse[p].name @@ -337,7 +335,6 @@ class PrepareAnonTerminals(Transformer_InPlace): term_name = None elif isinstance(p, PatternRE): - #TODO: generate nice_print if p in self.term_reverse: # Kind of a weird placement.name term_name = self.term_reverse[p].name else: @@ -350,7 +347,7 @@ class PrepareAnonTerminals(Transformer_InPlace): if term_name not in self.term_set: assert p not in self.term_reverse self.term_set.add(term_name) - termdef = TerminalDef(term_name, p, nice_print=nice_print) + termdef = TerminalDef(term_name, p, user_repr=user_repr) self.term_reverse[p] = termdef self.terminals.append(termdef) @@ -426,9 +423,9 @@ def _literal_to_pattern(literal): if literal.type == 'STRING': s = s.replace('\\\\', '\\') - return PatternStr(s, flags) + return PatternStr(s, flags, raw=literal.value) elif literal.type == 'REGEXP': - return PatternRE(s, flags) + return PatternRE(s, flags, raw=literal.value) else: assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 5bcacf0..739f9b5 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -196,7 +196,7 @@ class Earley(WithLexer): class XEarley(_ParserFrontend): def __init__(self, lexer_conf, parser_conf, options=None, **kw): - self.token_by_name = {t.name:t for t in lexer_conf.terminals} + self.terminals_by_name = {t.name:t for t in lexer_conf.terminals} self.start = parser_conf.start self._prepare_match(lexer_conf) @@ -238,7 +238,7 @@ class XEarley(_ParserFrontend): return self._parse(start, text) except UnexpectedInput as e: if e._all_terminals is None: - e._all_terminals = self.token_by_name.values() + e._all_terminals = self.terminals_by_name raise e class XEarley_CompleteLex(XEarley): From 8ec6d0f2abce681d9297891e86a68ce5b7acc97d Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Fri, 27 Nov 2020 13:15:34 +0100 Subject: [PATCH 3/4] Correction for PR - `user_repr` is now a method - Fix for python 2.7 - 
excepts -> expected --- lark-stubs/lexer.pyi | 3 ++ lark/common.py | 5 +++ lark/exceptions.py | 79 +++++++++++++++++++++--------------------- lark/lexer.py | 28 +++++++++------ lark/load_grammar.py | 3 +- lark/parsers/earley.py | 2 +- 6 files changed, 68 insertions(+), 52 deletions(-) diff --git a/lark-stubs/lexer.pyi b/lark-stubs/lexer.pyi index 3f246fb..6b4771a 100644 --- a/lark-stubs/lexer.pyi +++ b/lark-stubs/lexer.pyi @@ -12,6 +12,7 @@ _T = TypeVar('_T') class Pattern(ABC): value: str flags: Collection[str] + raw: str def __init__(self, value: str, flags: Collection[str] = ...): ... @@ -73,6 +74,8 @@ class TerminalDef: def __init__(self, name: str, pattern: Pattern, priority: int = ...): ... + + def user_repr(self) -> str: ... class Token(str): diff --git a/lark/common.py b/lark/common.py index 54b33df..30b92eb 100644 --- a/lark/common.py +++ b/lark/common.py @@ -12,6 +12,8 @@ class LexerConf(Serialize): def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): self.terminals = terminals + self.terminals_by_names = {t.name: t for t in self.terminals} + assert len(self.terminals) == len(self.terminals_by_names) self.ignore = ignore self.postlex = postlex self.callbacks = callbacks or {} @@ -25,6 +27,9 @@ class LexerConf(Serialize): def tokens(self): warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning) return self.terminals + + def _deserialize(self): + self.terminals_by_names = {t.name: t for t in self.terminals} diff --git a/lark/exceptions.py b/lark/exceptions.py index bf6546f..faae832 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,5 +1,6 @@ from .utils import STRING_TYPE, logger + ###{standalone @@ -39,7 +40,7 @@ class UnexpectedInput(LarkError): After catching one of these exceptions, you may call the following helper methods to create a nicer error message. 
""" pos_in_stream = None - _all_terminals = None + _terminals_by_name = None def get_context(self, text, span=40): """Returns a pretty string pinpointing the error in the text, @@ -96,7 +97,7 @@ class UnexpectedInput(LarkError): if ut.state == self.state: if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts: logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % - (self.state, self.accepts, ut.accepts, i, j)) + (self.state, self.accepts, ut.accepts, i, j)) continue try: if ut.token == self.token: # Try exact match first @@ -116,71 +117,69 @@ class UnexpectedInput(LarkError): candidate = label, False return candidate[0] - - def _format_terminals(self, names): - if self._all_terminals: - if isinstance(self._all_terminals, list): - self._all_terminals = {t.name: t for t in self._all_terminals} + + def _format_expected(self, expected): + if self._terminals_by_name: ts = [] - for name in names: - try: - ts.append(self._all_terminals[name].user_repr) - except StopIteration: - # If we don't find the corresponding Terminal (which *should* never happen), don't error. - # Broken __str__ for Exception are some of the worst bugs - ts.append(name) + for ter in expected: + ts.append(self._terminals_by_name[ter].user_repr()) else: - ts = names + ts = expected return "Expected one of: \n\t* %s\n" % '\n\t* '.join(ts) + class UnexpectedEOF(ParseError, UnexpectedInput): - def __init__(self, expected, state=None): + def __init__(self, expected, state=None, terminals_by_name=None): self.expected = expected self.state = state from .lexer import Token - self.token = Token("", "") #, line=-1, column=-1, pos_in_stream=-1) + self.token = Token("", "") # , line=-1, column=-1, pos_in_stream=-1) self.pos_in_stream = -1 self.line = -1 self.column = -1 + self._terminals_by_name = terminals_by_name - message = ("Unexpected end-of-input. 
Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected)) - super(UnexpectedEOF, self).__init__(message) + super(UnexpectedEOF, self).__init__() + def __str__(self): + message = "Unexpected end-of-input. " + message += self._format_expected(self.expected) + return message class UnexpectedCharacters(LexError, UnexpectedInput): - def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, _all_terminals=None): + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, + terminals_by_name=None): # TODO considered_tokens and allowed can be figured out using state self.line = line self.column = column self.pos_in_stream = lex_pos self.state = state - self._all_terminals = _all_terminals + self._terminals_by_name = terminals_by_name self.allowed = allowed self.considered_tokens = considered_tokens self.token_history = token_history if isinstance(seq, bytes): - self._s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") + self.char = seq[lex_pos:lex_pos + 1].decode("ascii", "backslashreplace") else: - self._s = seq[lex_pos] + self.char = seq[lex_pos] self._context = self.get_context(seq) - + super(UnexpectedCharacters, self).__init__() def __str__(self): - # Be aware: Broken __str__ for Exceptions are terrible to debug. Make sure there is as little room as possible for errors - # You will get just `UnexpectedCharacters: ` or something like that - # If you run into this, add an `except Exception as e: print(e); raise e` or similar. 
- message = "No terminal defined for '%s' at line %d col %d" % (self._s, self.line, self.column) + message = "No terminal defined for '%s' at line %d col %d" % (self.char, self.line, self.column) message += '\n\n' + self._context if self.allowed: - message += self._format_terminals(self.allowed) + message += self._format_expected(self.allowed) if self.token_history: message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history) return message +_not_set_marker = object() + class UnexpectedToken(ParseError, UnexpectedInput): """When the parser throws UnexpectedToken, it instantiates a puppet with its internal state. Users can then interactively set the puppet to @@ -188,7 +187,8 @@ class UnexpectedToken(ParseError, UnexpectedInput): see: :ref:`ParserPuppet`. """ - def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, all_terminals=None, token_history=None): + + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, terminals_by_name=None, token_history=None): # TODO considered_rules and expected can be figured out using state self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') @@ -196,23 +196,24 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.state = state self.token = token - self.expected = expected # XXX deprecate? `accepts` is better + self.expected = expected # XXX deprecate? `accepts` is better + self._accepts = _not_set_marker self.considered_rules = considered_rules self.puppet = puppet - self._all_terminals = all_terminals + self._terminals_by_name = terminals_by_name self.token_history = token_history - super(UnexpectedToken, self).__init__() - + @property def accepts(self): - return self.puppet and self.puppet.accepts() - + if self._accepts is _not_set_marker: + self._accepts = self.puppet and self.puppet.accepts() + return self._accepts + def __str__(self): - # Be aware: Broken __str__ for Exceptions are terrible to debug. 
Make sure there is as little room as possible for errors message = ("Unexpected token %r at line %s, column %s.\n%s" - % (self.token, self.line, self.column, self._format_terminals(self.accepts or self.expected))) + % (self.token, self.line, self.column, self._format_expected(self.accepts or self.expected))) if self.token_history: message += "Previous tokens: %r\n" % self.token_history @@ -226,6 +227,7 @@ class VisitError(LarkError): - obj: the tree node or token it was processing when the exception was raised - orig_exc: the exception that cause it to fail """ + def __init__(self, rule, obj, orig_exc): self.obj = obj self.orig_exc = orig_exc @@ -233,5 +235,4 @@ class VisitError(LarkError): message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) - ###} diff --git a/lark/lexer.py b/lark/lexer.py index 43176ac..c089e8a 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -81,18 +81,23 @@ class PatternRE(Pattern): class TerminalDef(Serialize): - __serialize_fields__ = 'name', 'pattern', 'priority', 'user_repr' + __serialize_fields__ = 'name', 'pattern', 'priority' __serialize_namespace__ = PatternStr, PatternRE - def __init__(self, name, pattern, priority=1, user_repr=None): + def __init__(self, name, pattern, priority=1): assert isinstance(pattern, Pattern), pattern self.name = name self.pattern = pattern self.priority = priority - self.user_repr = user_repr or name def __repr__(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) + + def user_repr(self): + if self.name.startswith('__'): # We represent a generated terminal + return self.pattern.raw or self.name + else: + return self.name class Token(Str): @@ -312,6 +317,7 @@ class TraditionalLexer(Lexer): self.user_callbacks = conf.callbacks self.g_regex_flags = conf.g_regex_flags self.use_bytes = conf.use_bytes + self.terminals_by_names = conf.terminals_by_names self._mres = None @@ -355,7 +361,7 @@ class TraditionalLexer(Lexer): allowed = 
{""} raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], - state=parser_state, _all_terminals=self.terminals) + state=parser_state, terminals_by_name=self.terminals_by_names) value, type_ = res @@ -397,10 +403,7 @@ class ContextualLexer(Lexer): def __init__(self, conf, states, always_accept=()): terminals = list(conf.terminals) - tokens_by_name = {} - for t in terminals: - assert t.name not in tokens_by_name, t - tokens_by_name[t.name] = t + tokens_by_name = conf.terminals_by_names trad_conf = copy(conf) trad_conf.terminals = terminals @@ -437,8 +440,13 @@ class ContextualLexer(Lexer): except UnexpectedCharacters as e: # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. # This tests the input against the global context, to provide a nicer error. - token = self.root_lexer.next_token(lexer_state) - raise UnexpectedToken(token, e.allowed, state=parser_state.position, token_history=[lexer_state.last_token], all_terminals=self.root_lexer.terminals) + last_token = lexer_state.last_token # self.root_lexer.next_token will change this to the wrong token + try: + token = self.root_lexer.next_token(lexer_state, parser_state) + except UnexpectedCharacters: + raise e# Don't raise the exception that the root lexer raise. It has the wrong expected set. 
+ else: + raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_names) class LexerThread: """A thread that ties a lexer instance and a lexer state, to be used by the parser""" diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 9f6bf2e..a07769f 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -322,7 +322,6 @@ class PrepareAnonTerminals(Transformer_InPlace): raise GrammarError(u'Conflicting flags for the same terminal: %s' % p) term_name = None - user_repr = p.raw # This will always be ok, independent of what term_name we end up using if isinstance(p, PatternStr): try: @@ -354,7 +353,7 @@ class PrepareAnonTerminals(Transformer_InPlace): if term_name not in self.term_set: assert p not in self.term_reverse self.term_set.add(term_name) - termdef = TerminalDef(term_name, p, user_repr=user_repr) + termdef = TerminalDef(term_name, p) self.term_reverse[p] = termdef self.terminals.append(termdef) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 320b59a..3f537c2 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -302,7 +302,7 @@ class Parser: # this column. Find the item for the start_symbol, which is the root of the SPPF tree. 
solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] if not solutions: - expected_terminals = [t.expect for t in to_scan] + expected_terminals = [t.expect.name for t in to_scan] raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan)) if self.debug: From aa7dc19bc343d211d1f5995680cd27d02188f8af Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 29 Nov 2020 13:28:02 +0200 Subject: [PATCH 4/4] Corrections for PR --- lark/common.py | 10 +++++----- lark/exceptions.py | 17 ++++++----------- lark/lexer.py | 18 ++++++++---------- lark/utils.py | 1 + 4 files changed, 20 insertions(+), 26 deletions(-) diff --git a/lark/common.py b/lark/common.py index 30b92eb..467acf8 100644 --- a/lark/common.py +++ b/lark/common.py @@ -12,8 +12,8 @@ class LexerConf(Serialize): def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): self.terminals = terminals - self.terminals_by_names = {t.name: t for t in self.terminals} - assert len(self.terminals) == len(self.terminals_by_names) + self.terminals_by_name = {t.name: t for t in self.terminals} + assert len(self.terminals) == len(self.terminals_by_name) self.ignore = ignore self.postlex = postlex self.callbacks = callbacks or {} @@ -22,14 +22,14 @@ class LexerConf(Serialize): self.skip_validation = skip_validation self.use_bytes = use_bytes self.lexer_type = None - + @property def tokens(self): warn("LexerConf.tokens is deprecated. 
Use LexerConf.terminals instead", DeprecationWarning) return self.terminals - + def _deserialize(self): - self.terminals_by_names = {t.name: t for t in self.terminals} + self.terminals_by_name = {t.name: t for t in self.terminals} diff --git a/lark/exceptions.py b/lark/exceptions.py index faae832..23e78b9 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,4 +1,4 @@ -from .utils import STRING_TYPE, logger +from .utils import STRING_TYPE, logger, NO_VALUE ###{standalone @@ -120,12 +120,8 @@ class UnexpectedInput(LarkError): def _format_expected(self, expected): if self._terminals_by_name: - ts = [] - for ter in expected: - ts.append(self._terminals_by_name[ter].user_repr()) - else: - ts = expected - return "Expected one of: \n\t* %s\n" % '\n\t* '.join(ts) + expected = [self._terminals_by_name[t_name].user_repr() for t_name in expected] + return "Expected one of: \n\t* %s\n" % '\n\t* '.join(expected) class UnexpectedEOF(ParseError, UnexpectedInput): @@ -178,7 +174,6 @@ class UnexpectedCharacters(LexError, UnexpectedInput): message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history) return message -_not_set_marker = object() class UnexpectedToken(ParseError, UnexpectedInput): """When the parser throws UnexpectedToken, it instantiates a puppet @@ -197,7 +192,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.token = token self.expected = expected # XXX deprecate? 
`accepts` is better - self._accepts = _not_set_marker + self._accepts = NO_VALUE self.considered_rules = considered_rules self.puppet = puppet self._terminals_by_name = terminals_by_name @@ -207,8 +202,8 @@ class UnexpectedToken(ParseError, UnexpectedInput): @property def accepts(self): - if self._accepts is _not_set_marker: - self._accepts = self.puppet and self.puppet.accepts() + if self._accepts is NO_VALUE: + self._accepts = self.puppet and self.puppet.accepts() return self._accepts def __str__(self): diff --git a/lark/lexer.py b/lark/lexer.py index c089e8a..114b4ce 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -92,7 +92,7 @@ class TerminalDef(Serialize): def __repr__(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) - + def user_repr(self): if self.name.startswith('__'): # We represent a generated terminal return self.pattern.raw or self.name @@ -317,7 +317,7 @@ class TraditionalLexer(Lexer): self.user_callbacks = conf.callbacks self.g_regex_flags = conf.g_regex_flags self.use_bytes = conf.use_bytes - self.terminals_by_names = conf.terminals_by_names + self.terminals_by_name = conf.terminals_by_name self._mres = None @@ -361,7 +361,7 @@ class TraditionalLexer(Lexer): allowed = {""} raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], - state=parser_state, terminals_by_name=self.terminals_by_names) + state=parser_state, terminals_by_name=self.terminals_by_name) value, type_ = res @@ -403,7 +403,7 @@ class ContextualLexer(Lexer): def __init__(self, conf, states, always_accept=()): terminals = list(conf.terminals) - tokens_by_name = conf.terminals_by_names + terminals_by_name = conf.terminals_by_name trad_conf = copy(conf) trad_conf.terminals = terminals @@ -416,9 +416,8 @@ class ContextualLexer(Lexer): lexer = lexer_by_tokens[key] except KeyError: accepts = set(accepts) | set(conf.ignore) | set(always_accept) - 
state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] lexer_conf = copy(trad_conf) - lexer_conf.terminals = state_tokens + lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name] lexer = TraditionalLexer(lexer_conf) lexer_by_tokens[key] = lexer @@ -440,13 +439,12 @@ class ContextualLexer(Lexer): except UnexpectedCharacters as e: # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. # This tests the input against the global context, to provide a nicer error. - last_token = lexer_state.last_token # self.root_lexer.next_token will change this to the wrong token try: + last_token = lexer_state.last_token # Save last_token. Calling root_lexer.next_token will change this to the wrong token token = self.root_lexer.next_token(lexer_state, parser_state) + raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_name) except UnexpectedCharacters: - raise e# Don't raise the exception that the root lexer raise. It has the wrong expected set. - else: - raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_names) + raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set. class LexerThread: """A thread that ties a lexer instance and a lexer state, to be used by the parser""" diff --git a/lark/utils.py b/lark/utils.py index 3b5b8a8..642a59f 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -13,6 +13,7 @@ logger.setLevel(logging.CRITICAL) Py36 = (sys.version_info[:2] >= (3, 6)) +NO_VALUE = object() def classify(seq, key=None, value=None): d = {}