diff --git a/lark-stubs/lexer.pyi b/lark-stubs/lexer.pyi index 3f246fb..6b4771a 100644 --- a/lark-stubs/lexer.pyi +++ b/lark-stubs/lexer.pyi @@ -12,6 +12,7 @@ _T = TypeVar('_T') class Pattern(ABC): value: str flags: Collection[str] + raw: str def __init__(self, value: str, flags: Collection[str] = ...): ... @@ -73,6 +74,8 @@ class TerminalDef: def __init__(self, name: str, pattern: Pattern, priority: int = ...): ... + + def user_repr(self) -> str: ... class Token(str): diff --git a/lark/common.py b/lark/common.py index 54b33df..30b92eb 100644 --- a/lark/common.py +++ b/lark/common.py @@ -12,6 +12,8 @@ class LexerConf(Serialize): def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): self.terminals = terminals + self.terminals_by_names = {t.name: t for t in self.terminals} + assert len(self.terminals) == len(self.terminals_by_names) self.ignore = ignore self.postlex = postlex self.callbacks = callbacks or {} @@ -25,6 +27,9 @@ class LexerConf(Serialize): def tokens(self): warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning) return self.terminals + + def _deserialize(self): + self.terminals_by_names = {t.name: t for t in self.terminals} diff --git a/lark/exceptions.py b/lark/exceptions.py index bf6546f..faae832 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,5 +1,6 @@ from .utils import STRING_TYPE, logger + ###{standalone @@ -39,7 +40,7 @@ class UnexpectedInput(LarkError): After catching one of these exceptions, you may call the following helper methods to create a nicer error message. """ pos_in_stream = None - _all_terminals = None + _terminals_by_name = None def get_context(self, text, span=40): """Returns a pretty string pinpointing the error in the text, @@ -96,7 +97,7 @@ class UnexpectedInput(LarkError): if ut.state == self.state: if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts: logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % - (self.state, self.accepts, ut.accepts, i, j)) + (self.state, self.accepts, ut.accepts, i, j)) continue try: if ut.token == self.token: # Try exact match first @@ -116,71 +117,69 @@ class UnexpectedInput(LarkError): candidate = label, False return candidate[0] - - def _format_terminals(self, names): - if self._all_terminals: - if isinstance(self._all_terminals, list): - self._all_terminals = {t.name: t for t in self._all_terminals} + + def _format_expected(self, expected): + if self._terminals_by_name: ts = [] - for name in names: - try: - ts.append(self._all_terminals[name].user_repr) - except StopIteration: - # If we don't find the corresponding Terminal (which *should* never happen), don't error. - # Broken __str__ for Exception are some of the worst bugs - ts.append(name) + for ter in expected: + ts.append(self._terminals_by_name[ter].user_repr()) else: - ts = names + ts = expected return "Expected one of: \n\t* %s\n" % '\n\t* '.join(ts) + class UnexpectedEOF(ParseError, UnexpectedInput): - def __init__(self, expected, state=None): + def __init__(self, expected, state=None, terminals_by_name=None): self.expected = expected self.state = state from .lexer import Token - self.token = Token("", "") #, line=-1, column=-1, pos_in_stream=-1) + self.token = Token("", "") # , line=-1, column=-1, pos_in_stream=-1) self.pos_in_stream = -1 self.line = -1 self.column = -1 + self._terminals_by_name = terminals_by_name - message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected)) - super(UnexpectedEOF, self).__init__(message) + super(UnexpectedEOF, self).__init__() + def __str__(self): + message = "Unexpected end-of-input. " + message += self._format_expected(self.expected) + return message class UnexpectedCharacters(LexError, UnexpectedInput): - def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, _all_terminals=None): + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, + terminals_by_name=None): # TODO considered_tokens and allowed can be figured out using state self.line = line self.column = column self.pos_in_stream = lex_pos self.state = state - self._all_terminals = _all_terminals + self._terminals_by_name = terminals_by_name self.allowed = allowed self.considered_tokens = considered_tokens self.token_history = token_history if isinstance(seq, bytes): - self._s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") + self.char = seq[lex_pos:lex_pos + 1].decode("ascii", "backslashreplace") else: - self._s = seq[lex_pos] + self.char = seq[lex_pos] self._context = self.get_context(seq) - + super(UnexpectedCharacters, self).__init__() def __str__(self): - # Be aware: Broken __str__ for Exceptions are terrible to debug. Make sure there is as little room as possible for errors - # You will get just `UnexpectedCharacters: ` or something like that - # If you run into this, add an `except Exception as e: print(e); raise e` or similar. - message = "No terminal defined for '%s' at line %d col %d" % (self._s, self.line, self.column) + message = "No terminal defined for '%s' at line %d col %d" % (self.char, self.line, self.column) message += '\n\n' + self._context if self.allowed: - message += self._format_terminals(self.allowed) + message += self._format_expected(self.allowed) if self.token_history: message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history) return message +_not_set_marker = object() + class UnexpectedToken(ParseError, UnexpectedInput): """When the parser throws UnexpectedToken, it instantiates a puppet with its internal state. Users can then interactively set the puppet to @@ -188,7 +187,8 @@ class UnexpectedToken(ParseError, UnexpectedInput): see: :ref:`ParserPuppet`. """ - def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, all_terminals=None, token_history=None): + + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, terminals_by_name=None, token_history=None): # TODO considered_rules and expected can be figured out using state self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') @@ -196,23 +196,24 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.state = state self.token = token - self.expected = expected # XXX deprecate? `accepts` is better + self.expected = expected # XXX deprecate? `accepts` is better + self._accepts = _not_set_marker self.considered_rules = considered_rules self.puppet = puppet - self._all_terminals = all_terminals + self._terminals_by_name = terminals_by_name self.token_history = token_history - super(UnexpectedToken, self).__init__() - + @property def accepts(self): - return self.puppet and self.puppet.accepts() - + if self._accepts is _not_set_marker: + self._accepts = self.puppet and self.puppet.accepts() + return self._accepts + def __str__(self): - # Be aware: Broken __str__ for Exceptions are terrible to debug. Make sure there is as little room as possible for errors message = ("Unexpected token %r at line %s, column %s.\n%s" - % (self.token, self.line, self.column, self._format_terminals(self.accepts or self.expected))) + % (self.token, self.line, self.column, self._format_expected(self.accepts or self.expected))) if self.token_history: message += "Previous tokens: %r\n" % self.token_history @@ -226,6 +227,7 @@ class VisitError(LarkError): - obj: the tree node or token it was processing when the exception was raised - orig_exc: the exception that cause it to fail """ + def __init__(self, rule, obj, orig_exc): self.obj = obj self.orig_exc = orig_exc @@ -233,5 +235,4 @@ class VisitError(LarkError): message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) - ###} diff --git a/lark/lexer.py b/lark/lexer.py index 43176ac..c089e8a 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -81,18 +81,23 @@ class PatternRE(Pattern): class TerminalDef(Serialize): - __serialize_fields__ = 'name', 'pattern', 'priority', 'user_repr' + __serialize_fields__ = 'name', 'pattern', 'priority' __serialize_namespace__ = PatternStr, PatternRE - def __init__(self, name, pattern, priority=1, user_repr=None): + def __init__(self, name, pattern, priority=1): assert isinstance(pattern, Pattern), pattern self.name = name self.pattern = pattern self.priority = priority - self.user_repr = user_repr or name def __repr__(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) + + def user_repr(self): + if self.name.startswith('__'): # We represent a generated terminal + return self.pattern.raw or self.name + else: + return self.name class Token(Str): @@ -312,6 +317,7 @@ class TraditionalLexer(Lexer): self.user_callbacks = conf.callbacks self.g_regex_flags = conf.g_regex_flags self.use_bytes = conf.use_bytes + self.terminals_by_names = conf.terminals_by_names self._mres = None @@ -355,7 +361,7 @@ class TraditionalLexer(Lexer): allowed = {""} raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], - state=parser_state, _all_terminals=self.terminals) + state=parser_state, terminals_by_name=self.terminals_by_names) value, type_ = res @@ -397,10 +403,7 @@ class ContextualLexer(Lexer): def __init__(self, conf, states, always_accept=()): terminals = list(conf.terminals) - tokens_by_name = {} - for t in terminals: - assert t.name not in tokens_by_name, t - tokens_by_name[t.name] = t + tokens_by_name = conf.terminals_by_names trad_conf = copy(conf) trad_conf.terminals = terminals @@ -437,8 +440,13 @@ class ContextualLexer(Lexer): except UnexpectedCharacters as e: # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. # This tests the input against the global context, to provide a nicer error. - token = self.root_lexer.next_token(lexer_state) - raise UnexpectedToken(token, e.allowed, state=parser_state.position, token_history=[lexer_state.last_token], all_terminals=self.root_lexer.terminals) + last_token = lexer_state.last_token # self.root_lexer.next_token will change this to the wrong token + try: + token = self.root_lexer.next_token(lexer_state, parser_state) + except UnexpectedCharacters: + raise e# Don't raise the exception that the root lexer raise. It has the wrong expected set. + else: + raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_names) class LexerThread: """A thread that ties a lexer instance and a lexer state, to be used by the parser""" diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 9f6bf2e..a07769f 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -322,7 +322,6 @@ class PrepareAnonTerminals(Transformer_InPlace): raise GrammarError(u'Conflicting flags for the same terminal: %s' % p) term_name = None - user_repr = p.raw # This will always be ok, independent of what term_name we end up using if isinstance(p, PatternStr): try: @@ -354,7 +353,7 @@ class PrepareAnonTerminals(Transformer_InPlace): if term_name not in self.term_set: assert p not in self.term_reverse self.term_set.add(term_name) - termdef = TerminalDef(term_name, p, user_repr=user_repr) + termdef = TerminalDef(term_name, p) self.term_reverse[p] = termdef self.terminals.append(termdef) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 320b59a..3f537c2 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -302,7 +302,7 @@ class Parser: # this column. Find the item for the start_symbol, which is the root of the SPPF tree. solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] if not solutions: - expected_terminals = [t.expect for t in to_scan] + expected_terminals = [t.expect.name for t in to_scan] raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan)) if self.debug: