rename LexerConf.tokens to LexerConf.terminals
Make Exception message generation lazy
Made a few classes new-style
@@ -1,14 +1,16 @@
+from warnings import warn
 from .utils import Serialize
 from .lexer import TerminalDef
 ###{standalone
 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
+    __serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes'
     __serialize_namespace__ = TerminalDef,
-    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
-        self.tokens = tokens  # TODO should be terminals
+    def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
+        self.terminals = terminals
         self.ignore = ignore
         self.postlex = postlex
         self.callbacks = callbacks or {}
@@ -16,6 +18,11 @@ class LexerConf(Serialize):
         self.re_module = re_module
         self.skip_validation = skip_validation
         self.use_bytes = use_bytes
+    @property
+    def tokens(self):
+        warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning)
+        return self.terminals
 ###}
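
The deprecation shim keeps old call sites working while steering users toward the new name. A minimal, self-contained sketch of the same pattern (the class and attribute names here are illustrative, not lark's API):

    from warnings import warn

    class Config(object):
        def __init__(self, terminals):
            self.terminals = terminals

        @property
        def tokens(self):
            # Deprecated alias: warn, then delegate to the new attribute.
            warn("Config.tokens is deprecated. Use Config.terminals instead", DeprecationWarning)
            return self.terminals

    c = Config(['NUMBER', 'PLUS'])
    assert c.tokens == c.terminals  # emits a DeprecationWarning on access
                                    # (run with `python -W default` to see it)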
@@ -34,6 +34,7 @@ class UnexpectedInput(LarkError):
     After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
     """
     pos_in_stream = None
+    _all_terminals = None
     def get_context(self, text, span=40):
         """Returns a pretty string pinpointing the error in the text,
@@ -109,32 +110,54 @@ class UnexpectedInput(LarkError):
                 candidate = label, False
         return candidate[0]
+    def _format_terminals(self, names):
+        if self._all_terminals:
+            ts = []
+            for name in names:
+                try:
+                    ts.append(next(t.nice_print for t in self._all_terminals if t.name == name))
+                except StopIteration:
+                    # If we don't find the corresponding Terminal (which *should* never happen), don't error.
+                    # A broken __str__ on an Exception is one of the worst bugs, so just fall back to the raw name.
+                    ts.append(name)
+        else:
+            ts = names
+        return "Expected one of: \n\t* %s\n" % '\n\t* '.join(ts)
 class UnexpectedCharacters(LexError, UnexpectedInput):
-    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
+    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, _all_terminals=None):
         self.line = line
         self.column = column
         self.pos_in_stream = lex_pos
         self.state = state
+        self._all_terminals = _all_terminals
         self.allowed = allowed
         self.considered_tokens = considered_tokens
         self.token_history = token_history
         if isinstance(seq, bytes):
-            _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
+            self._s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
         else:
-            _s = seq[lex_pos]
-        message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column)
-        message += '\n\n' + self.get_context(seq)
-        if allowed:
-            message += '\nExpecting: %s\n' % allowed
-        if token_history:
-            message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history)
-        super(UnexpectedCharacters, self).__init__(message)
+            self._s = seq[lex_pos]
+        self._context = self.get_context(seq)
+        super(UnexpectedCharacters, self).__init__()
+    def __str__(self):
+        # Be aware: a broken __str__ for an Exception is terrible to debug. Leave as little room for errors as possible;
+        # otherwise you will get just `UnexpectedCharacters: <str() failed>` or something like that.
+        # If you run into this, add an `except Exception as e: print(e); raise e` or similar.
+        message = "No terminal defined for '%s' at line %d col %d" % (self._s, self.line, self.column)
+        message += '\n\n' + self._context
+        if self.allowed:
+            message += self._format_terminals(self.allowed)
+        if self.token_history:
+            message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history)
+        return message
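
This is the payoff of the lazy pattern: constructing the exception stays cheap, and the (potentially expensive) message is only built if someone actually renders it. A minimal sketch of the idea, independent of lark:

    class LazyError(Exception):
        def __init__(self, token, expected):
            self.token = token
            self.expected = expected
            super(LazyError, self).__init__()

        def __str__(self):
            # Built only when the exception is displayed, e.g. in a traceback.
            # Keep this simple: if __str__ itself raises, Python just shows "<str() failed>".
            return "Unexpected %r, expected one of:\n\t* %s" % (
                self.token, '\n\t* '.join(sorted(self.expected)))

    err = LazyError(';', {'COMMA', 'RPAR'})  # cheap: no string formatting happens here
    print(str(err))                          # the message is produced on demand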
 class UnexpectedToken(ParseError, UnexpectedInput):
     """When the parser throws UnexpectedToken, it instantiates a puppet
@@ -143,7 +166,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):
     see: :ref:`ParserPuppet`.
     """
-    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
+    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, all_terminals=None):
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
         self.pos_in_stream = getattr(token, 'pos_in_stream', None)
@@ -153,16 +176,20 @@ class UnexpectedToken(ParseError, UnexpectedInput):
         self.expected = expected  # XXX deprecate? `accepts` is better
         self.considered_rules = considered_rules
         self.puppet = puppet
-        # TODO Only calculate `accepts()` when we need to display it to the user
-        # This will improve performance when doing automatic error handling
-        self.accepts = puppet and puppet.accepts()
-        message = ("Unexpected token %r at line %s, column %s.\n"
-                   "Expected one of: \n\t* %s\n"
-                   % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))
-        super(UnexpectedToken, self).__init__(message)
+        self._all_terminals = all_terminals
+        super(UnexpectedToken, self).__init__()
+    @property
+    def accepts(self):
+        return self.puppet and self.puppet.accepts()
+    def __str__(self):
+        # Be aware: a broken __str__ for an Exception is terrible to debug. Leave as little room for errors as possible.
+        message = ("Unexpected token %r at line %s, column %s.\n%s"
+                   % (self.token, self.line, self.column, self._format_terminals(self.accepts or self.expected)))
+        return message
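
One subtlety of turning `accepts` into a property: it is recomputed on every access. If that ever shows up in profiles, a memoized variant (a hypothetical alternative, not part of this diff) would compute it once:

    class UnexpectedTokenSketch(Exception):
        def __init__(self, puppet):
            self.puppet = puppet
            self._accepts = None
            super(UnexpectedTokenSketch, self).__init__()

        @property
        def accepts(self):
            # Compute on first access instead of in __init__, then cache.
            if self._accepts is None and self.puppet is not None:
                self._accepts = self.puppet.accepts()
            return self._accepts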
 class VisitError(LarkError):
@@ -402,7 +402,7 @@ class Lark(Serialize):
             self._callbacks,
             self.options,  # Not all, but multiple attributes are used
         )
-        self.terminals = self.parser.lexer_conf.tokens
+        self.terminals = self.parser.lexer_conf.terminals
         self._terminals_dict = {t.name: t for t in self.terminals}
         return self
@@ -76,14 +76,15 @@ class PatternRE(Pattern):
 class TerminalDef(Serialize):
-    __serialize_fields__ = 'name', 'pattern', 'priority'
+    __serialize_fields__ = 'name', 'pattern', 'priority', 'nice_print'
     __serialize_namespace__ = PatternStr, PatternRE
-    def __init__(self, name, pattern, priority=1):
+    def __init__(self, name, pattern, priority=1, nice_print=None):
         assert isinstance(pattern, Pattern), pattern
         self.name = name
         self.pattern = pattern
         self.priority = priority
+        self.nice_print = nice_print or name
     def __repr__(self):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
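
With the new field, a terminal carries a human-friendly label that falls back to its name. A hedged usage sketch (assuming lark 0.11-era imports, where `PatternStr` takes the literal string):

    from lark.lexer import TerminalDef, PatternStr

    comma = TerminalDef('COMMA', PatternStr(','), nice_print="','")
    number = TerminalDef('NUMBER', PatternStr('0'))  # no nice_print: falls back to the name

    print(comma.nice_print)   # ',' -- what error messages will show
    print(number.nice_print)  # NUMBER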
@@ -278,7 +279,7 @@ class Lexer(object):
 class TraditionalLexer(Lexer):
     def __init__(self, conf):
-        terminals = list(conf.tokens)
+        terminals = list(conf.terminals)
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals
         self.re = conf.re_module
@@ -347,7 +348,8 @@ class TraditionalLexer(Lexer):
                 if not allowed:
                     allowed = {"<END-OF-FILE>"}
                 raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
-                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
+                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
+                                           _all_terminals=self.terminals)
             value, type_ = res
@@ -386,14 +388,14 @@ class LexerState:
 class ContextualLexer(Lexer):
     def __init__(self, conf, states, always_accept=()):
-        terminals = list(conf.tokens)
+        terminals = list(conf.terminals)
         tokens_by_name = {}
         for t in terminals:
             assert t.name not in tokens_by_name, t
             tokens_by_name[t.name] = t
         trad_conf = copy(conf)
-        trad_conf.tokens = terminals
+        trad_conf.terminals = terminals
         lexer_by_tokens = {}
         self.lexers = {}
@@ -405,13 +407,13 @@ class ContextualLexer(Lexer):
             accepts = set(accepts) | set(conf.ignore) | set(always_accept)
             state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
             lexer_conf = copy(trad_conf)
-            lexer_conf.tokens = state_tokens
+            lexer_conf.terminals = state_tokens
             lexer = TraditionalLexer(lexer_conf)
             lexer_by_tokens[key] = lexer
             self.lexers[state] = lexer
-        assert trad_conf.tokens is terminals
+        assert trad_conf.terminals is terminals
         self.root_lexer = TraditionalLexer(trad_conf)
     def make_lexer_state(self, text):
| def make_lexer_state(self, text): | |||
| @@ -428,7 +430,7 @@ class ContextualLexer(Lexer): | |||
| # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. | |||
| # This tests the input against the global context, to provide a nicer error. | |||
| token = self.root_lexer.next_token(lexer_state) | |||
| raise UnexpectedToken(token, e.allowed, state=parser_state.position) | |||
| raise UnexpectedToken(token, e.allowed, state=parser_state.position, all_terminals=self.root_lexer.terminals) | |||
| class LexerThread: | |||
| "A thread that ties a lexer instance and a lexer state, to be used by the parser" | |||
@@ -5,7 +5,7 @@ import sys
 from copy import copy, deepcopy
 from io import open
-from .utils import bfs, eval_escaping, Py36, logger, classify_bool
+from .utils import bfs, eval_escaping, Py36, logger, classify_bool, isascii
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
 from .parse_tree_builder import ParseTreeBuilder
@@ -317,8 +317,11 @@ class PrepareAnonTerminals(Transformer_InPlace):
             raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)
         term_name = None
+        nice_print = None
         if isinstance(p, PatternStr):
+            nice_print = repr(value)  # This will always be ok, independent of what term_name we end up using
+            # TODO: potentially try to get the actual source code, and not the repr
             try:
                 # If already defined, use the user-defined terminal name
                 term_name = self.term_reverse[p].name
@@ -327,15 +330,14 @@ class PrepareAnonTerminals(Transformer_InPlace):
                 try:
                     term_name = _TERMINAL_NAMES[value]
                 except KeyError:
-                    if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
-                        with suppress(UnicodeEncodeError):
-                            value.upper().encode('ascii')  # Make sure we don't have unicode in our terminal names
-                            term_name = value.upper()
+                    if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set and isascii(value):
+                        term_name = value.upper()
                 if term_name in self.term_set:
                     term_name = None
         elif isinstance(p, PatternRE):
+            # TODO: generate nice_print
             if p in self.term_reverse:  # Kind of a weird placement
                 term_name = self.term_reverse[p].name
             else:
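
`isascii` replaces the encode-and-suppress dance with a single predicate. The diff doesn't show the helper's body; a plausible version-portable implementation (an assumption, not necessarily lark's exact code) is:

    import sys

    def isascii(s):
        # str.isascii exists on Python 3.7+; fall back to encoding on older versions.
        if sys.version_info >= (3, 7):
            return s.isascii()
        try:
            s.encode('ascii')
            return True
        except (UnicodeEncodeError, UnicodeDecodeError):
            return False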
@@ -348,7 +350,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
         if term_name not in self.term_set:
             assert p not in self.term_reverse
             self.term_set.add(term_name)
-            termdef = TerminalDef(term_name, p)
+            termdef = TerminalDef(term_name, p, nice_print=nice_print)
             self.term_reverse[p] = termdef
             self.terminals.append(termdef)
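
End to end, this is what the change buys in error messages. A hedged usage example (the exact output text depends on the lark version):

    from lark import Lark
    from lark.exceptions import UnexpectedInput

    parser = Lark('start: "a" "," "b"', parser='lalr')
    try:
        parser.parse('a;b')
    except UnexpectedInput as e:
        # With nice_print, the expected-terminals list can show the literal ','
        # rather than an internal name like COMMA or __ANON_0.
        print(e)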
@@ -6,6 +6,7 @@ from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
 from .tree import Tree
 from .common import LexerConf
+from .exceptions import UnexpectedInput
 try:
     import regex
 except ImportError:
@@ -135,7 +136,12 @@ class WithLexer(_ParserFrontend):
         return LexerThread(lexer, text)
     def parse(self, text, start=None):
-        return self._parse(start, self.make_lexer(text))
+        try:
+            return self._parse(start, self.make_lexer(text))
+        except UnexpectedInput as e:
+            if e._all_terminals is None:
+                e._all_terminals = self.lexer_conf.terminals
+            raise e
     def init_traditional_lexer(self):
         self.lexer = TraditionalLexer(self.lexer_conf)
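
Catching, annotating, and re-raising at the frontend is a standard way to attach context the inner layers don't have. A minimal sketch of the pattern (all names here are illustrative):

    class AppError(Exception):
        context = None  # class-level default, filled in lazily by outer layers

    def inner():
        raise AppError("low-level failure")

    def boundary():
        try:
            return inner()
        except AppError as e:
            if e.context is None:        # don't clobber context set deeper down
                e.context = "parsing stage"
            raise                        # re-raise the same exception object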
@@ -190,7 +196,7 @@ class Earley(WithLexer):
 class XEarley(_ParserFrontend):
     def __init__(self, lexer_conf, parser_conf, options=None, **kw):
-        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
+        self.token_by_name = {t.name:t for t in lexer_conf.terminals}
         self.start = parser_conf.start
         self._prepare_match(lexer_conf)
@@ -211,7 +217,7 @@ class XEarley(_ParserFrontend):
     def _prepare_match(self, lexer_conf):
         self.regexps = {}
-        for t in lexer_conf.tokens:
+        for t in lexer_conf.terminals:
             if t.priority != 1:
                 raise ValueError("Dynamic Earley doesn't support weights on terminals", t, t.priority)
             regexp = t.pattern.to_regexp()
@@ -228,7 +234,12 @@ class XEarley(_ParserFrontend):
             self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
     def parse(self, text, start):
-        return self._parse(start, text)
+        try:
+            return self._parse(start, text)
+        except UnexpectedInput as e:
+            if e._all_terminals is None:
+                e._all_terminals = self.token_by_name.values()
+            raise e
 class XEarley_CompleteLex(XEarley):
     def __init__(self, *args, **kw):
@@ -35,7 +35,7 @@ class LALR_Parser(object):
         return self.parser.parse(*args)
-class ParseConf:
+class ParseConf(object):
     __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'
     def __init__(self, parse_table, callbacks, start):
@@ -49,7 +49,7 @@ class ParseConf:
         self.start = start
-class ParserState:
+class ParserState(object):
     __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'
     def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):
@@ -117,7 +117,7 @@ class ParserState:
             if is_end and state_stack[-1] == end_state:
                 return value_stack[-1]
-class _Parser:
+class _Parser(object):
     def __init__(self, parse_table, callbacks, debug=False):
         self.parse_table = parse_table
         self.callbacks = callbacks
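
The `(object)` bases matter only under Python 2, where a plain `class C:` is an old-style class and `__slots__` is silently ignored. A small demonstration of the difference (runnable on both Python 2 and 3):

    class Old:                 # old-style under Python 2; new-style under Python 3
        __slots__ = ('x',)

    class New(object):         # new-style under both
        __slots__ = ('x',)

    o = Old()
    try:
        o.y = 1                # Python 2: allowed, __slots__ silently ignored
    except AttributeError:     # Python 3: raises, since __slots__ is enforced
        pass

    n = New()
    try:
        n.y = 1                # raises on both: no __dict__, only the declared slots
    except AttributeError:
        pass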
@@ -60,7 +60,7 @@ class ParserPuppet(object):
         Updated by ``feed_token()``.
         """
-        return self.parser_state.parse_table.states[self.parser_state.position]
+        return self.parser_state.parse_conf.parse_table.states[self.parser_state.position]
     def accepts(self):
         accepts = set()