diff --git a/lark-stubs/lexer.pyi b/lark-stubs/lexer.pyi index 3f246fb..6b4771a 100644 --- a/lark-stubs/lexer.pyi +++ b/lark-stubs/lexer.pyi @@ -12,6 +12,7 @@ _T = TypeVar('_T') class Pattern(ABC): value: str flags: Collection[str] + raw: str def __init__(self, value: str, flags: Collection[str] = ...): ... @@ -73,6 +74,8 @@ class TerminalDef: def __init__(self, name: str, pattern: Pattern, priority: int = ...): ... + + def user_repr(self) -> str: ... class Token(str): diff --git a/lark/common.py b/lark/common.py index e217063..467acf8 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,3 +1,5 @@ +from warnings import warn + from .utils import Serialize from .lexer import TerminalDef @@ -5,11 +7,13 @@ from .lexer import TerminalDef class LexerConf(Serialize): - __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type' + __serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type' __serialize_namespace__ = TerminalDef, - def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): - self.tokens = tokens # TODO should be terminals + def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): + self.terminals = terminals + self.terminals_by_name = {t.name: t for t in self.terminals} + assert len(self.terminals) == len(self.terminals_by_name) self.ignore = ignore self.postlex = postlex self.callbacks = callbacks or {} @@ -17,9 +21,17 @@ class LexerConf(Serialize): self.re_module = re_module self.skip_validation = skip_validation self.use_bytes = use_bytes - self.lexer_type = None + @property + def tokens(self): + warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning) + return self.terminals + + def _deserialize(self): + self.terminals_by_name = {t.name: t for t in self.terminals} + + class ParserConf(Serialize): __serialize_fields__ = 'rules', 'start', 'parser_type' diff --git a/lark/exceptions.py b/lark/exceptions.py index 46740ed..23e78b9 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,4 +1,5 @@ -from .utils import STRING_TYPE, logger +from .utils import STRING_TYPE, logger, NO_VALUE + ###{standalone @@ -39,6 +40,7 @@ class UnexpectedInput(LarkError): After catching one of these exceptions, you may call the following helper methods to create a nicer error message. """ pos_in_stream = None + _terminals_by_name = None def get_context(self, text, span=40): """Returns a pretty string pinpointing the error in the text, @@ -95,7 +97,7 @@ class UnexpectedInput(LarkError): if ut.state == self.state: if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts: logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % - (self.state, self.accepts, ut.accepts, i, j)) + (self.state, self.accepts, ut.accepts, i, j)) continue try: if ut.token == self.token: # Try exact match first @@ -116,44 +118,61 @@ class UnexpectedInput(LarkError): return candidate[0] + def _format_expected(self, expected): + if self._terminals_by_name: + expected = [self._terminals_by_name[t_name].user_repr() for t_name in expected] + return "Expected one of: \n\t* %s\n" % '\n\t* '.join(expected) + + class UnexpectedEOF(ParseError, UnexpectedInput): - def __init__(self, expected, state=None): + def __init__(self, expected, state=None, terminals_by_name=None): self.expected = expected self.state = state from .lexer import Token - self.token = Token("", "") #, line=-1, column=-1, pos_in_stream=-1) + self.token = Token("", "") # , line=-1, column=-1, pos_in_stream=-1) self.pos_in_stream = -1 self.line = -1 self.column = -1 + self._terminals_by_name = terminals_by_name - message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected)) - super(UnexpectedEOF, self).__init__(message) + super(UnexpectedEOF, self).__init__() + + def __str__(self): + message = "Unexpected end-of-input. " + message += self._format_expected(self.expected) + return message class UnexpectedCharacters(LexError, UnexpectedInput): - def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, + terminals_by_name=None): # TODO considered_tokens and allowed can be figured out using state self.line = line self.column = column self.pos_in_stream = lex_pos self.state = state + self._terminals_by_name = terminals_by_name self.allowed = allowed self.considered_tokens = considered_tokens + self.token_history = token_history if isinstance(seq, bytes): - _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") + self.char = seq[lex_pos:lex_pos + 1].decode("ascii", "backslashreplace") else: - _s = seq[lex_pos] + self.char = seq[lex_pos] + self._context = self.get_context(seq) - message = "No terminal defined for %r at line %d col %d" % (_s, line, column) - message += '\n\n' + self.get_context(seq) - if allowed: - message += '\nExpecting: %s\n' % allowed - if token_history: - message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history) + super(UnexpectedCharacters, self).__init__() - super(UnexpectedCharacters, self).__init__(message) + def __str__(self): + message = "No terminal defined for '%s' at line %d col %d" % (self.char, self.line, self.column) + message += '\n\n' + self._context + if self.allowed: + message += self._format_expected(self.allowed) + if self.token_history: + message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history) + return message class UnexpectedToken(ParseError, UnexpectedInput): @@ -163,7 +182,8 @@ class UnexpectedToken(ParseError, UnexpectedInput): see: :ref:`ParserPuppet`. """ - def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None): + + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, terminals_by_name=None, token_history=None): # TODO considered_rules and expected can be figured out using state self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') @@ -171,23 +191,28 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.state = state self.token = token - self.expected = expected # XXX deprecate? `accepts` is better + self.expected = expected # XXX deprecate? `accepts` is better + self._accepts = NO_VALUE self.considered_rules = considered_rules self.puppet = puppet + self._terminals_by_name = terminals_by_name self.token_history = token_history - # TODO Only calculate `accepts()` when we need to display it to the user - # This will improve performance when doing automatic error handling - self.accepts = puppet and puppet.accepts() + super(UnexpectedToken, self).__init__() - message = ("Unexpected token %r at line %s, column %s.\n" - "Expected one of: \n\t* %s\n" - % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) + @property + def accepts(self): + if self._accepts is NO_VALUE: + self._accepts = self.puppet and self.puppet.accepts() + return self._accepts + def __str__(self): + message = ("Unexpected token %r at line %s, column %s.\n%s" + % (self.token, self.line, self.column, self._format_expected(self.accepts or self.expected))) if self.token_history: - message += "Previous tokens: %r\n" % token_history + message += "Previous tokens: %r\n" % self.token_history - super(UnexpectedToken, self).__init__(message) + return message class VisitError(LarkError): @@ -197,6 +222,7 @@ class VisitError(LarkError): - obj: the tree node or token it was processing when the exception was raised - orig_exc: the exception that cause it to fail """ + def __init__(self, rule, obj, orig_exc): self.obj = obj self.orig_exc = orig_exc @@ -204,5 +230,4 @@ class VisitError(LarkError): message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) - ###} diff --git a/lark/lark.py b/lark/lark.py index 842df5f..c08eae6 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -416,7 +416,7 @@ class Lark(Serialize): self._callbacks, self.options, # Not all, but multiple attributes are used ) - self.terminals = self.parser.lexer_conf.tokens + self.terminals = self.parser.lexer_conf.terminals self._terminals_dict = {t.name: t for t in self.terminals} return self diff --git a/lark/lexer.py b/lark/lexer.py index 63735e9..114b4ce 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -11,9 +11,10 @@ from copy import copy class Pattern(Serialize): - def __init__(self, value, flags=()): + def __init__(self, value, flags=(), raw=None): self.value = value self.flags = frozenset(flags) + self.raw = raw def __repr__(self): return repr(self.to_regexp()) @@ -92,6 +93,12 @@ class TerminalDef(Serialize): def __repr__(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) + def user_repr(self): + if self.name.startswith('__'): # We represent a generated terminal + return self.pattern.raw or self.name + else: + return self.name + class Token(Str): """A string with meta-information, that is produced by the lexer. @@ -283,7 +290,7 @@ class Lexer(object): class TraditionalLexer(Lexer): def __init__(self, conf): - terminals = list(conf.tokens) + terminals = list(conf.terminals) assert all(isinstance(t, TerminalDef) for t in terminals), terminals self.re = conf.re_module @@ -310,6 +317,7 @@ class TraditionalLexer(Lexer): self.user_callbacks = conf.callbacks self.g_regex_flags = conf.g_regex_flags self.use_bytes = conf.use_bytes + self.terminals_by_name = conf.terminals_by_name self._mres = None @@ -353,7 +361,7 @@ class TraditionalLexer(Lexer): allowed = {""} raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], - state=parser_state) + state=parser_state, terminals_by_name=self.terminals_by_name) value, type_ = res @@ -394,14 +402,11 @@ class LexerState: class ContextualLexer(Lexer): def __init__(self, conf, states, always_accept=()): - terminals = list(conf.tokens) - tokens_by_name = {} - for t in terminals: - assert t.name not in tokens_by_name, t - tokens_by_name[t.name] = t + terminals = list(conf.terminals) + terminals_by_name = conf.terminals_by_name trad_conf = copy(conf) - trad_conf.tokens = terminals + trad_conf.terminals = terminals lexer_by_tokens = {} self.lexers = {} @@ -411,15 +416,14 @@ class ContextualLexer(Lexer): lexer = lexer_by_tokens[key] except KeyError: accepts = set(accepts) | set(conf.ignore) | set(always_accept) - state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] lexer_conf = copy(trad_conf) - lexer_conf.tokens = state_tokens + lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name] lexer = TraditionalLexer(lexer_conf) lexer_by_tokens[key] = lexer self.lexers[state] = lexer - assert trad_conf.tokens is terminals + assert trad_conf.terminals is terminals self.root_lexer = TraditionalLexer(trad_conf) def make_lexer_state(self, text): @@ -435,9 +439,12 @@ class ContextualLexer(Lexer): except UnexpectedCharacters as e: # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. # This tests the input against the global context, to provide a nicer error. - token = self.root_lexer.next_token(lexer_state, parser_state) - raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token]) - + try: + last_token = lexer_state.last_token # Save last_token. Calling root_lexer.next_token will change this to the wrong token + token = self.root_lexer.next_token(lexer_state, parser_state) + raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_name) + except UnexpectedCharacters: + raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set. class LexerThread: """A thread that ties a lexer instance and a lexer state, to be used by the parser""" diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 76834f4..a07769f 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -454,9 +454,9 @@ def _literal_to_pattern(literal): if literal.type == 'STRING': s = s.replace('\\\\', '\\') - return PatternStr(s, flags) + return PatternStr(s, flags, raw=literal.value) elif literal.type == 'REGEXP': - return PatternRE(s, flags) + return PatternRE(s, flags, raw=literal.value) else: assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 5cffdb1..0fab159 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -170,7 +170,7 @@ CYK_FrontEnd = NotImplemented class EarleyRegexpMatcher: def __init__(self, lexer_conf): self.regexps = {} - for t in lexer_conf.tokens: + for t in lexer_conf.terminals: if t.priority != 1: raise GrammarError("Dynamic Earley doesn't support weights on terminals", t, t.priority) regexp = t.pattern.to_regexp() diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 320b59a..3f537c2 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -302,7 +302,7 @@ class Parser: # this column. Find the item for the start_symbol, which is the root of the SPPF tree. solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] if not solutions: - expected_terminals = [t.expect for t in to_scan] + expected_terminals = [t.expect.name for t in to_scan] raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan)) if self.debug: diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index f7ff8fe..9f08b81 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -36,7 +36,7 @@ class LALR_Parser(Serialize): return self.parser.parse(*args) -class ParseConf: +class ParseConf(object): __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states' def __init__(self, parse_table, callbacks, start): @@ -50,7 +50,7 @@ class ParseConf: self.start = start -class ParserState: +class ParserState(object): __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack' def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None): @@ -124,7 +124,7 @@ class ParserState: if is_end and state_stack[-1] == end_state: return value_stack[-1] -class _Parser: +class _Parser(object): def __init__(self, parse_table, callbacks, debug=False): self.parse_table = parse_table self.callbacks = callbacks diff --git a/lark/utils.py b/lark/utils.py index 3b5b8a8..642a59f 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -13,6 +13,7 @@ logger.setLevel(logging.CRITICAL) Py36 = (sys.version_info[:2] >= (3, 6)) +NO_VALUE = object() def classify(seq, key=None, value=None): d = {}