Rename LexerConf.tokens to LexerConf.terminals
Make exception message generation lazy
Make a few classes new-style

@@ -1,14 +1,16 @@
+from warnings import warn
+
 from .utils import Serialize
 from .lexer import TerminalDef

 ###{standalone

 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
+    __serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes'
     __serialize_namespace__ = TerminalDef,

-    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
-        self.tokens = tokens  # TODO should be terminals
+    def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
+        self.terminals = terminals
         self.ignore = ignore
         self.postlex = postlex
         self.callbacks = callbacks or {}

@@ -16,6 +18,11 @@ class LexerConf(Serialize):
         self.re_module = re_module
         self.skip_validation = skip_validation
         self.use_bytes = use_bytes

+    @property
+    def tokens(self):
+        warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning)
+        return self.terminals
+
 ###}
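
A note on the pattern above: the old attribute survives as a read-only property, so existing callers keep working while being nudged to migrate. A minimal, self-contained sketch of the same idea (the `Config` class here is illustrative, not part of lark):

    from warnings import warn

    class Config(object):
        def __init__(self, terminals):
            self.terminals = terminals

        @property
        def tokens(self):
            # Old name kept as an alias; warn so callers can migrate.
            warn("Config.tokens is deprecated. Use Config.terminals instead",
                 DeprecationWarning, stacklevel=2)
            return self.terminals

    cfg = Config(['NUMBER', 'PLUS'])
    assert cfg.tokens is cfg.terminals  # works, but emits a DeprecationWarning

Since the property has no setter, assignment to `.tokens` now fails; that is presumably why the diff below updates every write site (`trad_conf.terminals = terminals`, `lexer_conf.terminals = state_tokens`) rather than relying on the alias.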
@@ -34,6 +34,7 @@ class UnexpectedInput(LarkError):
     After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
     """
     pos_in_stream = None
+    _all_terminals = None

     def get_context(self, text, span=40):
         """Returns a pretty string pinpointing the error in the text,
@@ -109,32 +110,54 @@ class UnexpectedInput(LarkError):
                             candidate = label, False

         return candidate[0]

+    def _format_terminals(self, names):
+        if self._all_terminals:
+            ts = []
+            for name in names:
+                try:
+                    ts.append(next(t.nice_print for t in self._all_terminals if t.name == name))
+                except StopIteration:
+                    # If we can't find the matching TerminalDef (which *should* never happen), don't error out.
+                    # A broken __str__ on an Exception is one of the worst kinds of bugs.
+                    ts.append(name)
+        else:
+            ts = names
+        return "Expected one of: \n\t* %s\n" % '\n\t* '.join(ts)
+

 class UnexpectedCharacters(LexError, UnexpectedInput):
-    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
+    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, _all_terminals=None):
         self.line = line
         self.column = column
         self.pos_in_stream = lex_pos
         self.state = state
+        self._all_terminals = _all_terminals

         self.allowed = allowed
         self.considered_tokens = considered_tokens
         self.token_history = token_history

         if isinstance(seq, bytes):
-            _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
+            self._s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
         else:
-            _s = seq[lex_pos]
-
-        message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column)
-        message += '\n\n' + self.get_context(seq)
-        if allowed:
-            message += '\nExpecting: %s\n' % allowed
-        if token_history:
-            message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history)
-
-        super(UnexpectedCharacters, self).__init__(message)
+            self._s = seq[lex_pos]
+        self._context = self.get_context(seq)
+
+        super(UnexpectedCharacters, self).__init__()
+
+    def __str__(self):
+        # Be aware: a broken __str__ on an Exception is terrible to debug. Leave as little room for errors as possible.
+        # Otherwise all you will see is `UnexpectedCharacters: <str() failed>` or something similar.
+        # If you run into that, wrap the formatting in `except Exception as e: print(e); raise e` (or similar) to find the cause.
+        message = "No terminal defined for '%s' at line %d col %d" % (self._s, self.line, self.column)
+        message += '\n\n' + self._context
+        if self.allowed:
+            message += self._format_terminals(self.allowed)
+        if self.token_history:
+            message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history)
+        return message


 class UnexpectedToken(ParseError, UnexpectedInput):
     """When the parser throws UnexpectedToken, it instantiates a puppet
@@ -143,7 +166,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):
     see: :ref:`ParserPuppet`.
     """
-    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
+    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, all_terminals=None):
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
         self.pos_in_stream = getattr(token, 'pos_in_stream', None)
@@ -153,16 +176,20 @@ class UnexpectedToken(ParseError, UnexpectedInput):
         self.expected = expected  # XXX deprecate? `accepts` is better
         self.considered_rules = considered_rules
         self.puppet = puppet
-
-        # TODO Only calculate `accepts()` when we need to display it to the user
-        # This will improve performance when doing automatic error handling
-        self.accepts = puppet and puppet.accepts()
-
-        message = ("Unexpected token %r at line %s, column %s.\n"
-                   "Expected one of: \n\t* %s\n"
-                   % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))
-
-        super(UnexpectedToken, self).__init__(message)
+        self._all_terminals = all_terminals
+
+        super(UnexpectedToken, self).__init__()
+
+    @property
+    def accepts(self):
+        return self.puppet and self.puppet.accepts()
+
+    def __str__(self):
+        # Be aware: a broken __str__ on an Exception is terrible to debug. Leave as little room for errors as possible.
+        message = ("Unexpected token %r at line %s, column %s.\n%s"
+                   % (self.token, self.line, self.column, self._format_terminals(self.accepts or self.expected)))
+        return message


 class VisitError(LarkError):
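
The point of moving message formatting from __init__ into __str__: parsers and automatic error handling can raise and swallow many of these exceptions without ever displaying them, so building the message eagerly is wasted work. A rough standalone sketch of the pattern (names are illustrative):

    class LazyMessageError(Exception):
        def __init__(self, token, expected):
            super(LazyMessageError, self).__init__()
            self.token = token
            self.expected = expected

        def __str__(self):
            # Runs only if the exception is actually printed or logged.
            return "Unexpected token %r.\nExpected one of:\n\t* %s" % (
                self.token, '\n\t* '.join(self.expected))

    for attempt in range(1000):
        try:
            raise LazyMessageError('x', ['NUMBER', 'NAME'])
        except LazyMessageError:
            pass  # recovered; no string formatting ever happened

Turning `accepts` into a property serves the same goal: `puppet.accepts()` is comparatively expensive and is now only computed when the message is rendered (or a caller asks for it). The flip side, as the comments warn, is that any bug inside __str__ surfaces as an opaque `<str() failed>`, so the formatting code has to stay as simple as possible.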
@@ -402,7 +402,7 @@ class Lark(Serialize):
             self._callbacks,
            self.options,  # Not all, but multiple attributes are used
         )
-        self.terminals = self.parser.lexer_conf.tokens
+        self.terminals = self.parser.lexer_conf.terminals
         self._terminals_dict = {t.name: t for t in self.terminals}
         return self
@@ -76,14 +76,15 @@ class PatternRE(Pattern):

 class TerminalDef(Serialize):
-    __serialize_fields__ = 'name', 'pattern', 'priority'
+    __serialize_fields__ = 'name', 'pattern', 'priority', 'nice_print'
     __serialize_namespace__ = PatternStr, PatternRE

-    def __init__(self, name, pattern, priority=1):
+    def __init__(self, name, pattern, priority=1, nice_print=None):
         assert isinstance(pattern, Pattern), pattern
         self.name = name
         self.pattern = pattern
         self.priority = priority
+        self.nice_print = nice_print or name

     def __repr__(self):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
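
With `nice_print` available on TerminalDef, anonymous string terminals can be reported by the literal the user wrote instead of a generated name. A small illustration, assuming the module layout shown in this diff and the patch applied:

    from lark.lexer import TerminalDef, PatternStr

    anon = TerminalDef('PLUS', PatternStr('+'), nice_print=repr('+'))
    named = TerminalDef('NUMBER', PatternStr('0'))

    assert anon.nice_print == "'+'"      # users see the literal they wrote
    assert named.nice_print == 'NUMBER'  # falls back to the terminal's name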
@@ -278,7 +279,7 @@ class Lexer(object):

 class TraditionalLexer(Lexer):

     def __init__(self, conf):
-        terminals = list(conf.tokens)
+        terminals = list(conf.terminals)
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals

         self.re = conf.re_module

@@ -347,7 +348,8 @@ class TraditionalLexer(Lexer):
                if not allowed:
                    allowed = {"<END-OF-FILE>"}
                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
-                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
+                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
+                                           _all_terminals=self.terminals)

            value, type_ = res

@@ -386,14 +388,14 @@ class LexerState:

 class ContextualLexer(Lexer):

     def __init__(self, conf, states, always_accept=()):
-        terminals = list(conf.tokens)
+        terminals = list(conf.terminals)
         tokens_by_name = {}
         for t in terminals:
             assert t.name not in tokens_by_name, t
             tokens_by_name[t.name] = t

         trad_conf = copy(conf)
-        trad_conf.tokens = terminals
+        trad_conf.terminals = terminals

         lexer_by_tokens = {}
         self.lexers = {}

@@ -405,13 +407,13 @@ class ContextualLexer(Lexer):
                accepts = set(accepts) | set(conf.ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
                lexer_conf = copy(trad_conf)
-                lexer_conf.tokens = state_tokens
+                lexer_conf.terminals = state_tokens
                lexer = TraditionalLexer(lexer_conf)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

-        assert trad_conf.tokens is terminals
+        assert trad_conf.terminals is terminals
         self.root_lexer = TraditionalLexer(trad_conf)

     def make_lexer_state(self, text):

@@ -428,7 +430,7 @@ class ContextualLexer(Lexer):
            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
            # This tests the input against the global context, to provide a nicer error.
            token = self.root_lexer.next_token(lexer_state)
-            raise UnexpectedToken(token, e.allowed, state=parser_state.position)
+            raise UnexpectedToken(token, e.allowed, state=parser_state.position, all_terminals=self.root_lexer.terminals)


 class LexerThread:
     "A thread that ties a lexer instance and a lexer state, to be used by the parser"
@@ -5,7 +5,7 @@ import sys
 from copy import copy, deepcopy
 from io import open

-from .utils import bfs, eval_escaping, Py36, logger, classify_bool
+from .utils import bfs, eval_escaping, Py36, logger, classify_bool, isascii
 from .lexer import Token, TerminalDef, PatternStr, PatternRE

 from .parse_tree_builder import ParseTreeBuilder

@@ -317,8 +317,11 @@ class PrepareAnonTerminals(Transformer_InPlace):
            raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)

         term_name = None
+        nice_print = None

         if isinstance(p, PatternStr):
+            nice_print = repr(value)  # This will always be ok, independent of what term_name we end up using
+            # TODO: potentially try to get the actual source code, and not the repr
             try:
                 # If already defined, use the user-defined terminal name
                 term_name = self.term_reverse[p].name

@@ -327,15 +330,14 @@ class PrepareAnonTerminals(Transformer_InPlace):
                try:
                    term_name = _TERMINAL_NAMES[value]
                except KeyError:
-                    if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
-                        with suppress(UnicodeEncodeError):
-                            value.upper().encode('ascii')  # Make sure we don't have unicode in our terminal names
-                            term_name = value.upper()
+                    if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set and isascii(value):
+                        term_name = value.upper()

                if term_name in self.term_set:
                    term_name = None

         elif isinstance(p, PatternRE):
+            # TODO: generate a nice_print for regex terminals as well
             if p in self.term_reverse:  # Kind of a weird placement.name
                 term_name = self.term_reverse[p].name
             else:

@@ -348,7 +350,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
         if term_name not in self.term_set:
             assert p not in self.term_reverse
             self.term_set.add(term_name)
-            termdef = TerminalDef(term_name, p)
+            termdef = TerminalDef(term_name, p, nice_print=nice_print)
             self.term_reverse[p] = termdef
             self.terminals.append(termdef)
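
The `with suppress(UnicodeEncodeError)` dance becomes a plain boolean guard via an `isascii` helper imported from `.utils`. Its definition is not part of this diff; a plausible version consistent with how it is used here (`str.isascii` itself only exists on Python 3.7+):

    try:
        isascii = str.isascii  # Python 3.7+
    except AttributeError:
        def isascii(s):
            # Fallback: a string is ASCII iff it survives an ascii encode.
            try:
                s.encode('ascii')
                return True
            except (UnicodeDecodeError, UnicodeEncodeError):
                return False

Behavior is unchanged either way: non-ASCII values still don't become terminal names. The old code relied on the exception aborting the `with` block before `term_name` was assigned; the new guard states that intent directly.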
@@ -6,6 +6,7 @@ from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
 from .tree import Tree
 from .common import LexerConf
+from .exceptions import UnexpectedInput

 try:
     import regex
 except ImportError:

@@ -135,7 +136,12 @@ class WithLexer(_ParserFrontend):
         return LexerThread(lexer, text)

     def parse(self, text, start=None):
-        return self._parse(start, self.make_lexer(text))
+        try:
+            return self._parse(start, self.make_lexer(text))
+        except UnexpectedInput as e:
+            if e._all_terminals is None:
+                e._all_terminals = self.lexer_conf.terminals
+            raise e

     def init_traditional_lexer(self):
         self.lexer = TraditionalLexer(self.lexer_conf)
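
The try/except wrapper illustrates a general shape: enrich an in-flight exception at a layer boundary, but only when a lower layer (here, the lexer) has not already attached better data. A hypothetical standalone helper with the same shape (`with_terminal_info` is illustrative, not part of lark):

    from lark.exceptions import UnexpectedInput

    def with_terminal_info(parse_fn, terminals):
        # Wrap parse_fn so errors escaping it carry display data.
        def parse_wrapped(*args, **kwargs):
            try:
                return parse_fn(*args, **kwargs)
            except UnexpectedInput as e:
                if e._all_terminals is None:  # never clobber lexer-provided data
                    e._all_terminals = terminals
                raise  # bare raise keeps the original traceback
        return parse_wrapped

One detail worth noting: the diff uses `raise e`, which on Python 2 resets the traceback to the re-raise site; a bare `raise` preserves it on both major versions (Python 3 keeps it either way, since the traceback lives on the exception object).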
@@ -190,7 +196,7 @@ class Earley(WithLexer):

 class XEarley(_ParserFrontend):
     def __init__(self, lexer_conf, parser_conf, options=None, **kw):
-        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
+        self.token_by_name = {t.name:t for t in lexer_conf.terminals}
         self.start = parser_conf.start

         self._prepare_match(lexer_conf)

@@ -211,7 +217,7 @@ class XEarley(_ParserFrontend):
     def _prepare_match(self, lexer_conf):
         self.regexps = {}
-        for t in lexer_conf.tokens:
+        for t in lexer_conf.terminals:
             if t.priority != 1:
                 raise ValueError("Dynamic Earley doesn't support weights on terminals", t, t.priority)
             regexp = t.pattern.to_regexp()

@@ -228,7 +234,12 @@ class XEarley(_ParserFrontend):
            self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

     def parse(self, text, start):
-        return self._parse(start, text)
+        try:
+            return self._parse(start, text)
+        except UnexpectedInput as e:
+            if e._all_terminals is None:
+                e._all_terminals = self.token_by_name.values()
+            raise e


 class XEarley_CompleteLex(XEarley):
     def __init__(self, *args, **kw):

@@ -35,7 +35,7 @@ class LALR_Parser(object):
         return self.parser.parse(*args)


-class ParseConf:
+class ParseConf(object):
     __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'

     def __init__(self, parse_table, callbacks, start):

@@ -49,7 +49,7 @@ class ParseConf:
         self.start = start


-class ParserState:
+class ParserState(object):
     __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'

     def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):

@@ -117,7 +117,7 @@ class ParserState:
            if is_end and state_stack[-1] == end_state:
                return value_stack[-1]


-class _Parser:
+class _Parser(object):
     def __init__(self, parse_table, callbacks, debug=False):
         self.parse_table = parse_table
         self.callbacks = callbacks
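
The added `(object)` bases are not cosmetic: these classes declare __slots__, and on Python 2 __slots__ only takes effect for new-style classes, so the old-style `class ParseConf:` silently kept a per-instance __dict__. A quick demonstration that runs on either major version:

    class Old:                 # old-style on Python 2: __slots__ is ignored there
        __slots__ = ('a',)

    class New(object):         # new-style everywhere: __slots__ is enforced
        __slots__ = ('a',)

    try:
        New().b = 1
    except AttributeError:
        print('New is really slotted')

    try:
        Old().b = 1            # succeeds on Py2 (stray __dict__), raises on Py3
        print('Old silently carries a __dict__ (Python 2)')
    except AttributeError:
        print('Old is slotted too (Python 3)')

For objects created in the parser's hot loop, making __slots__ actually apply saves memory and speeds up attribute access.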
@@ -60,7 +60,7 @@ class ParserPuppet(object):

         Updated by ``feed_token()``.
         """
-        return self.parser_state.parse_table.states[self.parser_state.position]
+        return self.parser_state.parse_conf.parse_table.states[self.parser_state.position]

     def accepts(self):
         accepts = set()