Rename LexerConf.tokens to LexerConf.terminals. Make exception message generation lazy. Make a few classes new-style.
tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.2
@@ -1,14 +1,16 @@ | |||||
from warnings import warn | |||||
from .utils import Serialize | from .utils import Serialize | ||||
from .lexer import TerminalDef | from .lexer import TerminalDef | ||||
###{standalone | ###{standalone | ||||
class LexerConf(Serialize): | class LexerConf(Serialize): | ||||
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes' | |||||
__serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes' | |||||
__serialize_namespace__ = TerminalDef, | __serialize_namespace__ = TerminalDef, | ||||
def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): | |||||
self.tokens = tokens # TODO should be terminals | |||||
def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): | |||||
self.terminals = terminals | |||||
self.ignore = ignore | self.ignore = ignore | ||||
self.postlex = postlex | self.postlex = postlex | ||||
self.callbacks = callbacks or {} | self.callbacks = callbacks or {} | ||||
@@ -16,6 +18,11 @@ class LexerConf(Serialize): | |||||
self.re_module = re_module | self.re_module = re_module | ||||
self.skip_validation = skip_validation | self.skip_validation = skip_validation | ||||
self.use_bytes = use_bytes | self.use_bytes = use_bytes | ||||
@property
def tokens(self):
    """Deprecated read-only alias for ``terminals``.

    Issues a ``DeprecationWarning`` and returns the object stored in
    ``self.terminals``.
    """
    # stacklevel=2 attributes the warning to the code that accessed
    # ``.tokens`` instead of to this property, so the offending call site
    # is what shows up in the warning output.
    warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning, stacklevel=2)
    return self.terminals
###} | ###} | ||||
@@ -34,6 +34,7 @@ class UnexpectedInput(LarkError): | |||||
After catching one of these exceptions, you may call the following helper methods to create a nicer error message. | After catching one of these exceptions, you may call the following helper methods to create a nicer error message. | ||||
""" | """ | ||||
pos_in_stream = None | pos_in_stream = None | ||||
_all_terminals = None | |||||
def get_context(self, text, span=40): | def get_context(self, text, span=40): | ||||
"""Returns a pretty string pinpointing the error in the text, | """Returns a pretty string pinpointing the error in the text, | ||||
@@ -109,32 +110,54 @@ class UnexpectedInput(LarkError): | |||||
candidate = label, False | candidate = label, False | ||||
return candidate[0] | return candidate[0] | ||||
def _format_terminals(self, names):
    """Render the "Expected one of" section of an error message.

    ``names`` is an iterable of terminal names. When ``self._all_terminals``
    is set, each name is replaced by the ``nice_print`` of the first terminal
    carrying that name; a name with no matching terminal is shown unchanged.
    When ``self._all_terminals`` is empty or ``None`` the names are used as-is.

    Returns the formatted, newline-terminated string.
    """
    labels = names
    if self._all_terminals:
        # Build the lookup once instead of scanning every terminal per name.
        # setdefault keeps the first terminal seen for a given name.
        nice_by_name = {}
        for term in self._all_terminals:
            nice_by_name.setdefault(term.name, term.nice_print)
        # Falling back to the raw name keeps __str__ of the exception from
        # raising when a terminal is unexpectedly missing.
        labels = [nice_by_name.get(name, name) for name in names]
    return "Expected one of: \n\t* %s\n" % '\n\t* '.join(labels)
class UnexpectedCharacters(LexError, UnexpectedInput): | class UnexpectedCharacters(LexError, UnexpectedInput): | ||||
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): | |||||
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, _all_terminals=None): | |||||
self.line = line | self.line = line | ||||
self.column = column | self.column = column | ||||
self.pos_in_stream = lex_pos | self.pos_in_stream = lex_pos | ||||
self.state = state | self.state = state | ||||
self._all_terminals = _all_terminals | |||||
self.allowed = allowed | self.allowed = allowed | ||||
self.considered_tokens = considered_tokens | self.considered_tokens = considered_tokens | ||||
self.token_history = token_history | |||||
if isinstance(seq, bytes): | if isinstance(seq, bytes): | ||||
_s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") | |||||
self._s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") | |||||
else: | else: | ||||
_s = seq[lex_pos] | |||||
message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column) | |||||
message += '\n\n' + self.get_context(seq) | |||||
if allowed: | |||||
message += '\nExpecting: %s\n' % allowed | |||||
if token_history: | |||||
message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history) | |||||
super(UnexpectedCharacters, self).__init__(message) | |||||
self._s = seq[lex_pos] | |||||
self._context = self.get_context(seq) | |||||
super(UnexpectedCharacters, self).__init__() | |||||
def __str__(self):
    """Build the human-readable error message on demand.

    The message names the offending character and its line/column, shows
    the stored context, and then optionally lists the allowed terminals
    and the previous tokens.
    """
    # Keep this method as simple as possible: an exception whose __str__
    # itself raises is reported only as `<str() failed>` (or similar) and is
    # very hard to debug. If that happens, temporarily wrap the body in
    # `except Exception as e: print(e); raise e` to see the real error.
    pieces = [
        "No terminal defined for '%s' at line %d col %d" % (self._s, self.line, self.column),
        '\n\n',
        self._context,
    ]
    if self.allowed:
        pieces.append(self._format_terminals(self.allowed))
    if self.token_history:
        history = ', '.join([repr(tok) for tok in self.token_history])
        pieces.append('\nPrevious tokens: %s\n' % history)
    return ''.join(pieces)
class UnexpectedToken(ParseError, UnexpectedInput): | class UnexpectedToken(ParseError, UnexpectedInput): | ||||
"""When the parser throws UnexpectedToken, it instantiates a puppet | """When the parser throws UnexpectedToken, it instantiates a puppet | ||||
@@ -143,7 +166,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): | |||||
see: :ref:`ParserPuppet`. | see: :ref:`ParserPuppet`. | ||||
""" | """ | ||||
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): | |||||
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, all_terminals=None): | |||||
self.line = getattr(token, 'line', '?') | self.line = getattr(token, 'line', '?') | ||||
self.column = getattr(token, 'column', '?') | self.column = getattr(token, 'column', '?') | ||||
self.pos_in_stream = getattr(token, 'pos_in_stream', None) | self.pos_in_stream = getattr(token, 'pos_in_stream', None) | ||||
@@ -153,16 +176,20 @@ class UnexpectedToken(ParseError, UnexpectedInput): | |||||
self.expected = expected # XXX deprecate? `accepts` is better | self.expected = expected # XXX deprecate? `accepts` is better | ||||
self.considered_rules = considered_rules | self.considered_rules = considered_rules | ||||
self.puppet = puppet | self.puppet = puppet | ||||
# TODO Only calculate `accepts()` when we need to display it to the user | |||||
# This will improve performance when doing automatic error handling | |||||
self.accepts = puppet and puppet.accepts() | |||||
message = ("Unexpected token %r at line %s, column %s.\n" | |||||
"Expected one of: \n\t* %s\n" | |||||
% (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) | |||||
super(UnexpectedToken, self).__init__(message) | |||||
self._all_terminals = all_terminals | |||||
super(UnexpectedToken, self).__init__() | |||||
@property
def accepts(self):
    """Return what ``self.puppet.accepts()`` reports, computed on access.

    When ``self.puppet`` is falsy (for example ``None``) that falsy value is
    returned unchanged and ``accepts()`` is not called.
    """
    puppet = self.puppet
    if not puppet:
        return puppet
    return puppet.accepts()
def __str__(self):
    """Build the "Unexpected token" message on demand.

    Lists ``self.accepts`` when it is truthy, otherwise ``self.expected``,
    formatted through ``self._format_terminals``.
    """
    # Keep this trivial: an exception whose __str__ raises is very hard to debug.
    token, line, column = self.token, self.line, self.column
    choices = self.accepts or self.expected
    expected_text = self._format_terminals(choices)
    return "Unexpected token %r at line %s, column %s.\n%s" % (token, line, column, expected_text)
class VisitError(LarkError): | class VisitError(LarkError): | ||||
@@ -402,7 +402,7 @@ class Lark(Serialize): | |||||
self._callbacks, | self._callbacks, | ||||
self.options, # Not all, but multiple attributes are used | self.options, # Not all, but multiple attributes are used | ||||
) | ) | ||||
self.terminals = self.parser.lexer_conf.tokens | |||||
self.terminals = self.parser.lexer_conf.terminals | |||||
self._terminals_dict = {t.name: t for t in self.terminals} | self._terminals_dict = {t.name: t for t in self.terminals} | ||||
return self | return self | ||||
@@ -76,14 +76,15 @@ class PatternRE(Pattern): | |||||
class TerminalDef(Serialize): | class TerminalDef(Serialize): | ||||
__serialize_fields__ = 'name', 'pattern', 'priority' | |||||
__serialize_fields__ = 'name', 'pattern', 'priority', 'nice_print' | |||||
__serialize_namespace__ = PatternStr, PatternRE | __serialize_namespace__ = PatternStr, PatternRE | ||||
def __init__(self, name, pattern, priority=1): | |||||
def __init__(self, name, pattern, priority=1, nice_print=None): | |||||
assert isinstance(pattern, Pattern), pattern | assert isinstance(pattern, Pattern), pattern | ||||
self.name = name | self.name = name | ||||
self.pattern = pattern | self.pattern = pattern | ||||
self.priority = priority | self.priority = priority | ||||
self.nice_print = nice_print or name | |||||
def __repr__(self): | def __repr__(self): | ||||
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) | return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) | ||||
@@ -278,7 +279,7 @@ class Lexer(object): | |||||
class TraditionalLexer(Lexer): | class TraditionalLexer(Lexer): | ||||
def __init__(self, conf): | def __init__(self, conf): | ||||
terminals = list(conf.tokens) | |||||
terminals = list(conf.terminals) | |||||
assert all(isinstance(t, TerminalDef) for t in terminals), terminals | assert all(isinstance(t, TerminalDef) for t in terminals), terminals | ||||
self.re = conf.re_module | self.re = conf.re_module | ||||
@@ -347,7 +348,8 @@ class TraditionalLexer(Lexer): | |||||
if not allowed: | if not allowed: | ||||
allowed = {"<END-OF-FILE>"} | allowed = {"<END-OF-FILE>"} | ||||
raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, | raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, | ||||
allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token]) | |||||
allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], | |||||
_all_terminals=self.terminals) | |||||
value, type_ = res | value, type_ = res | ||||
@@ -386,14 +388,14 @@ class LexerState: | |||||
class ContextualLexer(Lexer): | class ContextualLexer(Lexer): | ||||
def __init__(self, conf, states, always_accept=()): | def __init__(self, conf, states, always_accept=()): | ||||
terminals = list(conf.tokens) | |||||
terminals = list(conf.terminals) | |||||
tokens_by_name = {} | tokens_by_name = {} | ||||
for t in terminals: | for t in terminals: | ||||
assert t.name not in tokens_by_name, t | assert t.name not in tokens_by_name, t | ||||
tokens_by_name[t.name] = t | tokens_by_name[t.name] = t | ||||
trad_conf = copy(conf) | trad_conf = copy(conf) | ||||
trad_conf.tokens = terminals | |||||
trad_conf.terminals = terminals | |||||
lexer_by_tokens = {} | lexer_by_tokens = {} | ||||
self.lexers = {} | self.lexers = {} | ||||
@@ -405,13 +407,13 @@ class ContextualLexer(Lexer): | |||||
accepts = set(accepts) | set(conf.ignore) | set(always_accept) | accepts = set(accepts) | set(conf.ignore) | set(always_accept) | ||||
state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] | state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] | ||||
lexer_conf = copy(trad_conf) | lexer_conf = copy(trad_conf) | ||||
lexer_conf.tokens = state_tokens | |||||
lexer_conf.terminals = state_tokens | |||||
lexer = TraditionalLexer(lexer_conf) | lexer = TraditionalLexer(lexer_conf) | ||||
lexer_by_tokens[key] = lexer | lexer_by_tokens[key] = lexer | ||||
self.lexers[state] = lexer | self.lexers[state] = lexer | ||||
assert trad_conf.tokens is terminals | |||||
assert trad_conf.terminals is terminals | |||||
self.root_lexer = TraditionalLexer(trad_conf) | self.root_lexer = TraditionalLexer(trad_conf) | ||||
def make_lexer_state(self, text): | def make_lexer_state(self, text): | ||||
@@ -428,7 +430,7 @@ class ContextualLexer(Lexer): | |||||
# In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. | # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. | ||||
# This tests the input against the global context, to provide a nicer error. | # This tests the input against the global context, to provide a nicer error. | ||||
token = self.root_lexer.next_token(lexer_state) | token = self.root_lexer.next_token(lexer_state) | ||||
raise UnexpectedToken(token, e.allowed, state=parser_state.position) | |||||
raise UnexpectedToken(token, e.allowed, state=parser_state.position, all_terminals=self.root_lexer.terminals) | |||||
class LexerThread: | class LexerThread: | ||||
"A thread that ties a lexer instance and a lexer state, to be used by the parser" | "A thread that ties a lexer instance and a lexer state, to be used by the parser" | ||||
@@ -5,7 +5,7 @@ import sys | |||||
from copy import copy, deepcopy | from copy import copy, deepcopy | ||||
from io import open | from io import open | ||||
from .utils import bfs, eval_escaping, Py36, logger, classify_bool | |||||
from .utils import bfs, eval_escaping, Py36, logger, classify_bool, isascii | |||||
from .lexer import Token, TerminalDef, PatternStr, PatternRE | from .lexer import Token, TerminalDef, PatternStr, PatternRE | ||||
from .parse_tree_builder import ParseTreeBuilder | from .parse_tree_builder import ParseTreeBuilder | ||||
@@ -317,8 +317,11 @@ class PrepareAnonTerminals(Transformer_InPlace): | |||||
raise GrammarError(u'Conflicting flags for the same terminal: %s' % p) | raise GrammarError(u'Conflicting flags for the same terminal: %s' % p) | ||||
term_name = None | term_name = None | ||||
nice_print = None | |||||
if isinstance(p, PatternStr): | if isinstance(p, PatternStr): | ||||
nice_print = repr(value) # This will always be ok, independent of what term_name we end up using | |||||
# TODO: potentially try to get the actual source code, and not the repr | |||||
try: | try: | ||||
# If already defined, use the user-defined terminal name | # If already defined, use the user-defined terminal name | ||||
term_name = self.term_reverse[p].name | term_name = self.term_reverse[p].name | ||||
@@ -327,15 +330,14 @@ class PrepareAnonTerminals(Transformer_InPlace): | |||||
try: | try: | ||||
term_name = _TERMINAL_NAMES[value] | term_name = _TERMINAL_NAMES[value] | ||||
except KeyError: | except KeyError: | ||||
if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set: | |||||
with suppress(UnicodeEncodeError): | |||||
value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names | |||||
term_name = value.upper() | |||||
if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set and isascii(value): | |||||
term_name = value.upper() | |||||
if term_name in self.term_set: | if term_name in self.term_set: | ||||
term_name = None | term_name = None | ||||
elif isinstance(p, PatternRE): | elif isinstance(p, PatternRE): | ||||
#TODO: generate nice_print | |||||
if p in self.term_reverse: # Kind of a weird placement.name | if p in self.term_reverse: # Kind of a weird placement.name | ||||
term_name = self.term_reverse[p].name | term_name = self.term_reverse[p].name | ||||
else: | else: | ||||
@@ -348,7 +350,7 @@ class PrepareAnonTerminals(Transformer_InPlace): | |||||
if term_name not in self.term_set: | if term_name not in self.term_set: | ||||
assert p not in self.term_reverse | assert p not in self.term_reverse | ||||
self.term_set.add(term_name) | self.term_set.add(term_name) | ||||
termdef = TerminalDef(term_name, p) | |||||
termdef = TerminalDef(term_name, p, nice_print=nice_print) | |||||
self.term_reverse[p] = termdef | self.term_reverse[p] = termdef | ||||
self.terminals.append(termdef) | self.terminals.append(termdef) | ||||
@@ -6,6 +6,7 @@ from .parsers.lalr_parser import LALR_Parser | |||||
from .grammar import Rule | from .grammar import Rule | ||||
from .tree import Tree | from .tree import Tree | ||||
from .common import LexerConf | from .common import LexerConf | ||||
from .exceptions import UnexpectedInput | |||||
try: | try: | ||||
import regex | import regex | ||||
except ImportError: | except ImportError: | ||||
@@ -135,7 +136,12 @@ class WithLexer(_ParserFrontend): | |||||
return LexerThread(lexer, text) | return LexerThread(lexer, text) | ||||
def parse(self, text, start=None):
    """Parse ``text`` by running ``self._parse`` on a lexer built for it.

    If an ``UnexpectedInput`` error escapes and does not yet carry a
    terminal list (``_all_terminals`` is ``None``), it is given
    ``self.lexer_conf.terminals`` so its message can be rendered with them.
    The error is then re-raised.
    """
    try:
        return self._parse(start, self.make_lexer(text))
    except UnexpectedInput as e:
        if e._all_terminals is None:
            e._all_terminals = self.lexer_conf.terminals
        # A bare ``raise`` re-raises the exception being handled without
        # rebinding it, so the original traceback is kept intact.
        raise
def init_traditional_lexer(self): | def init_traditional_lexer(self): | ||||
self.lexer = TraditionalLexer(self.lexer_conf) | self.lexer = TraditionalLexer(self.lexer_conf) | ||||
@@ -190,7 +196,7 @@ class Earley(WithLexer): | |||||
class XEarley(_ParserFrontend): | class XEarley(_ParserFrontend): | ||||
def __init__(self, lexer_conf, parser_conf, options=None, **kw): | def __init__(self, lexer_conf, parser_conf, options=None, **kw): | ||||
self.token_by_name = {t.name:t for t in lexer_conf.tokens} | |||||
self.token_by_name = {t.name:t for t in lexer_conf.terminals} | |||||
self.start = parser_conf.start | self.start = parser_conf.start | ||||
self._prepare_match(lexer_conf) | self._prepare_match(lexer_conf) | ||||
@@ -211,7 +217,7 @@ class XEarley(_ParserFrontend): | |||||
def _prepare_match(self, lexer_conf): | def _prepare_match(self, lexer_conf): | ||||
self.regexps = {} | self.regexps = {} | ||||
for t in lexer_conf.tokens: | |||||
for t in lexer_conf.terminals: | |||||
if t.priority != 1: | if t.priority != 1: | ||||
raise ValueError("Dynamic Earley doesn't support weights on terminals", t, t.priority) | raise ValueError("Dynamic Earley doesn't support weights on terminals", t, t.priority) | ||||
regexp = t.pattern.to_regexp() | regexp = t.pattern.to_regexp() | ||||
@@ -228,7 +234,12 @@ class XEarley(_ParserFrontend): | |||||
self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags) | self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags) | ||||
def parse(self, text, start):
    """Parse ``text`` directly with ``self._parse``.

    If an ``UnexpectedInput`` error escapes and does not yet carry a
    terminal list (``_all_terminals`` is ``None``), it is given the values
    of ``self.token_by_name`` so its message can be rendered with them.
    The error is then re-raised.
    """
    try:
        return self._parse(start, text)
    except UnexpectedInput as e:
        if e._all_terminals is None:
            e._all_terminals = self.token_by_name.values()
        # A bare ``raise`` re-raises the exception being handled without
        # rebinding it, so the original traceback is kept intact.
        raise
class XEarley_CompleteLex(XEarley): | class XEarley_CompleteLex(XEarley): | ||||
def __init__(self, *args, **kw): | def __init__(self, *args, **kw): | ||||
@@ -35,7 +35,7 @@ class LALR_Parser(object): | |||||
return self.parser.parse(*args) | return self.parser.parse(*args) | ||||
class ParseConf: | |||||
class ParseConf(object): | |||||
__slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states' | __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states' | ||||
def __init__(self, parse_table, callbacks, start): | def __init__(self, parse_table, callbacks, start): | ||||
@@ -49,7 +49,7 @@ class ParseConf: | |||||
self.start = start | self.start = start | ||||
class ParserState: | |||||
class ParserState(object): | |||||
__slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack' | __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack' | ||||
def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None): | def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None): | ||||
@@ -117,7 +117,7 @@ class ParserState: | |||||
if is_end and state_stack[-1] == end_state: | if is_end and state_stack[-1] == end_state: | ||||
return value_stack[-1] | return value_stack[-1] | ||||
class _Parser: | |||||
class _Parser(object): | |||||
def __init__(self, parse_table, callbacks, debug=False): | def __init__(self, parse_table, callbacks, debug=False): | ||||
self.parse_table = parse_table | self.parse_table = parse_table | ||||
self.callbacks = callbacks | self.callbacks = callbacks | ||||
@@ -60,7 +60,7 @@ class ParserPuppet(object): | |||||
Updated by ``feed_token()``. | Updated by ``feed_token()``. | ||||
""" | """ | ||||
return self.parser_state.parse_table.states[self.parser_state.position] | |||||
return self.parser_state.parse_conf.parse_table.states[self.parser_state.position] | |||||
def accepts(self): | def accepts(self): | ||||
accepts = set() | accepts = set() | ||||