@@ -12,6 +12,7 @@ _T = TypeVar('_T') | |||
class Pattern(ABC): | |||
value: str | |||
flags: Collection[str] | |||
raw: str | |||
def __init__(self, value: str, flags: Collection[str] = ...): | |||
... | |||
@@ -73,6 +74,8 @@ class TerminalDef: | |||
def __init__(self, name: str, pattern: Pattern, priority: int = ...): | |||
... | |||
def user_repr(self) -> str: ... | |||
class Token(str): | |||
@@ -1,3 +1,5 @@ | |||
from warnings import warn | |||
from .utils import Serialize | |||
from .lexer import TerminalDef | |||
@@ -5,11 +7,13 @@ from .lexer import TerminalDef | |||
class LexerConf(Serialize): | |||
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type' | |||
__serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type' | |||
__serialize_namespace__ = TerminalDef, | |||
def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): | |||
self.tokens = tokens # TODO should be terminals | |||
def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): | |||
self.terminals = terminals | |||
self.terminals_by_name = {t.name: t for t in self.terminals} | |||
assert len(self.terminals) == len(self.terminals_by_name) | |||
self.ignore = ignore | |||
self.postlex = postlex | |||
self.callbacks = callbacks or {} | |||
@@ -17,9 +21,17 @@ class LexerConf(Serialize): | |||
self.re_module = re_module | |||
self.skip_validation = skip_validation | |||
self.use_bytes = use_bytes | |||
self.lexer_type = None | |||
@property
def tokens(self):
    """Deprecated alias for ``terminals``.

    Emits a ``DeprecationWarning`` on every access and returns ``self.terminals``.
    """
    # stacklevel=2 attributes the warning to the code that reads `.tokens`
    # rather than to this property, so the deprecated call site can be located.
    warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning, stacklevel=2)
    return self.terminals
def _deserialize(self):
    """Rebuild the name -> terminal lookup table from ``self.terminals``."""
    by_name = {}
    for term in self.terminals:
        by_name[term.name] = term
    self.terminals_by_name = by_name
class ParserConf(Serialize): | |||
__serialize_fields__ = 'rules', 'start', 'parser_type' | |||
@@ -1,4 +1,5 @@ | |||
from .utils import STRING_TYPE, logger | |||
from .utils import STRING_TYPE, logger, NO_VALUE | |||
###{standalone | |||
@@ -39,6 +40,7 @@ class UnexpectedInput(LarkError): | |||
After catching one of these exceptions, you may call the following helper methods to create a nicer error message. | |||
""" | |||
pos_in_stream = None | |||
_terminals_by_name = None | |||
def get_context(self, text, span=40): | |||
"""Returns a pretty string pinpointing the error in the text, | |||
@@ -95,7 +97,7 @@ class UnexpectedInput(LarkError): | |||
if ut.state == self.state: | |||
if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts: | |||
logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % | |||
(self.state, self.accepts, ut.accepts, i, j)) | |||
(self.state, self.accepts, ut.accepts, i, j)) | |||
continue | |||
try: | |||
if ut.token == self.token: # Try exact match first | |||
@@ -116,44 +118,61 @@ class UnexpectedInput(LarkError): | |||
return candidate[0] | |||
def _format_expected(self, expected):
    """Render the "Expected one of" section of an error message.

    ``expected`` is an iterable of terminal names. When ``self._terminals_by_name``
    is set, every name found in that mapping is replaced by the terminal's
    ``user_repr()``. Names that are not in the mapping (for example a marker
    such as ``"<END-OF-FILE>"``) are shown unchanged instead of raising
    ``KeyError`` while the message is being built.
    """
    by_name = self._terminals_by_name
    if by_name:
        expected = [by_name[t_name].user_repr() if t_name in by_name else t_name
                    for t_name in expected]
    return "Expected one of: \n\t* %s\n" % '\n\t* '.join(expected)
class UnexpectedEOF(ParseError, UnexpectedInput): | |||
def __init__(self, expected, state=None): | |||
def __init__(self, expected, state=None, terminals_by_name=None): | |||
self.expected = expected | |||
self.state = state | |||
from .lexer import Token | |||
self.token = Token("<EOF>", "") #, line=-1, column=-1, pos_in_stream=-1) | |||
self.token = Token("<EOF>", "") # , line=-1, column=-1, pos_in_stream=-1) | |||
self.pos_in_stream = -1 | |||
self.line = -1 | |||
self.column = -1 | |||
self._terminals_by_name = terminals_by_name | |||
message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected)) | |||
super(UnexpectedEOF, self).__init__(message) | |||
super(UnexpectedEOF, self).__init__() | |||
def __str__(self):
    """Return the end-of-input message followed by the formatted expected terminals."""
    return "Unexpected end-of-input. " + self._format_expected(self.expected)
class UnexpectedCharacters(LexError, UnexpectedInput): | |||
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): | |||
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, | |||
terminals_by_name=None): | |||
# TODO considered_tokens and allowed can be figured out using state | |||
self.line = line | |||
self.column = column | |||
self.pos_in_stream = lex_pos | |||
self.state = state | |||
self._terminals_by_name = terminals_by_name | |||
self.allowed = allowed | |||
self.considered_tokens = considered_tokens | |||
self.token_history = token_history | |||
if isinstance(seq, bytes): | |||
_s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") | |||
self.char = seq[lex_pos:lex_pos + 1].decode("ascii", "backslashreplace") | |||
else: | |||
_s = seq[lex_pos] | |||
self.char = seq[lex_pos] | |||
self._context = self.get_context(seq) | |||
message = "No terminal defined for %r at line %d col %d" % (_s, line, column) | |||
message += '\n\n' + self.get_context(seq) | |||
if allowed: | |||
message += '\nExpecting: %s\n' % allowed | |||
if token_history: | |||
message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history) | |||
super(UnexpectedCharacters, self).__init__() | |||
super(UnexpectedCharacters, self).__init__(message) | |||
def __str__(self):
    """Build the error text: offending character and position, the saved
    context, then the optional expected-terminals and previous-tokens sections.
    """
    parts = [
        "No terminal defined for '%s' at line %d col %d" % (self.char, self.line, self.column),
        '\n\n',
        self._context,
    ]
    if self.allowed:
        parts.append(self._format_expected(self.allowed))
    if self.token_history:
        history = ', '.join(repr(t) for t in self.token_history)
        parts.append('\nPrevious tokens: %s\n' % history)
    return ''.join(parts)
class UnexpectedToken(ParseError, UnexpectedInput): | |||
@@ -163,7 +182,8 @@ class UnexpectedToken(ParseError, UnexpectedInput): | |||
see: :ref:`ParserPuppet`. | |||
""" | |||
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None): | |||
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, terminals_by_name=None, token_history=None): | |||
# TODO considered_rules and expected can be figured out using state | |||
self.line = getattr(token, 'line', '?') | |||
self.column = getattr(token, 'column', '?') | |||
@@ -171,23 +191,28 @@ class UnexpectedToken(ParseError, UnexpectedInput): | |||
self.state = state | |||
self.token = token | |||
self.expected = expected # XXX deprecate? `accepts` is better | |||
self.expected = expected # XXX deprecate? `accepts` is better | |||
self._accepts = NO_VALUE | |||
self.considered_rules = considered_rules | |||
self.puppet = puppet | |||
self._terminals_by_name = terminals_by_name | |||
self.token_history = token_history | |||
# TODO Only calculate `accepts()` when we need to display it to the user | |||
# This will improve performance when doing automatic error handling | |||
self.accepts = puppet and puppet.accepts() | |||
super(UnexpectedToken, self).__init__() | |||
message = ("Unexpected token %r at line %s, column %s.\n" | |||
"Expected one of: \n\t* %s\n" | |||
% (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) | |||
@property
def accepts(self):
    """Return ``self.puppet and self.puppet.accepts()``, computed on first access.

    The result is stored in ``self._accepts``; ``NO_VALUE`` marks "not computed yet".
    """
    cached = self._accepts
    if cached is NO_VALUE:
        cached = self._accepts = self.puppet and self.puppet.accepts()
    return cached
def __str__(self): | |||
message = ("Unexpected token %r at line %s, column %s.\n%s" | |||
% (self.token, self.line, self.column, self._format_expected(self.accepts or self.expected))) | |||
if self.token_history: | |||
message += "Previous tokens: %r\n" % token_history | |||
message += "Previous tokens: %r\n" % self.token_history | |||
super(UnexpectedToken, self).__init__(message) | |||
return message | |||
class VisitError(LarkError): | |||
@@ -197,6 +222,7 @@ class VisitError(LarkError): | |||
- obj: the tree node or token it was processing when the exception was raised | |||
- orig_exc: the exception that cause it to fail | |||
""" | |||
def __init__(self, rule, obj, orig_exc): | |||
self.obj = obj | |||
self.orig_exc = orig_exc | |||
@@ -204,5 +230,4 @@ class VisitError(LarkError): | |||
message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) | |||
super(VisitError, self).__init__(message) | |||
###} |
@@ -416,7 +416,7 @@ class Lark(Serialize): | |||
self._callbacks, | |||
self.options, # Not all, but multiple attributes are used | |||
) | |||
self.terminals = self.parser.lexer_conf.tokens | |||
self.terminals = self.parser.lexer_conf.terminals | |||
self._terminals_dict = {t.name: t for t in self.terminals} | |||
return self | |||
@@ -11,9 +11,10 @@ from copy import copy | |||
class Pattern(Serialize): | |||
def __init__(self, value, flags=()): | |||
def __init__(self, value, flags=(), raw=None): | |||
self.value = value | |||
self.flags = frozenset(flags) | |||
self.raw = raw | |||
def __repr__(self): | |||
return repr(self.to_regexp()) | |||
@@ -92,6 +93,12 @@ class TerminalDef(Serialize): | |||
def __repr__(self): | |||
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) | |||
def user_repr(self):
    """Return the text used to show this terminal in user-facing messages.

    Names starting with ``__`` (generated terminals) are shown as the pattern's
    ``raw`` text when it is truthy; every other case yields ``self.name``.
    """
    if not self.name.startswith('__'):
        return self.name
    # We represent a generated terminal
    return self.pattern.raw or self.name
class Token(Str): | |||
"""A string with meta-information, that is produced by the lexer. | |||
@@ -283,7 +290,7 @@ class Lexer(object): | |||
class TraditionalLexer(Lexer): | |||
def __init__(self, conf): | |||
terminals = list(conf.tokens) | |||
terminals = list(conf.terminals) | |||
assert all(isinstance(t, TerminalDef) for t in terminals), terminals | |||
self.re = conf.re_module | |||
@@ -310,6 +317,7 @@ class TraditionalLexer(Lexer): | |||
self.user_callbacks = conf.callbacks | |||
self.g_regex_flags = conf.g_regex_flags | |||
self.use_bytes = conf.use_bytes | |||
self.terminals_by_name = conf.terminals_by_name | |||
self._mres = None | |||
@@ -353,7 +361,7 @@ class TraditionalLexer(Lexer): | |||
allowed = {"<END-OF-FILE>"} | |||
raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, | |||
allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], | |||
state=parser_state) | |||
state=parser_state, terminals_by_name=self.terminals_by_name) | |||
value, type_ = res | |||
@@ -394,14 +402,11 @@ class LexerState: | |||
class ContextualLexer(Lexer): | |||
def __init__(self, conf, states, always_accept=()): | |||
terminals = list(conf.tokens) | |||
tokens_by_name = {} | |||
for t in terminals: | |||
assert t.name not in tokens_by_name, t | |||
tokens_by_name[t.name] = t | |||
terminals = list(conf.terminals) | |||
terminals_by_name = conf.terminals_by_name | |||
trad_conf = copy(conf) | |||
trad_conf.tokens = terminals | |||
trad_conf.terminals = terminals | |||
lexer_by_tokens = {} | |||
self.lexers = {} | |||
@@ -411,15 +416,14 @@ class ContextualLexer(Lexer): | |||
lexer = lexer_by_tokens[key] | |||
except KeyError: | |||
accepts = set(accepts) | set(conf.ignore) | set(always_accept) | |||
state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] | |||
lexer_conf = copy(trad_conf) | |||
lexer_conf.tokens = state_tokens | |||
lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name] | |||
lexer = TraditionalLexer(lexer_conf) | |||
lexer_by_tokens[key] = lexer | |||
self.lexers[state] = lexer | |||
assert trad_conf.tokens is terminals | |||
assert trad_conf.terminals is terminals | |||
self.root_lexer = TraditionalLexer(trad_conf) | |||
def make_lexer_state(self, text): | |||
@@ -435,9 +439,12 @@ class ContextualLexer(Lexer): | |||
except UnexpectedCharacters as e: | |||
# In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. | |||
# This tests the input against the global context, to provide a nicer error. | |||
token = self.root_lexer.next_token(lexer_state, parser_state) | |||
raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token]) | |||
try: | |||
last_token = lexer_state.last_token # Save last_token. Calling root_lexer.next_token will change this to the wrong token | |||
token = self.root_lexer.next_token(lexer_state, parser_state) | |||
raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_name) | |||
except UnexpectedCharacters: | |||
raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set. | |||
class LexerThread: | |||
"""A thread that ties a lexer instance and a lexer state, to be used by the parser""" | |||
@@ -454,9 +454,9 @@ def _literal_to_pattern(literal): | |||
if literal.type == 'STRING': | |||
s = s.replace('\\\\', '\\') | |||
return PatternStr(s, flags) | |||
return PatternStr(s, flags, raw=literal.value) | |||
elif literal.type == 'REGEXP': | |||
return PatternRE(s, flags) | |||
return PatternRE(s, flags, raw=literal.value) | |||
else: | |||
assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' | |||
@@ -170,7 +170,7 @@ CYK_FrontEnd = NotImplemented | |||
class EarleyRegexpMatcher: | |||
def __init__(self, lexer_conf): | |||
self.regexps = {} | |||
for t in lexer_conf.tokens: | |||
for t in lexer_conf.terminals: | |||
if t.priority != 1: | |||
raise GrammarError("Dynamic Earley doesn't support weights on terminals", t, t.priority) | |||
regexp = t.pattern.to_regexp() | |||
@@ -302,7 +302,7 @@ class Parser: | |||
# this column. Find the item for the start_symbol, which is the root of the SPPF tree. | |||
solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] | |||
if not solutions: | |||
expected_terminals = [t.expect for t in to_scan] | |||
expected_terminals = [t.expect.name for t in to_scan] | |||
raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan)) | |||
if self.debug: | |||
@@ -36,7 +36,7 @@ class LALR_Parser(Serialize): | |||
return self.parser.parse(*args) | |||
class ParseConf: | |||
class ParseConf(object): | |||
__slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states' | |||
def __init__(self, parse_table, callbacks, start): | |||
@@ -50,7 +50,7 @@ class ParseConf: | |||
self.start = start | |||
class ParserState: | |||
class ParserState(object): | |||
__slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack' | |||
def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None): | |||
@@ -124,7 +124,7 @@ class ParserState: | |||
if is_end and state_stack[-1] == end_state: | |||
return value_stack[-1] | |||
class _Parser: | |||
class _Parser(object): | |||
def __init__(self, parse_table, callbacks, debug=False): | |||
self.parse_table = parse_table | |||
self.callbacks = callbacks | |||
@@ -13,6 +13,7 @@ logger.setLevel(logging.CRITICAL) | |||
Py36 = (sys.version_info[:2] >= (3, 6)) | |||
NO_VALUE = object() | |||
def classify(seq, key=None, value=None): | |||
d = {} | |||