--- a/lark-stubs/lexer.pyi
+++ b/lark-stubs/lexer.pyi
@@ -12,6 +12,7 @@ _T = TypeVar('_T')
 class Pattern(ABC):
     value: str
     flags: Collection[str]
+    raw: str

     def __init__(self, value: str, flags: Collection[str] = ...):
         ...
@@ -73,6 +74,8 @@ class TerminalDef:
     def __init__(self, name: str, pattern: Pattern, priority: int = ...):
         ...

+    def user_repr(self) -> str: ...
+

 class Token(str):
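
The two stub additions mirror runtime changes made further down in lark/lexer.py: `Pattern.raw` preserves a terminal's definition exactly as written in the grammar, and `TerminalDef.user_repr()` renders a terminal for display in error messages. A minimal sketch of the intended use of the new attribute (the `describe` helper is hypothetical, not part of this PR):

    from lark.lexer import Pattern

    def describe(p: Pattern) -> str:
        # `value` is the processed pattern; `raw` is the grammar-source text,
        # quotes and slashes included, and may be None for synthesized patterns.
        return p.raw or p.value
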
--- a/lark/common.py
+++ b/lark/common.py
@@ -1,3 +1,5 @@
+from warnings import warn
+
 from .utils import Serialize
 from .lexer import TerminalDef

@@ -5,11 +7,13 @@ from .lexer import TerminalDef

 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type'
+    __serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type'
     __serialize_namespace__ = TerminalDef,

-    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
-        self.tokens = tokens  # TODO should be terminals
+    def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
+        self.terminals = terminals
+        self.terminals_by_name = {t.name: t for t in self.terminals}
+        assert len(self.terminals) == len(self.terminals_by_name)
         self.ignore = ignore
         self.postlex = postlex
         self.callbacks = callbacks or {}
@@ -17,9 +21,17 @@ class LexerConf(Serialize):
         self.re_module = re_module
         self.skip_validation = skip_validation
         self.use_bytes = use_bytes
         self.lexer_type = None

+    @property
+    def tokens(self):
+        warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning)
+        return self.terminals
+
+    def _deserialize(self):
+        self.terminals_by_name = {t.name: t for t in self.terminals}
+

 class ParserConf(Serialize):
     __serialize_fields__ = 'rules', 'start', 'parser_type'
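
The tokens-to-terminals rename stays backward compatible: the old attribute survives as a read-only property that emits a `DeprecationWarning`, and `_deserialize` rebuilds `terminals_by_name` after loading, since only `terminals` is listed in `__serialize_fields__`. A sketch of what callers observe (assuming `LexerConf` is importable from `lark.common` at this version):

    import re
    import warnings
    from lark.common import LexerConf

    conf = LexerConf([], re)                   # empty terminal list, stdlib `re` as regex module
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        tokens = conf.tokens                   # deprecated spelling still works...
    assert tokens is conf.terminals            # ...and returns the same list
    assert caught[0].category is DeprecationWarning
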
--- a/lark/exceptions.py
+++ b/lark/exceptions.py
@@ -1,4 +1,5 @@
-from .utils import STRING_TYPE, logger
+from .utils import STRING_TYPE, logger, NO_VALUE

 ###{standalone

@@ -39,6 +40,7 @@ class UnexpectedInput(LarkError):
     After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
     """
     pos_in_stream = None
+    _terminals_by_name = None

     def get_context(self, text, span=40):
         """Returns a pretty string pinpointing the error in the text,
@@ -95,7 +97,7 @@ class UnexpectedInput(LarkError):
                 if ut.state == self.state:
                     if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts:
                         logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
-                                    (self.state, self.accepts, ut.accepts, i, j))
+                                     (self.state, self.accepts, ut.accepts, i, j))
                         continue
                     try:
                         if ut.token == self.token:  # Try exact match first
@@ -116,44 +118,61 @@

         return candidate[0]

+    def _format_expected(self, expected):
+        if self._terminals_by_name:
+            expected = [self._terminals_by_name[t_name].user_repr() for t_name in expected]
+        return "Expected one of: \n\t* %s\n" % '\n\t* '.join(expected)
+

 class UnexpectedEOF(ParseError, UnexpectedInput):
-    def __init__(self, expected, state=None):
+    def __init__(self, expected, state=None, terminals_by_name=None):
         self.expected = expected
         self.state = state
         from .lexer import Token
-        self.token = Token("<EOF>", "") #, line=-1, column=-1, pos_in_stream=-1)
+        self.token = Token("<EOF>", "")  # , line=-1, column=-1, pos_in_stream=-1)
         self.pos_in_stream = -1
         self.line = -1
         self.column = -1
+        self._terminals_by_name = terminals_by_name

-        message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
-        super(UnexpectedEOF, self).__init__(message)
+        super(UnexpectedEOF, self).__init__()
+
+    def __str__(self):
+        message = "Unexpected end-of-input. "
+        message += self._format_expected(self.expected)
+        return message


 class UnexpectedCharacters(LexError, UnexpectedInput):
-    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
+    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
+                 terminals_by_name=None):
         # TODO considered_tokens and allowed can be figured out using state
         self.line = line
         self.column = column
         self.pos_in_stream = lex_pos
         self.state = state
+        self._terminals_by_name = terminals_by_name

         self.allowed = allowed
         self.considered_tokens = considered_tokens
         self.token_history = token_history

         if isinstance(seq, bytes):
-            _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
+            self.char = seq[lex_pos:lex_pos + 1].decode("ascii", "backslashreplace")
         else:
-            _s = seq[lex_pos]
+            self.char = seq[lex_pos]
+        self._context = self.get_context(seq)

-        message = "No terminal defined for %r at line %d col %d" % (_s, line, column)
-        message += '\n\n' + self.get_context(seq)
-        if allowed:
-            message += '\nExpecting: %s\n' % allowed
-        if token_history:
-            message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history)
+        super(UnexpectedCharacters, self).__init__()

-        super(UnexpectedCharacters, self).__init__(message)
+    def __str__(self):
+        message = "No terminal defined for '%s' at line %d col %d" % (self.char, self.line, self.column)
+        message += '\n\n' + self._context
+        if self.allowed:
+            message += self._format_expected(self.allowed)
+        if self.token_history:
+            message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history)
+        return message


 class UnexpectedToken(ParseError, UnexpectedInput):
@@ -163,7 +182,8 @@ class UnexpectedToken(ParseError, UnexpectedInput):

     see: :ref:`ParserPuppet`.
     """
-    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None):
+
+    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, terminals_by_name=None, token_history=None):
         # TODO considered_rules and expected can be figured out using state
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
@@ -171,23 +191,28 @@ class UnexpectedToken(ParseError, UnexpectedInput):
         self.state = state

         self.token = token
-        self.expected = expected # XXX deprecate? `accepts` is better
+        self.expected = expected  # XXX deprecate? `accepts` is better
+        self._accepts = NO_VALUE
         self.considered_rules = considered_rules
         self.puppet = puppet
+        self._terminals_by_name = terminals_by_name
         self.token_history = token_history

-        # TODO Only calculate `accepts()` when we need to display it to the user
-        # This will improve performance when doing automatic error handling
-        self.accepts = puppet and puppet.accepts()
+        super(UnexpectedToken, self).__init__()

-        message = ("Unexpected token %r at line %s, column %s.\n"
-                   "Expected one of: \n\t* %s\n"
-                   % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))
+    @property
+    def accepts(self):
+        if self._accepts is NO_VALUE:
+            self._accepts = self.puppet and self.puppet.accepts()
+        return self._accepts

+    def __str__(self):
+        message = ("Unexpected token %r at line %s, column %s.\n%s"
+                   % (self.token, self.line, self.column, self._format_expected(self.accepts or self.expected)))
         if self.token_history:
-            message += "Previous tokens: %r\n" % token_history
+            message += "Previous tokens: %r\n" % self.token_history

-        super(UnexpectedToken, self).__init__(message)
+        return message


 class VisitError(LarkError):
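
`accepts` used to be computed eagerly in the constructor, so every raised `UnexpectedToken` paid for `puppet.accepts()` even when the error was caught and handled without ever being displayed. The `NO_VALUE` sentinel (added to lark/utils.py in the last hunk) lets the property tell "never computed" apart from a legitimately falsy result such as `None`. The same pattern in isolation:

    NO_VALUE = object()            # identity-only sentinel

    class Lazy(object):
        def __init__(self, compute):
            self._compute = compute
            self._cached = NO_VALUE

        @property
        def value(self):
            if self._cached is NO_VALUE:      # runs at most once, even if the result is None/False
                self._cached = self._compute()
            return self._cached
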
@@ -197,6 +222,7 @@ class VisitError(LarkError):
     - obj: the tree node or token it was processing when the exception was raised
     - orig_exc: the exception that caused it to fail
     """
+
     def __init__(self, rule, obj, orig_exc):
         self.obj = obj
         self.orig_exc = orig_exc
@@ -204,5 +230,4 @@ class VisitError(LarkError):

         message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
         super(VisitError, self).__init__(message)
-
 ###}
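
Taken together, the exception classes now store structured data (`char`, `_context`, `allowed`, `_terminals_by_name`) and build their message only when `str()` is called. Code that catches and discards errors gets faster; code that prints them gets the improved output. A quick end-to-end sketch with a hypothetical toy grammar:

    from lark import Lark
    from lark.exceptions import UnexpectedInput

    parser = Lark('start: "a" "b"', parser='lalr')
    try:
        parser.parse('ac')
    except UnexpectedInput as e:
        print(e)                     # message is built here, in __str__, not at raise time
        print(e.get_context('ac'))   # helper from UnexpectedInput works as before
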
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -416,7 +416,7 @@ class Lark(Serialize):
             self._callbacks,
             self.options,  # Not all, but multiple attributes are used
         )
-        self.terminals = self.parser.lexer_conf.tokens
+        self.terminals = self.parser.lexer_conf.terminals
         self._terminals_dict = {t.name: t for t in self.terminals}
         return self
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -11,9 +11,10 @@ from copy import copy

 class Pattern(Serialize):

-    def __init__(self, value, flags=()):
+    def __init__(self, value, flags=(), raw=None):
         self.value = value
         self.flags = frozenset(flags)
+        self.raw = raw

     def __repr__(self):
         return repr(self.to_regexp())
@@ -92,6 +93,12 @@ class TerminalDef(Serialize):
     def __repr__(self):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)

+    def user_repr(self):
+        if self.name.startswith('__'):  # We represent a generated terminal
+            return self.pattern.raw or self.name
+        else:
+            return self.name
+

 class Token(Str):
     """A string with meta-information, that is produced by the lexer.
@@ -283,7 +290,7 @@ class Lexer(object):
 class TraditionalLexer(Lexer):

     def __init__(self, conf):
-        terminals = list(conf.tokens)
+        terminals = list(conf.terminals)
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals

         self.re = conf.re_module
@@ -310,6 +317,7 @@ class TraditionalLexer(Lexer):
         self.user_callbacks = conf.callbacks
         self.g_regex_flags = conf.g_regex_flags
         self.use_bytes = conf.use_bytes
+        self.terminals_by_name = conf.terminals_by_name

         self._mres = None
@@ -353,7 +361,7 @@
                     allowed = {"<END-OF-FILE>"}
                 raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                            allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
-                                           state=parser_state)
+                                           state=parser_state, terminals_by_name=self.terminals_by_name)

             value, type_ = res
@@ -394,14 +402,11 @@ class LexerState:
 class ContextualLexer(Lexer):

     def __init__(self, conf, states, always_accept=()):
-        terminals = list(conf.tokens)
-        tokens_by_name = {}
-        for t in terminals:
-            assert t.name not in tokens_by_name, t
-            tokens_by_name[t.name] = t
+        terminals = list(conf.terminals)
+        terminals_by_name = conf.terminals_by_name

         trad_conf = copy(conf)
-        trad_conf.tokens = terminals
+        trad_conf.terminals = terminals

         lexer_by_tokens = {}
         self.lexers = {}
@@ -411,15 +416,14 @@ class ContextualLexer(Lexer):
                 lexer = lexer_by_tokens[key]
             except KeyError:
                 accepts = set(accepts) | set(conf.ignore) | set(always_accept)
-                state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
                 lexer_conf = copy(trad_conf)
-                lexer_conf.tokens = state_tokens
+                lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
                 lexer = TraditionalLexer(lexer_conf)
                 lexer_by_tokens[key] = lexer

             self.lexers[state] = lexer

-        assert trad_conf.tokens is terminals
+        assert trad_conf.terminals is terminals
         self.root_lexer = TraditionalLexer(trad_conf)

     def make_lexer_state(self, text):
@@ -435,9 +439,12 @@
         except UnexpectedCharacters as e:
             # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
             # This tests the input against the global context, to provide a nicer error.
-            token = self.root_lexer.next_token(lexer_state, parser_state)
-            raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token])
+            try:
+                last_token = lexer_state.last_token  # Save last_token. Calling root_lexer.next_token will change this to the wrong token
+                token = self.root_lexer.next_token(lexer_state, parser_state)
+                raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_name)
+            except UnexpectedCharacters:
+                raise e  # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set.


 class LexerThread:
     """A thread that ties a lexer instance and a lexer state, to be used by the parser"""
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -454,9 +454,9 @@ def _literal_to_pattern(literal):

     if literal.type == 'STRING':
         s = s.replace('\\\\', '\\')
-        return PatternStr(s, flags)
+        return PatternStr(s, flags, raw=literal.value)
     elif literal.type == 'REGEXP':
-        return PatternRE(s, flags)
+        return PatternRE(s, flags, raw=literal.value)
     else:
         assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'
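
`literal.value` is the literal exactly as it appears in the grammar file, quotes or slashes and flags included, which is precisely what `Pattern.raw` exists to preserve. A sketch of what ends up on the resulting patterns (grammar is hypothetical):

    from lark import Lark

    parser = Lark(r'start: "while" /[a-z]+/', parser='lalr')
    for t in parser.terminals:
        print(t.name, '->', repr(t.pattern.value), repr(t.pattern.raw))
    # the "while" terminal keeps raw == '"while"'; the regexp keeps raw == '/[a-z]+/'
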
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -170,7 +170,7 @@ CYK_FrontEnd = NotImplemented
 class EarleyRegexpMatcher:
     def __init__(self, lexer_conf):
         self.regexps = {}
-        for t in lexer_conf.tokens:
+        for t in lexer_conf.terminals:
             if t.priority != 1:
                 raise GrammarError("Dynamic Earley doesn't support weights on terminals", t, t.priority)
             regexp = t.pattern.to_regexp()
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -302,7 +302,7 @@ class Parser:
         # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
         solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
         if not solutions:
-            expected_terminals = [t.expect for t in to_scan]
+            expected_terminals = [t.expect.name for t in to_scan]
             raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))

         if self.debug:
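
The Earley change hands `UnexpectedEOF` terminal names instead of `Terminal` symbol objects: `_format_expected` (exceptions hunks above) uses the list elements as keys into `terminals_by_name`, so they must be plain strings. The shape of the data, sketched:

    from lark.grammar import Terminal

    to_scan_expects = [Terminal('NUMBER'), Terminal('PLUS')]   # what `t.expect` holds
    expected_terminals = [t.name for t in to_scan_expects]     # ['NUMBER', 'PLUS']
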
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -36,7 +36,7 @@ class LALR_Parser(Serialize):
         return self.parser.parse(*args)


-class ParseConf:
+class ParseConf(object):
     __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'

     def __init__(self, parse_table, callbacks, start):
@@ -50,7 +50,7 @@ class ParseConf:
         self.start = start


-class ParserState:
+class ParserState(object):
     __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'

     def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):
@@ -124,7 +124,7 @@ class ParserState:
             if is_end and state_stack[-1] == end_state:
                 return value_stack[-1]

-class _Parser:
+class _Parser(object):
     def __init__(self, parse_table, callbacks, debug=False):
         self.parse_table = parse_table
         self.callbacks = callbacks
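
The explicit `object` bases only matter on Python 2, which lark still supported at this point: `__slots__` is silently ignored on old-style classes, so these hot parser internals were allocating a per-instance `__dict__` anyway. On Python 3 every class is new-style and the change is a no-op. An illustration:

    class Old:                 # old-style on Python 2: __slots__ is ignored
        __slots__ = ('x',)

    class New(object):         # new-style: instances really have no __dict__
        __slots__ = ('x',)

    n = New()
    n.x = 1
    # On Python 2: hasattr(Old(), '__dict__') is True, hasattr(n, '__dict__') is False.
    # On Python 3, both classes behave like New.
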
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -13,6 +13,7 @@ logger.setLevel(logging.CRITICAL)

 Py36 = (sys.version_info[:2] >= (3, 6))

+NO_VALUE = object()

 def classify(seq, key=None, value=None):
     d = {}
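
`NO_VALUE` is a module-level sentinel: a fresh `object()` is unique by identity, so an `is NO_VALUE` check reliably means "never assigned" even where `None`, `0`, or `False` are legitimate values. That is why `UnexpectedToken._accepts` cannot use `None` as its marker: a puppet-less exception legitimately caches `None` as its result. The idiom, sketched:

    NO_VALUE = object()

    def first_set(*candidates):
        # return the first candidate that was explicitly provided;
        # falsy values like None or 0 still count as "provided"
        for c in candidates:
            if c is not NO_VALUE:
                return c
        raise LookupError('no value was set')

    assert first_set(NO_VALUE, None, 5) is None
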