Selaa lähdekoodia

Correction for PR

- `user_repr` is now a method
- Fix for python 2.7
- excepts -> expected
tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.2
MegaIng1 3 vuotta sitten
committed by Erez Sh
vanhempi
commit
8ec6d0f2ab
6 muutettua tiedostoa jossa 68 lisäystä ja 52 poistoa
  1. +3
    -0
      lark-stubs/lexer.pyi
  2. +5
    -0
      lark/common.py
  3. +40
    -39
      lark/exceptions.py
  4. +18
    -10
      lark/lexer.py
  5. +1
    -2
      lark/load_grammar.py
  6. +1
    -1
      lark/parsers/earley.py

+ 3
- 0
lark-stubs/lexer.pyi Näytä tiedosto

@@ -12,6 +12,7 @@ _T = TypeVar('_T')
class Pattern(ABC):
value: str
flags: Collection[str]
raw: str

def __init__(self, value: str, flags: Collection[str] = ...):
...
@@ -73,6 +74,8 @@ class TerminalDef:

def __init__(self, name: str, pattern: Pattern, priority: int = ...):
...
def user_repr(self) -> str: ...


class Token(str):


+ 5
- 0
lark/common.py Näytä tiedosto

@@ -12,6 +12,8 @@ class LexerConf(Serialize):

def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
self.terminals = terminals
self.terminals_by_names = {t.name: t for t in self.terminals}
assert len(self.terminals) == len(self.terminals_by_names)
self.ignore = ignore
self.postlex = postlex
self.callbacks = callbacks or {}
@@ -25,6 +27,9 @@ class LexerConf(Serialize):
def tokens(self):
warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning)
return self.terminals
def _deserialize(self):
self.terminals_by_names = {t.name: t for t in self.terminals}





+ 40
- 39
lark/exceptions.py Näytä tiedosto

@@ -1,5 +1,6 @@
from .utils import STRING_TYPE, logger


###{standalone


@@ -39,7 +40,7 @@ class UnexpectedInput(LarkError):
After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
"""
pos_in_stream = None
_all_terminals = None
_terminals_by_name = None

def get_context(self, text, span=40):
"""Returns a pretty string pinpointing the error in the text,
@@ -96,7 +97,7 @@ class UnexpectedInput(LarkError):
if ut.state == self.state:
if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts:
logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
(self.state, self.accepts, ut.accepts, i, j))
(self.state, self.accepts, ut.accepts, i, j))
continue
try:
if ut.token == self.token: # Try exact match first
@@ -116,71 +117,69 @@ class UnexpectedInput(LarkError):
candidate = label, False

return candidate[0]
def _format_terminals(self, names):
if self._all_terminals:
if isinstance(self._all_terminals, list):
self._all_terminals = {t.name: t for t in self._all_terminals}

def _format_expected(self, expected):
if self._terminals_by_name:
ts = []
for name in names:
try:
ts.append(self._all_terminals[name].user_repr)
except StopIteration:
# If we don't find the corresponding Terminal (which *should* never happen), don't error.
# Broken __str__ for Exception are some of the worst bugs
ts.append(name)
for ter in expected:
ts.append(self._terminals_by_name[ter].user_repr())
else:
ts = names
ts = expected
return "Expected one of: \n\t* %s\n" % '\n\t* '.join(ts)


class UnexpectedEOF(ParseError, UnexpectedInput):
def __init__(self, expected, state=None):
def __init__(self, expected, state=None, terminals_by_name=None):
self.expected = expected
self.state = state
from .lexer import Token
self.token = Token("<EOF>", "") #, line=-1, column=-1, pos_in_stream=-1)
self.token = Token("<EOF>", "") # , line=-1, column=-1, pos_in_stream=-1)
self.pos_in_stream = -1
self.line = -1
self.column = -1
self._terminals_by_name = terminals_by_name

message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
super(UnexpectedEOF, self).__init__(message)
super(UnexpectedEOF, self).__init__()

def __str__(self):
message = "Unexpected end-of-input. "
message += self._format_expected(self.expected)
return message


class UnexpectedCharacters(LexError, UnexpectedInput):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, _all_terminals=None):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
terminals_by_name=None):
# TODO considered_tokens and allowed can be figured out using state
self.line = line
self.column = column
self.pos_in_stream = lex_pos
self.state = state
self._all_terminals = _all_terminals
self._terminals_by_name = terminals_by_name

self.allowed = allowed
self.considered_tokens = considered_tokens
self.token_history = token_history

if isinstance(seq, bytes):
self._s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
self.char = seq[lex_pos:lex_pos + 1].decode("ascii", "backslashreplace")
else:
self._s = seq[lex_pos]
self.char = seq[lex_pos]
self._context = self.get_context(seq)
super(UnexpectedCharacters, self).__init__()

def __str__(self):
# Be aware: Broken __str__ for Exceptions are terrible to debug. Make sure there is as little room as possible for errors
# You will get just `UnexpectedCharacters: <str() failed>` or something like that
# If you run into this, add an `except Exception as e: print(e); raise e` or similar.
message = "No terminal defined for '%s' at line %d col %d" % (self._s, self.line, self.column)
message = "No terminal defined for '%s' at line %d col %d" % (self.char, self.line, self.column)
message += '\n\n' + self._context
if self.allowed:
message += self._format_terminals(self.allowed)
message += self._format_expected(self.allowed)
if self.token_history:
message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history)
return message

_not_set_marker = object()

class UnexpectedToken(ParseError, UnexpectedInput):
"""When the parser throws UnexpectedToken, it instantiates a puppet
with its internal state. Users can then interactively set the puppet to
@@ -188,7 +187,8 @@ class UnexpectedToken(ParseError, UnexpectedInput):

see: :ref:`ParserPuppet`.
"""
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, all_terminals=None, token_history=None):

def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, terminals_by_name=None, token_history=None):
# TODO considered_rules and expected can be figured out using state
self.line = getattr(token, 'line', '?')
self.column = getattr(token, 'column', '?')
@@ -196,23 +196,24 @@ class UnexpectedToken(ParseError, UnexpectedInput):
self.state = state

self.token = token
self.expected = expected # XXX deprecate? `accepts` is better
self.expected = expected # XXX deprecate? `accepts` is better
self._accepts = _not_set_marker
self.considered_rules = considered_rules
self.puppet = puppet
self._all_terminals = all_terminals
self._terminals_by_name = terminals_by_name
self.token_history = token_history


super(UnexpectedToken, self).__init__()
@property
def accepts(self):
return self.puppet and self.puppet.accepts()
if self._accepts is _not_set_marker:
self._accepts = self.puppet and self.puppet.accepts()
return self._accepts

def __str__(self):
# Be aware: Broken __str__ for Exceptions are terrible to debug. Make sure there is as little room as possible for errors
message = ("Unexpected token %r at line %s, column %s.\n%s"
% (self.token, self.line, self.column, self._format_terminals(self.accepts or self.expected)))
% (self.token, self.line, self.column, self._format_expected(self.accepts or self.expected)))
if self.token_history:
message += "Previous tokens: %r\n" % self.token_history

@@ -226,6 +227,7 @@ class VisitError(LarkError):
- obj: the tree node or token it was processing when the exception was raised
- orig_exc: the exception that cause it to fail
"""

def __init__(self, rule, obj, orig_exc):
self.obj = obj
self.orig_exc = orig_exc
@@ -233,5 +235,4 @@ class VisitError(LarkError):
message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
super(VisitError, self).__init__(message)


###}

+ 18
- 10
lark/lexer.py Näytä tiedosto

@@ -81,18 +81,23 @@ class PatternRE(Pattern):


class TerminalDef(Serialize):
__serialize_fields__ = 'name', 'pattern', 'priority', 'user_repr'
__serialize_fields__ = 'name', 'pattern', 'priority'
__serialize_namespace__ = PatternStr, PatternRE

def __init__(self, name, pattern, priority=1, user_repr=None):
def __init__(self, name, pattern, priority=1):
assert isinstance(pattern, Pattern), pattern
self.name = name
self.pattern = pattern
self.priority = priority
self.user_repr = user_repr or name

def __repr__(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
def user_repr(self):
if self.name.startswith('__'): # We represent a generated terminal
return self.pattern.raw or self.name
else:
return self.name


class Token(Str):
@@ -312,6 +317,7 @@ class TraditionalLexer(Lexer):
self.user_callbacks = conf.callbacks
self.g_regex_flags = conf.g_regex_flags
self.use_bytes = conf.use_bytes
self.terminals_by_names = conf.terminals_by_names

self._mres = None

@@ -355,7 +361,7 @@ class TraditionalLexer(Lexer):
allowed = {"<END-OF-FILE>"}
raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
state=parser_state, _all_terminals=self.terminals)
state=parser_state, terminals_by_name=self.terminals_by_names)

value, type_ = res

@@ -397,10 +403,7 @@ class ContextualLexer(Lexer):

def __init__(self, conf, states, always_accept=()):
terminals = list(conf.terminals)
tokens_by_name = {}
for t in terminals:
assert t.name not in tokens_by_name, t
tokens_by_name[t.name] = t
tokens_by_name = conf.terminals_by_names

trad_conf = copy(conf)
trad_conf.terminals = terminals
@@ -437,8 +440,13 @@ class ContextualLexer(Lexer):
except UnexpectedCharacters as e:
# In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
# This tests the input against the global context, to provide a nicer error.
token = self.root_lexer.next_token(lexer_state)
raise UnexpectedToken(token, e.allowed, state=parser_state.position, token_history=[lexer_state.last_token], all_terminals=self.root_lexer.terminals)
last_token = lexer_state.last_token # self.root_lexer.next_token will change this to the wrong token
try:
token = self.root_lexer.next_token(lexer_state, parser_state)
except UnexpectedCharacters:
raise e  # Don't raise the exception that the root lexer raises. It has the wrong expected set.
else:
raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_names)

class LexerThread:
"""A thread that ties a lexer instance and a lexer state, to be used by the parser"""


+ 1
- 2
lark/load_grammar.py Näytä tiedosto

@@ -322,7 +322,6 @@ class PrepareAnonTerminals(Transformer_InPlace):
raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)

term_name = None
user_repr = p.raw # This will always be ok, independent of what term_name we end up using

if isinstance(p, PatternStr):
try:
@@ -354,7 +353,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
if term_name not in self.term_set:
assert p not in self.term_reverse
self.term_set.add(term_name)
termdef = TerminalDef(term_name, p, user_repr=user_repr)
termdef = TerminalDef(term_name, p)
self.term_reverse[p] = termdef
self.terminals.append(termdef)



+ 1
- 1
lark/parsers/earley.py Näytä tiedosto

@@ -302,7 +302,7 @@ class Parser:
# this column. Find the item for the start_symbol, which is the root of the SPPF tree.
solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
if not solutions:
expected_terminals = [t.expect for t in to_scan]
expected_terminals = [t.expect.name for t in to_scan]
raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))

if self.debug:


Ladataan…
Peruuta
Tallenna