浏览代码

Merge branch 'MegaIng-better-terminals'

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.2
Erez Sh 3 年前
父节点
当前提交
36a7b050c1
共有 10 个文件被更改,包括 102 次插入和 54 次删除
  1. +3
    -0
      lark-stubs/lexer.pyi
  2. +16
    -4
      lark/common.py
  3. +52
    -27
      lark/exceptions.py
  4. +1
    -1
      lark/lark.py
  5. +22
    -15
      lark/lexer.py
  6. +2
    -2
      lark/load_grammar.py
  7. +1
    -1
      lark/parser_frontends.py
  8. +1
    -1
      lark/parsers/earley.py
  9. +3
    -3
      lark/parsers/lalr_parser.py
  10. +1
    -0
      lark/utils.py

+ 3
- 0
lark-stubs/lexer.pyi 查看文件

@@ -12,6 +12,7 @@ _T = TypeVar('_T')
class Pattern(ABC):
value: str
flags: Collection[str]
raw: str

def __init__(self, value: str, flags: Collection[str] = ...):
...
@@ -73,6 +74,8 @@ class TerminalDef:

def __init__(self, name: str, pattern: Pattern, priority: int = ...):
...
def user_repr(self) -> str: ...


class Token(str):


+ 16
- 4
lark/common.py 查看文件

@@ -1,3 +1,5 @@
from warnings import warn

from .utils import Serialize
from .lexer import TerminalDef

@@ -5,11 +7,13 @@ from .lexer import TerminalDef


class LexerConf(Serialize):
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type'
__serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type'
__serialize_namespace__ = TerminalDef,

def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
self.tokens = tokens # TODO should be terminals
def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
self.terminals = terminals
self.terminals_by_name = {t.name: t for t in self.terminals}
assert len(self.terminals) == len(self.terminals_by_name)
self.ignore = ignore
self.postlex = postlex
self.callbacks = callbacks or {}
@@ -17,9 +21,17 @@ class LexerConf(Serialize):
self.re_module = re_module
self.skip_validation = skip_validation
self.use_bytes = use_bytes

self.lexer_type = None

@property
def tokens(self):
warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning)
return self.terminals

def _deserialize(self):
self.terminals_by_name = {t.name: t for t in self.terminals}



class ParserConf(Serialize):
__serialize_fields__ = 'rules', 'start', 'parser_type'


+ 52
- 27
lark/exceptions.py 查看文件

@@ -1,4 +1,5 @@
from .utils import STRING_TYPE, logger
from .utils import STRING_TYPE, logger, NO_VALUE


###{standalone

@@ -39,6 +40,7 @@ class UnexpectedInput(LarkError):
After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
"""
pos_in_stream = None
_terminals_by_name = None

def get_context(self, text, span=40):
"""Returns a pretty string pinpointing the error in the text,
@@ -95,7 +97,7 @@ class UnexpectedInput(LarkError):
if ut.state == self.state:
if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts:
logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
(self.state, self.accepts, ut.accepts, i, j))
(self.state, self.accepts, ut.accepts, i, j))
continue
try:
if ut.token == self.token: # Try exact match first
@@ -116,44 +118,61 @@ class UnexpectedInput(LarkError):

return candidate[0]

def _format_expected(self, expected):
if self._terminals_by_name:
expected = [self._terminals_by_name[t_name].user_repr() for t_name in expected]
return "Expected one of: \n\t* %s\n" % '\n\t* '.join(expected)


class UnexpectedEOF(ParseError, UnexpectedInput):
def __init__(self, expected, state=None):
def __init__(self, expected, state=None, terminals_by_name=None):
self.expected = expected
self.state = state
from .lexer import Token
self.token = Token("<EOF>", "") #, line=-1, column=-1, pos_in_stream=-1)
self.token = Token("<EOF>", "") # , line=-1, column=-1, pos_in_stream=-1)
self.pos_in_stream = -1
self.line = -1
self.column = -1
self._terminals_by_name = terminals_by_name

message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
super(UnexpectedEOF, self).__init__(message)
super(UnexpectedEOF, self).__init__()

def __str__(self):
message = "Unexpected end-of-input. "
message += self._format_expected(self.expected)
return message


class UnexpectedCharacters(LexError, UnexpectedInput):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
terminals_by_name=None):
# TODO considered_tokens and allowed can be figured out using state
self.line = line
self.column = column
self.pos_in_stream = lex_pos
self.state = state
self._terminals_by_name = terminals_by_name

self.allowed = allowed
self.considered_tokens = considered_tokens
self.token_history = token_history

if isinstance(seq, bytes):
_s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
self.char = seq[lex_pos:lex_pos + 1].decode("ascii", "backslashreplace")
else:
_s = seq[lex_pos]
self.char = seq[lex_pos]
self._context = self.get_context(seq)

message = "No terminal defined for %r at line %d col %d" % (_s, line, column)
message += '\n\n' + self.get_context(seq)
if allowed:
message += '\nExpecting: %s\n' % allowed
if token_history:
message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history)
super(UnexpectedCharacters, self).__init__()

super(UnexpectedCharacters, self).__init__(message)
def __str__(self):
message = "No terminal defined for '%s' at line %d col %d" % (self.char, self.line, self.column)
message += '\n\n' + self._context
if self.allowed:
message += self._format_expected(self.allowed)
if self.token_history:
message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history)
return message


class UnexpectedToken(ParseError, UnexpectedInput):
@@ -163,7 +182,8 @@ class UnexpectedToken(ParseError, UnexpectedInput):

see: :ref:`ParserPuppet`.
"""
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None):

def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, terminals_by_name=None, token_history=None):
# TODO considered_rules and expected can be figured out using state
self.line = getattr(token, 'line', '?')
self.column = getattr(token, 'column', '?')
@@ -171,23 +191,28 @@ class UnexpectedToken(ParseError, UnexpectedInput):
self.state = state

self.token = token
self.expected = expected # XXX deprecate? `accepts` is better
self.expected = expected # XXX deprecate? `accepts` is better
self._accepts = NO_VALUE
self.considered_rules = considered_rules
self.puppet = puppet
self._terminals_by_name = terminals_by_name
self.token_history = token_history

# TODO Only calculate `accepts()` when we need to display it to the user
# This will improve performance when doing automatic error handling
self.accepts = puppet and puppet.accepts()
super(UnexpectedToken, self).__init__()

message = ("Unexpected token %r at line %s, column %s.\n"
"Expected one of: \n\t* %s\n"
% (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))
@property
def accepts(self):
if self._accepts is NO_VALUE:
self._accepts = self.puppet and self.puppet.accepts()
return self._accepts

def __str__(self):
message = ("Unexpected token %r at line %s, column %s.\n%s"
% (self.token, self.line, self.column, self._format_expected(self.accepts or self.expected)))
if self.token_history:
message += "Previous tokens: %r\n" % token_history
message += "Previous tokens: %r\n" % self.token_history

super(UnexpectedToken, self).__init__(message)
return message


class VisitError(LarkError):
@@ -197,6 +222,7 @@ class VisitError(LarkError):
- obj: the tree node or token it was processing when the exception was raised
- orig_exc: the exception that cause it to fail
"""

def __init__(self, rule, obj, orig_exc):
self.obj = obj
self.orig_exc = orig_exc
@@ -204,5 +230,4 @@ class VisitError(LarkError):
message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
super(VisitError, self).__init__(message)


###}

+ 1
- 1
lark/lark.py 查看文件

@@ -416,7 +416,7 @@ class Lark(Serialize):
self._callbacks,
self.options, # Not all, but multiple attributes are used
)
self.terminals = self.parser.lexer_conf.tokens
self.terminals = self.parser.lexer_conf.terminals
self._terminals_dict = {t.name: t for t in self.terminals}
return self



+ 22
- 15
lark/lexer.py 查看文件

@@ -11,9 +11,10 @@ from copy import copy

class Pattern(Serialize):

def __init__(self, value, flags=()):
def __init__(self, value, flags=(), raw=None):
self.value = value
self.flags = frozenset(flags)
self.raw = raw

def __repr__(self):
return repr(self.to_regexp())
@@ -92,6 +93,12 @@ class TerminalDef(Serialize):
def __repr__(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)

def user_repr(self):
if self.name.startswith('__'): # We represent a generated terminal
return self.pattern.raw or self.name
else:
return self.name


class Token(Str):
"""A string with meta-information, that is produced by the lexer.
@@ -283,7 +290,7 @@ class Lexer(object):
class TraditionalLexer(Lexer):

def __init__(self, conf):
terminals = list(conf.tokens)
terminals = list(conf.terminals)
assert all(isinstance(t, TerminalDef) for t in terminals), terminals

self.re = conf.re_module
@@ -310,6 +317,7 @@ class TraditionalLexer(Lexer):
self.user_callbacks = conf.callbacks
self.g_regex_flags = conf.g_regex_flags
self.use_bytes = conf.use_bytes
self.terminals_by_name = conf.terminals_by_name

self._mres = None

@@ -353,7 +361,7 @@ class TraditionalLexer(Lexer):
allowed = {"<END-OF-FILE>"}
raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
state=parser_state)
state=parser_state, terminals_by_name=self.terminals_by_name)

value, type_ = res

@@ -394,14 +402,11 @@ class LexerState:
class ContextualLexer(Lexer):

def __init__(self, conf, states, always_accept=()):
terminals = list(conf.tokens)
tokens_by_name = {}
for t in terminals:
assert t.name not in tokens_by_name, t
tokens_by_name[t.name] = t
terminals = list(conf.terminals)
terminals_by_name = conf.terminals_by_name

trad_conf = copy(conf)
trad_conf.tokens = terminals
trad_conf.terminals = terminals

lexer_by_tokens = {}
self.lexers = {}
@@ -411,15 +416,14 @@ class ContextualLexer(Lexer):
lexer = lexer_by_tokens[key]
except KeyError:
accepts = set(accepts) | set(conf.ignore) | set(always_accept)
state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
lexer_conf = copy(trad_conf)
lexer_conf.tokens = state_tokens
lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
lexer = TraditionalLexer(lexer_conf)
lexer_by_tokens[key] = lexer

self.lexers[state] = lexer

assert trad_conf.tokens is terminals
assert trad_conf.terminals is terminals
self.root_lexer = TraditionalLexer(trad_conf)

def make_lexer_state(self, text):
@@ -435,9 +439,12 @@ class ContextualLexer(Lexer):
except UnexpectedCharacters as e:
# In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
# This tests the input against the global context, to provide a nicer error.
token = self.root_lexer.next_token(lexer_state, parser_state)
raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token])

try:
last_token = lexer_state.last_token # Save last_token. Calling root_lexer.next_token will change this to the wrong token
token = self.root_lexer.next_token(lexer_state, parser_state)
raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_name)
except UnexpectedCharacters:
raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set.

class LexerThread:
"""A thread that ties a lexer instance and a lexer state, to be used by the parser"""


+ 2
- 2
lark/load_grammar.py 查看文件

@@ -454,9 +454,9 @@ def _literal_to_pattern(literal):

if literal.type == 'STRING':
s = s.replace('\\\\', '\\')
return PatternStr(s, flags)
return PatternStr(s, flags, raw=literal.value)
elif literal.type == 'REGEXP':
return PatternRE(s, flags)
return PatternRE(s, flags, raw=literal.value)
else:
assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'



+ 1
- 1
lark/parser_frontends.py 查看文件

@@ -170,7 +170,7 @@ CYK_FrontEnd = NotImplemented
class EarleyRegexpMatcher:
def __init__(self, lexer_conf):
self.regexps = {}
for t in lexer_conf.tokens:
for t in lexer_conf.terminals:
if t.priority != 1:
raise GrammarError("Dynamic Earley doesn't support weights on terminals", t, t.priority)
regexp = t.pattern.to_regexp()


+ 1
- 1
lark/parsers/earley.py 查看文件

@@ -302,7 +302,7 @@ class Parser:
# this column. Find the item for the start_symbol, which is the root of the SPPF tree.
solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
if not solutions:
expected_terminals = [t.expect for t in to_scan]
expected_terminals = [t.expect.name for t in to_scan]
raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))

if self.debug:


+ 3
- 3
lark/parsers/lalr_parser.py 查看文件

@@ -36,7 +36,7 @@ class LALR_Parser(Serialize):
return self.parser.parse(*args)


class ParseConf:
class ParseConf(object):
__slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'

def __init__(self, parse_table, callbacks, start):
@@ -50,7 +50,7 @@ class ParseConf:
self.start = start


class ParserState:
class ParserState(object):
__slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'

def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):
@@ -124,7 +124,7 @@ class ParserState:
if is_end and state_stack[-1] == end_state:
return value_stack[-1]

class _Parser:
class _Parser(object):
def __init__(self, parse_table, callbacks, debug=False):
self.parse_table = parse_table
self.callbacks = callbacks


+ 1
- 0
lark/utils.py 查看文件

@@ -13,6 +13,7 @@ logger.setLevel(logging.CRITICAL)

Py36 = (sys.version_info[:2] >= (3, 6))

NO_VALUE = object()

def classify(seq, key=None, value=None):
d = {}


正在加载...
取消
保存