Browse Source

improve error message with token source.

rename LexerConf.tokens to LexerConf.terminals (keeping .tokens as a deprecated alias)
Make Exception message generation lazy
Made a few classes new-style
tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.2
MegaIng1 4 years ago
parent
commit
605b91e4be
8 changed files with 100 additions and 51 deletions
  1. +10
    -3
      lark/common.py
  2. +51
    -24
      lark/exceptions.py
  3. +1
    -1
      lark/lark.py
  4. +11
    -9
      lark/lexer.py
  5. +8
    -6
      lark/load_grammar.py
  6. +15
    -4
      lark/parser_frontends.py
  7. +3
    -3
      lark/parsers/lalr_parser.py
  8. +1
    -1
      lark/parsers/lalr_puppet.py

+ 10
- 3
lark/common.py View File

@@ -1,14 +1,16 @@
from warnings import warn

from .utils import Serialize
from .lexer import TerminalDef

###{standalone

class LexerConf(Serialize):
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
__serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes'
__serialize_namespace__ = TerminalDef,

def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
self.tokens = tokens # TODO should be terminals
def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
self.terminals = terminals
self.ignore = ignore
self.postlex = postlex
self.callbacks = callbacks or {}
@@ -16,6 +18,11 @@ class LexerConf(Serialize):
self.re_module = re_module
self.skip_validation = skip_validation
self.use_bytes = use_bytes
@property
def tokens(self):
    """Deprecated alias for :attr:`terminals`, kept for backward compatibility.

    Emits a ``DeprecationWarning`` on every access and returns ``self.terminals``.
    """
    # stacklevel=2 makes the warning point at the caller's attribute access
    # instead of this line, which is what users need to locate the old usage.
    warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning, stacklevel=2)
    return self.terminals

###}



+ 51
- 24
lark/exceptions.py View File

@@ -34,6 +34,7 @@ class UnexpectedInput(LarkError):
After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
"""
pos_in_stream = None
_all_terminals = None

def get_context(self, text, span=40):
"""Returns a pretty string pinpointing the error in the text,
@@ -109,32 +110,54 @@ class UnexpectedInput(LarkError):
candidate = label, False

return candidate[0]
def _format_terminals(self, names):
    """Return a pretty "Expected one of" message for the given terminal names.

    When ``self._all_terminals`` is available, each name is replaced by the
    terminal's ``nice_print`` (e.g. the quoted source string for anonymous
    terminals); otherwise the raw names are used.
    """
    if self._all_terminals:
        # Map once instead of scanning the terminal list per name (the
        # original scan also shadowed the accumulator variable `t`).
        nice_by_name = {term.name: term.nice_print for term in self._all_terminals}
        # If we don't find the corresponding Terminal (which *should* never
        # happen), fall back to the raw name instead of erroring: a broken
        # __str__ on an Exception is one of the worst bugs to debug.
        # (The previous fallback called `.display_name` on the accumulator
        # list, raising AttributeError.)
        t = [nice_by_name.get(name, name) for name in names]
    else:
        t = names
    return "Expected one of: \n\t* %s\n" % '\n\t* '.join(t)



class UnexpectedCharacters(LexError, UnexpectedInput):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, _all_terminals=None):
self.line = line
self.column = column
self.pos_in_stream = lex_pos
self.state = state
self._all_terminals = _all_terminals

self.allowed = allowed
self.considered_tokens = considered_tokens
self.token_history = token_history

if isinstance(seq, bytes):
_s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
self._s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
else:
_s = seq[lex_pos]

message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column)
message += '\n\n' + self.get_context(seq)
if allowed:
message += '\nExpecting: %s\n' % allowed
if token_history:
message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history)

super(UnexpectedCharacters, self).__init__(message)

self._s = seq[lex_pos]
self._context = self.get_context(seq)
super(UnexpectedCharacters, self).__init__()

def __str__(self):
    """Build the error message lazily, from state captured in __init__.

    Be aware: a broken __str__ for an Exception is terrible to debug -- you
    only get `UnexpectedCharacters: <str() failed>` or similar. Keep as little
    room for errors here as possible; if you run into this, add an
    `except Exception as e: print(e); raise e` or similar.
    """
    parts = ["No terminal defined for '%s' at line %d col %d" % (self._s, self.line, self.column)]
    parts.append('\n\n' + self._context)
    if self.allowed:
        parts.append(self._format_terminals(self.allowed))
    if self.token_history:
        parts.append('\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history))
    return ''.join(parts)

class UnexpectedToken(ParseError, UnexpectedInput):
"""When the parser throws UnexpectedToken, it instantiates a puppet
@@ -143,7 +166,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):

see: :ref:`ParserPuppet`.
"""
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, all_terminals=None):
self.line = getattr(token, 'line', '?')
self.column = getattr(token, 'column', '?')
self.pos_in_stream = getattr(token, 'pos_in_stream', None)
@@ -153,16 +176,20 @@ class UnexpectedToken(ParseError, UnexpectedInput):
self.expected = expected # XXX deprecate? `accepts` is better
self.considered_rules = considered_rules
self.puppet = puppet

# TODO Only calculate `accepts()` when we need to display it to the user
# This will improve performance when doing automatic error handling
self.accepts = puppet and puppet.accepts()

message = ("Unexpected token %r at line %s, column %s.\n"
"Expected one of: \n\t* %s\n"
% (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))

super(UnexpectedToken, self).__init__(message)
self._all_terminals = all_terminals


super(UnexpectedToken, self).__init__()
@property
def accepts(self):
    """Terminal names the parser would accept here, per the puppet.

    Computed lazily on access; when no puppet is attached, returns the
    (falsy) puppet itself, matching `puppet and puppet.accepts()`.
    """
    if not self.puppet:
        return self.puppet
    return self.puppet.accepts()
def __str__(self):
    """Render the error message lazily from the stored token and expectations.

    Be aware: a broken __str__ for an Exception is terrible to debug --
    keep as little room for errors here as possible.
    """
    expected = self._format_terminals(self.accepts or self.expected)
    return "Unexpected token %r at line %s, column %s.\n%s" % (self.token, self.line, self.column, expected)


class VisitError(LarkError):


+ 1
- 1
lark/lark.py View File

@@ -402,7 +402,7 @@ class Lark(Serialize):
self._callbacks,
self.options, # Not all, but multiple attributes are used
)
self.terminals = self.parser.lexer_conf.tokens
self.terminals = self.parser.lexer_conf.terminals
self._terminals_dict = {t.name: t for t in self.terminals}
return self



+ 11
- 9
lark/lexer.py View File

@@ -76,14 +76,15 @@ class PatternRE(Pattern):


class TerminalDef(Serialize):
__serialize_fields__ = 'name', 'pattern', 'priority'
__serialize_fields__ = 'name', 'pattern', 'priority', 'nice_print'
__serialize_namespace__ = PatternStr, PatternRE

def __init__(self, name, pattern, priority=1):
def __init__(self, name, pattern, priority=1, nice_print=None):
assert isinstance(pattern, Pattern), pattern
self.name = name
self.pattern = pattern
self.priority = priority
self.nice_print = nice_print or name

def __repr__(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
@@ -278,7 +279,7 @@ class Lexer(object):
class TraditionalLexer(Lexer):

def __init__(self, conf):
terminals = list(conf.tokens)
terminals = list(conf.terminals)
assert all(isinstance(t, TerminalDef) for t in terminals), terminals

self.re = conf.re_module
@@ -347,7 +348,8 @@ class TraditionalLexer(Lexer):
if not allowed:
allowed = {"<END-OF-FILE>"}
raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
_all_terminals=self.terminals)

value, type_ = res

@@ -386,14 +388,14 @@ class LexerState:
class ContextualLexer(Lexer):

def __init__(self, conf, states, always_accept=()):
terminals = list(conf.tokens)
terminals = list(conf.terminals)
tokens_by_name = {}
for t in terminals:
assert t.name not in tokens_by_name, t
tokens_by_name[t.name] = t

trad_conf = copy(conf)
trad_conf.tokens = terminals
trad_conf.terminals = terminals

lexer_by_tokens = {}
self.lexers = {}
@@ -405,13 +407,13 @@ class ContextualLexer(Lexer):
accepts = set(accepts) | set(conf.ignore) | set(always_accept)
state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
lexer_conf = copy(trad_conf)
lexer_conf.tokens = state_tokens
lexer_conf.terminals = state_tokens
lexer = TraditionalLexer(lexer_conf)
lexer_by_tokens[key] = lexer

self.lexers[state] = lexer

assert trad_conf.tokens is terminals
assert trad_conf.terminals is terminals
self.root_lexer = TraditionalLexer(trad_conf)

def make_lexer_state(self, text):
@@ -428,7 +430,7 @@ class ContextualLexer(Lexer):
# In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
# This tests the input against the global context, to provide a nicer error.
token = self.root_lexer.next_token(lexer_state)
raise UnexpectedToken(token, e.allowed, state=parser_state.position)
raise UnexpectedToken(token, e.allowed, state=parser_state.position, all_terminals=self.root_lexer.terminals)

class LexerThread:
"A thread that ties a lexer instance and a lexer state, to be used by the parser"


+ 8
- 6
lark/load_grammar.py View File

@@ -5,7 +5,7 @@ import sys
from copy import copy, deepcopy
from io import open

from .utils import bfs, eval_escaping, Py36, logger, classify_bool
from .utils import bfs, eval_escaping, Py36, logger, classify_bool, isascii
from .lexer import Token, TerminalDef, PatternStr, PatternRE

from .parse_tree_builder import ParseTreeBuilder
@@ -317,8 +317,11 @@ class PrepareAnonTerminals(Transformer_InPlace):
raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)

term_name = None
nice_print = None

if isinstance(p, PatternStr):
nice_print = repr(value) # This will always be ok, independent of what term_name we end up using
# TODO: potentially try to get the actual source code, and not the repr
try:
# If already defined, use the user-defined terminal name
term_name = self.term_reverse[p].name
@@ -327,15 +330,14 @@ class PrepareAnonTerminals(Transformer_InPlace):
try:
term_name = _TERMINAL_NAMES[value]
except KeyError:
if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
with suppress(UnicodeEncodeError):
value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names
term_name = value.upper()
if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set and isascii(value):
term_name = value.upper()

if term_name in self.term_set:
term_name = None

elif isinstance(p, PatternRE):
#TODO: generate nice_print
if p in self.term_reverse: # Kind of a weird placement.name
term_name = self.term_reverse[p].name
else:
@@ -348,7 +350,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
if term_name not in self.term_set:
assert p not in self.term_reverse
self.term_set.add(term_name)
termdef = TerminalDef(term_name, p)
termdef = TerminalDef(term_name, p, nice_print=nice_print)
self.term_reverse[p] = termdef
self.terminals.append(termdef)



+ 15
- 4
lark/parser_frontends.py View File

@@ -6,6 +6,7 @@ from .parsers.lalr_parser import LALR_Parser
from .grammar import Rule
from .tree import Tree
from .common import LexerConf
from .exceptions import UnexpectedInput
try:
import regex
except ImportError:
@@ -135,7 +136,12 @@ class WithLexer(_ParserFrontend):
return LexerThread(lexer, text)

def parse(self, text, start=None):
return self._parse(start, self.make_lexer(text))
try:
return self._parse(start, self.make_lexer(text))
except UnexpectedInput as e:
if e._all_terminals is None:
e._all_terminals = self.lexer_conf.terminals
raise e

def init_traditional_lexer(self):
self.lexer = TraditionalLexer(self.lexer_conf)
@@ -190,7 +196,7 @@ class Earley(WithLexer):

class XEarley(_ParserFrontend):
def __init__(self, lexer_conf, parser_conf, options=None, **kw):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
self.token_by_name = {t.name:t for t in lexer_conf.terminals}
self.start = parser_conf.start

self._prepare_match(lexer_conf)
@@ -211,7 +217,7 @@ class XEarley(_ParserFrontend):

def _prepare_match(self, lexer_conf):
self.regexps = {}
for t in lexer_conf.tokens:
for t in lexer_conf.terminals:
if t.priority != 1:
raise ValueError("Dynamic Earley doesn't support weights on terminals", t, t.priority)
regexp = t.pattern.to_regexp()
@@ -228,7 +234,12 @@ class XEarley(_ParserFrontend):
self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

def parse(self, text, start):
return self._parse(start, text)
try:
return self._parse(start, text)
except UnexpectedInput as e:
if e._all_terminals is None:
e._all_terminals = self.token_by_name.values()
raise e

class XEarley_CompleteLex(XEarley):
def __init__(self, *args, **kw):


+ 3
- 3
lark/parsers/lalr_parser.py View File

@@ -35,7 +35,7 @@ class LALR_Parser(object):
return self.parser.parse(*args)


class ParseConf:
class ParseConf(object):
__slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'

def __init__(self, parse_table, callbacks, start):
@@ -49,7 +49,7 @@ class ParseConf:
self.start = start


class ParserState:
class ParserState(object):
__slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'

def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):
@@ -117,7 +117,7 @@ class ParserState:
if is_end and state_stack[-1] == end_state:
return value_stack[-1]

class _Parser:
class _Parser(object):
def __init__(self, parse_table, callbacks, debug=False):
self.parse_table = parse_table
self.callbacks = callbacks


+ 1
- 1
lark/parsers/lalr_puppet.py View File

@@ -60,7 +60,7 @@ class ParserPuppet(object):

Updated by ``feed_token()``.
"""
return self.parser_state.parse_table.states[self.parser_state.position]
return self.parser_state.parse_conf.parse_table.states[self.parser_state.position]

def accepts(self):
accepts = set()


Loading…
Cancel
Save