@@ -0,0 +1,79 @@
+"""
+Example-Driven Error Reporting
+==============================
+
+A demonstration of example-driven error reporting with the Earley parser
+(See also: error_reporting_lalr.py)
+"""
+from lark import Lark, UnexpectedInput
+
+from _json_parser import json_grammar   # Using the grammar from the json_parser example
+
+json_parser = Lark(json_grammar)
+
+
+class JsonSyntaxError(SyntaxError):
+    def __str__(self):
+        context, line, column = self.args
+        return '%s at line %s, column %s.\n\n%s' % (self.label, line, column, context)
+
+
+class JsonMissingValue(JsonSyntaxError):
+    label = 'Missing Value'
+
+
+class JsonMissingOpening(JsonSyntaxError):
+    label = 'Missing Opening'
+
+
+class JsonMissingClosing(JsonSyntaxError):
+    label = 'Missing Closing'
+
+
+class JsonMissingComma(JsonSyntaxError):
+    label = 'Missing Comma'
+
+
+class JsonTrailingComma(JsonSyntaxError):
+    label = 'Trailing Comma'
+
+
+def parse(json_text):
+    try:
+        j = json_parser.parse(json_text)
+    except UnexpectedInput as u:
+        exc_class = u.match_examples(json_parser.parse, {
+            JsonMissingOpening: ['{"foo": ]}',
+                                 '{"foor": }}',
+                                 '{"foo": }'],
+            JsonMissingClosing: ['{"foo": [}',
+                                 '{',
+                                 '{"a": 1',
+                                 '[1'],
+            JsonMissingComma: ['[1 2]',
+                               '[false 1]',
+                               '["b" 1]',
+                               '{"a":true 1:4}',
+                               '{"a":1 1:4}',
+                               '{"a":"b" 1:4}'],
+            JsonTrailingComma: ['[,]',
+                                '[1,]',
+                                '[1,2,]',
+                                '{"foo":1,}',
+                                '{"foo":false,"bar":true,}']
+        }, use_accepts=True)
+        if not exc_class:
+            raise
+        raise exc_class(u.get_context(json_text), u.line, u.column)
+
+
+def test():
+    try:
+        parse('{"example1": "value"')
+    except JsonMissingClosing as e:
+        print(e)
+
+    try:
+        parse('{"example2": ] ')
+    except JsonMissingOpening as e:
+        print(e)
+
+
+if __name__ == '__main__':
+    test()
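
The key call above is match_examples(): it re-runs the parse function on each example string and returns the label whose example fails in the same parser state as the current error (with use_accepts=True, the sets of acceptable tokens must match as well). A condensed sketch of the pattern, not part of the diff, using string labels instead of exception classes:

    try:
        json_parser.parse('[1,]')
    except UnexpectedInput as u:
        label = u.match_examples(json_parser.parse, {
            'trailing-comma': ['[1,2,]'],
            'missing-comma': ['[1 2]'],
        }, use_accepts=True)
        # label == 'trailing-comma' when the two failures share a state;
        # match_examples() returns None if nothing matches.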

@@ -3,7 +3,7 @@ Example-Driven Error Reporting
 ==============================

 A demonstration of example-driven error reporting with the LALR parser
 (See also: error_reporting_earley.py)
 """
 from lark import Lark, UnexpectedInput

@@ -3,7 +3,7 @@ from .tree import Tree
 from .visitors import Transformer, Visitor, v_args, Discard, Transformer_NonRecursive
 from .visitors import InlineTransformer, inline_args  # XXX Deprecated
 from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken,
-                         UnexpectedInput, UnexpectedCharacters, LarkError)
+                         UnexpectedInput, UnexpectedCharacters, UnexpectedEOF, LarkError)

 from .lexer import Token
 from .lark import Lark

@@ -19,14 +19,6 @@ class LexError(LarkError):
     pass

-class UnexpectedEOF(ParseError):
-    def __init__(self, expected):
-        self.expected = expected
-
-        message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
-        super(UnexpectedEOF, self).__init__(message)

 class UnexpectedInput(LarkError):
     """UnexpectedInput Error.

@@ -47,6 +39,7 @@ class UnexpectedInput(LarkError):
             The parser doesn't hold a copy of the text it has to parse,
             so you have to provide it again
         """
+        assert self.pos_in_stream is not None, self
         pos = self.pos_in_stream
         start = max(pos - span, 0)
         end = pos + span

@@ -91,7 +84,7 @@ class UnexpectedInput(LarkError):
                     parse_fn(malformed)
                 except UnexpectedInput as ut:
                     if ut.state == self.state:
-                        if use_accepts and ut.accepts != self.accepts:
+                        if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts:
                             logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
                                          (self.state, self.accepts, ut.accepts, i, j))
                             continue

@@ -108,15 +101,29 @@ class UnexpectedInput(LarkError):
                         except AttributeError:
                             pass
-                        if not candidate[0]:
+                        if candidate[0] is None:
                             logger.debug("Same State match at example [%s][%s]" % (i, j))
                             candidate = label, False

         return candidate[0]

+class UnexpectedEOF(ParseError, UnexpectedInput):
+    def __init__(self, expected, state=None):
+        self.expected = expected
+        self.state = state
+        from .lexer import Token
+        self.token = Token("<EOF>", "")  #, line=-1, column=-1, pos_in_stream=-1)
+        self.pos_in_stream = -1
+        self.line = -1
+        self.column = -1
+
+        message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
+        super(UnexpectedEOF, self).__init__(message)
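
Because UnexpectedEOF now derives from UnexpectedInput (with a synthetic <EOF> token and sentinel positions), end-of-input errors flow through the same except clauses and helpers as every other syntax error. A minimal sketch, not part of the diff, using an assumed toy grammar with the Earley parser:

    from lark import Lark, UnexpectedInput

    parser = Lark('start: "a" "b" "c"')    # Earley is the default parser
    try:
        parser.parse("ab")                 # input ends before "c"
    except UnexpectedInput as e:           # now also catches UnexpectedEOF
        print(type(e).__name__)            # UnexpectedEOF
        print(e.expected)                  # terminals that could have followed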

 class UnexpectedCharacters(LexError, UnexpectedInput):
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
+        # TODO considered_tokens and allowed can be figured out using state
         self.line = line
         self.column = column
         self.pos_in_stream = lex_pos

@@ -147,7 +154,8 @@ class UnexpectedToken(ParseError, UnexpectedInput):
         see: :ref:`ParserPuppet`.
     """

-    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None):
+    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None):
+        # TODO considered_rules and expected can be figured out using state
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
         self.pos_in_stream = getattr(token, 'pos_in_stream', None)

@@ -157,6 +165,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):
         self.expected = expected  # XXX deprecate? `accepts` is better
         self.considered_rules = considered_rules
         self.puppet = puppet
+        self.token_history = token_history

         # TODO Only calculate `accepts()` when we need to display it to the user
         # This will improve performance when doing automatic error handling

@@ -166,6 +175,9 @@ class UnexpectedToken(ParseError, UnexpectedInput):
                    "Expected one of: \n\t* %s\n"
                    % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))
+        if self.token_history:
+            message += "Previous tokens: %r\n" % token_history
+
         super(UnexpectedToken, self).__init__(message)
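
The optional token_history surfaces directly in the error message. A small sketch, not part of the diff, constructing the exception by hand just to show the message format (the tokens here are made up):

    from lark.exceptions import UnexpectedToken
    from lark.lexer import Token

    prev = Token('NAME', 'foo')
    bad = Token('NUMBER', '42', line=1, column=5)
    err = UnexpectedToken(bad, expected={'SEMICOLON'}, token_history=[prev])
    print(err)   # ...ends with: Previous tokens: [Token(NAME, 'foo')]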

@@ -353,7 +353,7 @@ class TraditionalLexer(Lexer):
                 allowed = {"<END-OF-FILE>"}
             raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                        allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
-                                       state=(parser_state and parser_state.position))
+                                       state=parser_state)

         value, type_ = res

@@ -436,7 +436,7 @@ class ContextualLexer(Lexer):
             # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
             # This tests the input against the global context, to provide a nicer error.
             token = self.root_lexer.next_token(lexer_state, parser_state)
-            raise UnexpectedToken(token, e.allowed, state=parser_state.position)
+            raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token])

 class LexerThread:

@@ -173,9 +173,6 @@ class Earley(WithLexer):
         tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
         self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)

-    def make_lexer(self, text):
-        return WithLexer.make_lexer(self, text).lex(None)
-
     def match(self, term, token):
         return term.name == token.type

@@ -146,7 +146,7 @@ class Parser:
                 column.add(new_item)
                 items.append(new_item)

-    def _parse(self, stream, columns, to_scan, start_symbol=None):
+    def _parse(self, lexer, columns, to_scan, start_symbol=None):
         def is_quasi_complete(item):
             if item.is_complete:
                 return True

@@ -245,7 +245,7 @@ class Parser:
             if not next_set and not next_to_scan:
                 expect = {i.expect.name for i in to_scan}
-                raise UnexpectedToken(token, expect, considered_rules=set(to_scan))
+                raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.expect for i in to_scan))

             return next_to_scan

@@ -261,20 +261,24 @@ class Parser:
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
+        expects = {i.expect for i in to_scan}
         i = 0
-        for token in stream:
+        for token in lexer.lex(expects):
             self.predict_and_complete(i, to_scan, columns, transitives)

             to_scan = scan(i, token, to_scan)
             i += 1
+            expects.clear()
+            expects |= {i.expect for i in to_scan}

         self.predict_and_complete(i, to_scan, columns, transitives)

         ## Column is now the final column in the parse.
         assert i == len(columns)-1
         return to_scan

-    def parse(self, stream, start):
+    def parse(self, lexer, start):
         assert start, start
         start_symbol = NonTerminal(start)

@@ -291,7 +295,7 @@ class Parser:
         else:
             columns[0].add(item)

-        to_scan = self._parse(stream, columns, to_scan, start_symbol)
+        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

         # If the parse was successful, the start
         # symbol should have been completed in the last step of the Earley cycle, and will be in

@@ -299,7 +303,7 @@ class Parser:
         solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
         if not solutions:
             expected_terminals = [t.expect for t in to_scan]
-            raise UnexpectedEOF(expected_terminals)
+            raise UnexpectedEOF(expected_terminals, state=frozenset(i.expect for i in to_scan))

         if self.debug:
             from .earley_forest import ForestToPyDotVisitor
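
The Earley parser now takes a lexer object and calls lexer.lex(expects) instead of iterating a pre-built token stream. expects is a live set that the parse loop rewrites after every scan step, giving a contextual lexer an up-to-date view of which terminals can come next. A minimal sketch of that contract, not part of the diff, with assumed names:

    class ListLexer:
        """Replays a fixed token list; a real lexer could consult `expects`."""
        def __init__(self, tokens):
            self.tokens = tokens

        def lex(self, expects):
            # `expects` holds the terminals the parser can accept right now,
            # and is mutated in place between tokens by the parse loop.
            return iter(self.tokens)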

@@ -3,7 +3,7 @@
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
 from copy import deepcopy, copy
-from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
+from ..exceptions import UnexpectedInput, UnexpectedToken
 from ..lexer import Token

 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable

@@ -62,6 +62,12 @@ class ParserState:
     def position(self):
         return self.state_stack[-1]

+    # Necessary for match_examples() to work
+    def __eq__(self, other):
+        if not isinstance(other, ParserState):
+            return False
+        return self.position == other.position
+
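
match_examples() decides that two failures are "the same error" by comparing the state stored on each exception, and that state is now the whole ParserState object rather than an int. This __eq__ makes states from independent parse runs compare equal whenever the LALR automaton sits on the same state number. A sketch of the comparison it enables, not part of the diff (the parser and inputs are assumed):

    try:
        parser.parse('{"a": 1')                 # the real error
    except UnexpectedInput as real_err:
        try:
            parser.parse('[1')                  # a candidate example
        except UnexpectedInput as example_err:
            # Delegates to ParserState.__eq__, i.e. position equality:
            same_failure = real_err.state == example_err.state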
     def __copy__(self):
         return type(self)(
             self.parse_conf,

@@ -86,7 +92,7 @@ class ParserState:
             action, arg = states[state][token.type]
         except KeyError:
             expected = {s for s in states[state].keys() if s.isupper()}
-            raise UnexpectedToken(token, expected, state=state, puppet=None)
+            raise UnexpectedToken(token, expected, state=self, puppet=None)

         assert arg != end_state

@@ -113,7 +113,8 @@ class Parser(BaseParser):
                 del delayed_matches[i+1]    # No longer needed, so unburden memory

             if not next_set and not delayed_matches and not next_to_scan:
-                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, set(to_scan))
+                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan},
+                                           set(to_scan), state=frozenset(i.expect for i in to_scan))

             return next_to_scan
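
Earley has no numbered parser states, so these hunks attach a frozenset of the currently expected terminals as the error's state; being hashable and order-insensitive, it compares cleanly inside match_examples(). A toy illustration, not part of the diff (the real sets hold Terminal objects; strings are stand-ins here):

    state_a = frozenset({'RBRACE', 'COMMA'})
    state_b = frozenset({'COMMA', 'RBRACE'})
    assert state_a == state_b                  # equal regardless of ordering
    assert hash(state_a) == hash(state_b)      # usable as a dict key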

@@ -69,6 +69,14 @@ def parse_rulename(s):
     return name, args

+class ChildrenLexer:
+    def __init__(self, children):
+        self.children = children
+
+    def lex(self, parser_state):
+        return self.children
+
 class TreeMatcher:
     """Match the elements of a tree node, based on an ontology
     provided by a Lark grammar.

@@ -173,6 +181,6 @@ class TreeMatcher:
             self._parser_cache[rulename] = parser

         # find a full derivation
-        unreduced_tree = parser.parse(tree.children, rulename)
+        unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename)
         assert unreduced_tree.data == rulename
         return unreduced_tree
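
Since Parser.parse() now wants an object with a .lex() method, passing tree.children directly would break; ChildrenLexer is the thin adapter that bridges the gap, ignoring the expected-terminals argument and handing the already-tokenized children straight back. The same trick works for replaying any recorded token sequence, sketched here with assumed names:

    recorded = [Token('A', 'a'), Token('B', 'b')]      # hypothetical tokens
    unreduced = parser.parse(ChildrenLexer(recorded), 'start')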

@@ -323,7 +323,7 @@ class TestParsers(unittest.TestCase):
     def test_alias(self):
         Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)

     def test_backwards_custom_lexer(self):
         class OldCustomLexer(Lexer):
             def __init__(self, lexer_conf):

@@ -331,12 +331,12 @@ class TestParsers(unittest.TestCase):
             def lex(self, text):
                 yield Token('A', 'A')

         p = Lark("""
         start: A
         %declare A
         """, parser='lalr', lexer=OldCustomLexer)

         r = p.parse('')
         self.assertEqual(r, Tree('start', [Token('A', 'A')]))

@@ -866,7 +866,7 @@ class CustomLexer(Lexer):
         self.lexer = TraditionalLexer(copy(lexer_conf))
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)

+    __future_interface__ = True

 def _tree_structure_check(a, b):
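
CustomLexer is the test double used throughout the suite, and the __future_interface__ flag opts it into the new lexer protocol. A sketch of what a user-defined lexer might look like under that protocol, not part of the diff; the (lexer_state, parser_state) signature is an assumption inferred from how TraditionalLexer.lex is invoked elsewhere in this diff:

    from copy import copy
    from lark.lexer import Lexer, TraditionalLexer

    class ForwardingLexer(Lexer):       # hypothetical name
        __future_interface__ = True     # opt in to the new lex() signature

        def __init__(self, lexer_conf):
            self.lexer = TraditionalLexer(copy(lexer_conf))

        def lex(self, lexer_state, parser_state):
            return self.lexer.lex(lexer_state, parser_state)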

@@ -2342,6 +2342,30 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(a.line, 1)
             self.assertEqual(b.line, 2)

+        @unittest.skipIf(PARSER=='cyk', "match_examples() not supported for CYK")
+        def test_match_examples(self):
+            p = _Lark(r"""
+                start: "a" "b" "c"
+            """)
+
+            def match_error(s):
+                try:
+                    _ = p.parse(s)
+                except UnexpectedInput as u:
+                    return u.match_examples(p.parse, {
+                        0: ['abe'],
+                        1: ['ab'],
+                        2: ['cbc'],
+                    })
+                assert False
+
+            assert match_error("abe") == 0
+            assert match_error("ab") == 1
+            assert match_error("bbc") == 2
+            assert match_error("cbc") == 2
+            self.assertEqual( match_error("dbc"), 2 )
+
         @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
         def test_unicode_class(self):
             "Tests that character classes from the `regex` module work correctly."