- Merging updated upstream into branch for file extension changes.
- Will push so Pull Request has no remaining conflicts.
- Also will change the file type of the lark example grammar.
@@ -165,3 +165,5 @@ If you're interested in taking one of these on, let me know and I will provide m
If you have any questions or want my assistance, you can email me at erezshin at gmail com.
I'm also available for contract work.

 -- [Erez](https://github.com/erezsh)
@@ -7,9 +7,11 @@
- [indented\_tree.py](indented_tree.py) - A demonstration of parsing indentation ("whitespace significant" language)
- [fruitflies.py](fruitflies.py) - A demonstration of ambiguity
- [turtle\_dsl.py](turtle_dsl.py) - Implements a LOGO-like toy language for Python's turtle, with interpreter.
- [lark\_grammar.py](lark_grammar.py) + [lark.g](lark.g) - A reference implementation of the Lark grammar (using LALR(1) + standard lexer)

### Advanced

- [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser
- [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!)
- [conf.py](conf.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language
- [reconstruct\_json.py](reconstruct_json.py) - Demonstrates the experimental text-reconstruction feature
@@ -0,0 +1,81 @@
#
# This demonstrates example-driven error reporting with the LALR parser
#

from lark import Lark, UnexpectedToken

from .json_parser import json_grammar   # Using the grammar from the json_parser example

json_parser = Lark(json_grammar, parser='lalr')

class JsonSyntaxError(SyntaxError):
    def __str__(self):
        context, line, column = self.args
        return '%s at line %s, column %s.\n\n%s' % (self.label, line, column, context)

class JsonMissingValue(JsonSyntaxError):
    label = 'Missing Value'

class JsonMissingOpening(JsonSyntaxError):
    label = 'Missing Opening'

class JsonMissingClosing(JsonSyntaxError):
    label = 'Missing Closing'

class JsonMissingComma(JsonSyntaxError):
    label = 'Missing Comma'

class JsonTrailingComma(JsonSyntaxError):
    label = 'Trailing Comma'

def parse(json_text):
    try:
        j = json_parser.parse(json_text)
    except UnexpectedToken as ut:
        exc_class = ut.match_examples(json_parser.parse, {
            JsonMissingValue: ['{"foo": }'],
            JsonMissingOpening: ['{"foo": ]}',
                                 '{"foor": }}'],
            JsonMissingClosing: ['{"foo": [}',
                                 '{',
                                 '{"a": 1',
                                 '[1'],
            JsonMissingComma: ['[1 2]',
                               '[false 1]',
                               '["b" 1]',
                               '{"a":true 1:4}',
                               '{"a":1 1:4}',
                               '{"a":"b" 1:4}'],
            JsonTrailingComma: ['[,]',
                                '[1,]',
                                '[1,2,]',
                                '{"foo":1,}',
                                '{"foo":false,"bar":true,}']
        })
        if not exc_class:
            raise
        raise exc_class(ut.get_context(json_text), ut.line, ut.column)

def test():
    try:
        parse('{"key":')
    except JsonMissingValue:
        pass

    try:
        parse('{"key": "value"')
    except JsonMissingClosing:
        pass

    try:
        parse('{"key": ] ')
    except JsonMissingOpening:
        pass

if __name__ == '__main__':
    test()
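A hedged usage sketch of the example above; it relies on the `parse()` helper and exception classes just defined, and the exact line/column in the message depends on the input:

```python
# Malformed JSON raises the matched JsonSyntaxError subclass, whose message
# combines the label, the error position, and the get_context() snippet.
try:
    parse('{"foo":1,}')
except JsonTrailingComma as e:
    print(e)   # e.g. "Trailing Comma at line 1, column ..." followed by the marked context
```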
@@ -0,0 +1,49 @@
start: (_item | _NL)*

_item: rule
     | token
     | statement

rule: RULE priority? ":" expansions _NL
token: TOKEN priority? ":" expansions _NL

priority: "." NUMBER

statement: "%ignore" expansions _NL -> ignore
         | "%import" import_args ["->" TOKEN] _NL -> import

import_args: name ("." name)*

?expansions: alias (_VBAR alias)*

?alias: expansion ["->" RULE]

?expansion: expr*

?expr: atom [OP | "~" NUMBER [".." NUMBER]]

?atom: "(" expansions ")"
     | "[" expansions "]" -> maybe
     | STRING ".." STRING -> literal_range
     | name
     | (REGEXP | STRING) -> literal

name: RULE
    | TOKEN

_VBAR: _NL? "|"
OP: /[+*][?]?|[?](?![a-z])/
RULE: /!?[_?]?[a-z][_a-z0-9]*/
TOKEN: /_?[A-Z][_A-Z0-9]*/
STRING: _STRING "i"?
REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/\n])*?\/[imslux]*/
_NL: /(\r?\n)+\s*/

%import common.ESCAPED_STRING -> _STRING
%import common.INT -> NUMBER
%import common.WS_INLINE

COMMENT: "//" /[^\n]/*

%ignore WS_INLINE
%ignore COMMENT
@@ -0,0 +1,18 @@
from lark import Lark

parser = Lark(open('examples/lark.g'), parser="lalr")

grammar_files = [
    'examples/python2.g',
    'examples/python3.g',
    'examples/lark.g',
    'lark/grammars/common.g',
]

def test():
    for grammar_file in grammar_files:
        tree = parser.parse(open(grammar_file).read())
    print("All grammars parsed successfully")

if __name__ == '__main__':
    test()
@@ -4,4 +4,4 @@ from .lexer import UnexpectedInput, LexError
from .lark import Lark
from .utils import inline_args

__version__ = "0.5.5"
__version__ = "0.5.6"
@@ -1,7 +1,7 @@
import re
import sys

from .utils import get_regexp_width
from .utils import get_regexp_width, STRING_TYPE

Py36 = (sys.version_info[:2] >= (3, 6))
@@ -17,12 +17,13 @@ class ParseError(Exception):
    pass

class UnexpectedToken(ParseError):
    def __init__(self, token, expected, seq, index, considered_rules=None):
    def __init__(self, token, expected, seq, index, considered_rules=None, state=None):
        self.token = token
        self.expected = expected
        self.line = getattr(token, 'line', '?')
        self.column = getattr(token, 'column', '?')
        self.considered_rules = considered_rules
        self.state = state

        try:
            context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
@@ -36,7 +37,36 @@ class UnexpectedToken(ParseError):
        super(UnexpectedToken, self).__init__(message)

    def match_examples(self, parse_fn, examples):
        """Given a parse function and a dictionary mapping labels to
        malformed syntax examples, return the label of the example that
        best matches the current error.
        """
        assert self.state, "Not supported for this exception"

        candidate = None
        for label, example in examples.items():
            assert not isinstance(example, STRING_TYPE)

            for malformed in example:
                try:
                    parse_fn(malformed)
                except UnexpectedToken as ut:
                    if ut.state == self.state:
                        if ut.token == self.token:   # Try exact match first
                            return label
                        elif not candidate:
                            candidate = label

        return candidate

    def get_context(self, text, span=10):
        pos = self.token.pos_in_stream
        start = max(pos - span, 0)
        end = pos + span
        before = text[start:pos].rsplit('\n', 1)[-1]
        after = text[pos:end].split('\n', 1)[0]
        return before + after + '\n' + ' ' * len(before) + '^\n'

###}
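A minimal sketch of the new `get_context` helper in isolation; the grammar and input here are illustrative, not part of the patch:

```python
from lark import Lark, UnexpectedToken

parser = Lark('''
start: "a" "b"
%ignore " "
''', parser='lalr')

text = 'a a'
try:
    parser.parse(text)
except UnexpectedToken as e:
    # Prints the text around the failing token with a caret marker, roughly:
    #   a a
    #     ^
    print(e.get_context(text))
```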
@@ -20,6 +20,7 @@ SIGNED_NUMBER: ["+"|"-"] NUMBER
//
// Strings
//
//STRING: /"(\\\"|\\\\|[^"\n])*?"i?/
STRING_INNER: ("\\\""|/[^"]/)
ESCAPED_STRING: "\"" STRING_INNER* "\""
@@ -172,7 +172,7 @@ class Lark:
    def _build_parser(self):
        self.parser_class = get_frontend(self.options.parser, self.options.lexer)

        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr')
        callback = self._parse_tree_builder.create_callback(self.options.transformer)
        if self.profiler:
            for f in dir(callback):
@@ -25,6 +25,8 @@ class UnexpectedInput(LexError):
        self.considered_rules = considered_rules

class Token(Str):
    __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')

    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        self = super(Token, cls).__new__(cls, value)
        self.type = type_
@@ -39,7 +41,7 @@ class Token(Str):
        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

    def __reduce__(self):
        return (self.__class__, (self.type, self.pos_in_stream, self.value, self.line, self.column, ))
        return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)
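The corrected argument order in `__reduce__` is what makes `Token` pickling round-trip; a small sketch (the values are arbitrary):

```python
import pickle
from lark.lexer import Token

# With the fix, the unpickled Token keeps type, value and position intact
# instead of swapping value and pos_in_stream.
tok = Token('NAME', 'foo', pos_in_stream=3, line=1, column=4)
restored = pickle.loads(pickle.dumps(tok))
assert restored == tok
assert restored.type == 'NAME' and restored.pos_in_stream == 3
```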
@@ -141,6 +143,8 @@ def _create_unless(tokens):
    for retok in tokens_by_type.get(PatternRE, []):
        unless = []  # {}
        for strtok in tokens_by_type.get(PatternStr, []):
            if strtok.priority > retok.priority:
                continue
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            if m and m.group(0) == s:
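This priority check keeps higher-priority string terminals out of the "unless" substitution, so an explicit terminal priority can beat a broader regexp terminal. A sketch mirroring the `test_priority_vs_embedded` test added near the bottom of this patch (the parser/lexer choice here is an assumption; the test exercises the LALR combinations):

```python
from lark import Lark

g = '''
A.2: "a"
WORD: ("a".."z")+
start: (A | WORD)+
'''

parser = Lark(g, parser='lalr', lexer='standard')
tree = parser.parse('abc')
print(tree.children)            # expected per the test: ['a', 'bc']
print(tree.children[0].type)    # expected per the test: 'A'
```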
@@ -14,7 +14,7 @@ from .parsers.lalr_parser import UnexpectedToken
from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
from .grammar import RuleOptions, Rule
from .tree import Tree as T, Transformer, InlineTransformer, Visitor
from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST

__path__ = os.path.dirname(__file__)
IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
@@ -122,7 +122,7 @@ RULES = {
    'statement': ['ignore', 'import'],
    'ignore': ['_IGNORE expansions _NL'],
    'import': ['_IMPORT import_args _NL',
               '_IMPORT import_args _TO TOKEN'],
               '_IMPORT import_args _TO TOKEN _NL'],
    'import_args': ['_import_args'],
    '_import_args': ['name', '_import_args _DOT name'],
@@ -145,14 +145,14 @@ class EBNF_to_BNF(InlineTransformer):
        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
        self.new_rules.append((new_name, tree, self.rule_options))
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op, *args):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
            return ST('expansions', [rule, ST('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
@@ -165,7 +165,7 @@ class EBNF_to_BNF(InlineTransformer):
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
            return ST('expansions', [new_name, ST('expansion', [])])
        elif op.value == '~':
            if len(args) == 1:
                mn = mx = int(args[0])
@@ -173,7 +173,7 @@ class EBNF_to_BNF(InlineTransformer):
                mn, mx = map(int, args)
                if mx < mn:
                    raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
            return T('expansions', [T('expansion', [rule] * n) for n in range(mn, mx+1)])
            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
        assert False, op
@@ -183,7 +183,7 @@ class SimplifyRule_Visitor(Visitor):
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
                         if isinstance(child, Tree) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)
@@ -203,9 +203,9 @@ class SimplifyRule_Visitor(Visitor):
        self._flatten(tree)

        for i, child in enumerate(tree.children):
            if isinstance(child, T) and child.data == 'expansions':
            if isinstance(child, Tree) and child.data == 'expansions':
                tree.data = 'expansions'
                tree.children = [self.visit(T('expansion', [option if i==j else other
                tree.children = [self.visit(ST('expansion', [option if i==j else other
                                                             for j, other in enumerate(tree.children)]))
                                 for option in set(child.children)]
                break
@@ -217,7 +217,7 @@ class SimplifyRule_Visitor(Visitor):
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
                aliases.append(ST('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases
@@ -239,7 +239,7 @@ class RuleTreeToText(Transformer):

class CanonizeTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])
        return ST('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
@@ -353,7 +353,7 @@ def _literal_to_pattern(literal):

class PrepareLiterals(InlineTransformer):
    def literal(self, literal):
        return T('pattern', [_literal_to_pattern(literal)])
        return ST('pattern', [_literal_to_pattern(literal)])

    def range(self, start, end):
        assert start.type == end.type == 'STRING'
@@ -361,13 +361,13 @@ class PrepareLiterals(InlineTransformer):
        end = end.value[1:-1]
        assert len(start) == len(end) == 1, (start, end, len(start), len(end))
        regexp = '[%s-%s]' % (start, end)
        return T('pattern', [PatternRE(regexp)])
        return ST('pattern', [PatternRE(regexp)])

class SplitLiterals(InlineTransformer):
    def pattern(self, p):
        if isinstance(p, PatternStr) and len(p.value)>1:
            return T('expansion', [T('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value])
        return T('pattern', [p])
            return ST('expansion', [ST('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value])
        return ST('pattern', [p])

class TokenTreeToPattern(Transformer):
    def pattern(self, ps):
@@ -375,6 +375,7 @@ class TokenTreeToPattern(Transformer):
        return p

    def expansion(self, items):
        assert items

        if len(items) == 1:
            return items[0]
        if len({i.flags for i in items}) > 1:
@@ -402,18 +403,20 @@ class TokenTreeToPattern(Transformer):
        assert len(args) == 2
        return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)

    def alias(self, t):
        raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")

def _interleave(l, item):
    for e in l:
        yield e
        if isinstance(e, T):
        if isinstance(e, Tree):
            if e.data in ('literal', 'range'):
                yield item
        elif is_terminal(e):
            yield item

def _choice_of_rules(rules):
    return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])
    return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])
class Grammar:
    def __init__(self, rule_defs, token_defs, ignore):
@@ -440,9 +443,9 @@ class Grammar:
                    if r == start:
                        exp.children = [expr] + exp.children
                for exp in tree.find_data('expr'):
                    exp.children[0] = T('expansion', list(_interleave(exp.children[:1], expr)))
                    exp.children[0] = ST('expansion', list(_interleave(exp.children[:1], expr)))

            _ignore_tree = T('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')])
            _ignore_tree = ST('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')])
            rule_defs.append(('__ignore', _ignore_tree, None))

            # Convert all tokens to rules
@@ -455,6 +458,9 @@ class Grammar:
                exp.children[i] = Token(sym.type, new_terminal_names[sym])

        for name, (tree, priority) in term_defs:   # TODO transfer priority to rule?
            if any(tree.find_data('alias')):
                raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")

            if name.startswith('_'):
                options = RuleOptions(filter_out=True, priority=-priority)
            else:
@@ -481,6 +487,11 @@ class Grammar:
        # Convert token-trees to strings/regexps
        transformer = PrepareLiterals() * TokenTreeToPattern()
        for name, (token_tree, priority) in token_defs:
            for t in token_tree.find_data('expansion'):
                if not t.children:
                    raise GrammarError("Tokens cannot be empty (%s)" % name)

        tokens = [TokenDef(name, transformer.transform(token_tree), priority)
                  for name, (token_tree, priority) in token_defs]
@@ -516,7 +527,7 @@ class Grammar:
            for expansion, alias in expansions:
                if alias and name.startswith('_'):
                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
                    raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))

                rule = Rule(name, expansion, alias, options)
                compiled_rules.append(rule)
@@ -579,7 +590,7 @@ class GrammarLoader:
        rules = [options_from_rule(name, x) for name, x in RULES.items()]
        rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs]

        callback = ParseTreeBuilder(rules, T).create_callback()
        callback = ParseTreeBuilder(rules, ST).create_callback()
        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])

        parser_conf = ParserConf(rules, callback, 'start')
@@ -595,14 +606,22 @@ class GrammarLoader:
        except UnexpectedInput as e:
            raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name))
        except UnexpectedToken as e:
            if e.expected == ['_COLON']:
                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
            elif e.expected == ['RULE']:
                raise GrammarError("Missing alias at line %s column %s" % (e.line, e.column))
            context = e.get_context(grammar_text)
            error = e.match_examples(self.parser.parse, {
                'Unclosed parenthesis': ['a: (\n'],
                'Unmatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'],
                'Expecting rule or token definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'],
                'Alias expects lowercase name': ['a: -> "a"\n'],
                'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'],
                'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'],
                'Expecting option ("|") or a new rule or token definition': ['a:a\n()\n'],
                '%import expects a name': ['%import "a"\n'],
                '%ignore expects a value': ['%ignore %import\n'],
            })
            if error:
                raise GrammarError("%s at line %s column %s\n\n%s" % (error, e.line, e.column, context))
            elif 'STRING' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
            elif e.expected == ['_OR']:
                raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
                raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
            raise

        # Extract grammar items
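A quick sketch of how these example-driven grammar errors surface to a user; the import path for `GrammarError` follows this version's layout, and the message text comes from the label table above:

```python
from lark import Lark
from lark.common import GrammarError

try:
    Lark('a: (\n')   # one of the 'Unclosed parenthesis' examples listed above
except GrammarError as e:
    print(e)   # "Unclosed parenthesis at line ... column ..." plus the get_context() snippet
```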
@@ -57,6 +57,19 @@ class ChildFilter:
        self.node_builder = node_builder
        self.to_include = to_include

    def __call__(self, children):
        filtered = []
        for i, to_expand in self.to_include:
            if to_expand:
                filtered += children[i].children
            else:
                filtered.append(children[i])
        return self.node_builder(filtered)

class ChildFilterLALR(ChildFilter):
    "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"

    def __call__(self, children):
        filtered = []
        for i, to_expand in self.to_include:
@@ -73,21 +86,22 @@ class ChildFilter:
def _should_expand(sym):
    return not is_terminal(sym) and sym.startswith('_')

def maybe_create_child_filter(expansion, filter_out):
def maybe_create_child_filter(expansion, filter_out, ambiguous):
    to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out]

    if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
        return partial(ChildFilter, to_include)
        return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include)

class Callback(object):
    pass

class ParseTreeBuilder:
    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False):
    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False):
        self.tree_class = tree_class
        self.propagate_positions = propagate_positions
        self.always_keep_all_tokens = keep_all_tokens
        self.ambiguous = ambiguous

        self.rule_builders = list(self._init_builders(rules))
@@ -107,7 +121,7 @@ class ParseTreeBuilder:
            wrapper_chain = filter(None, [
                create_token and partial(CreateToken, create_token),
                (expand_single_child and not rule.alias) and ExpandSingleChild,
                maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out),
                maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out, self.ambiguous),
                self.propagate_positions and PropagatePositions,
            ])
@@ -15,9 +15,9 @@ class WithLexer:
    def init_contextual_lexer(self, lexer_conf, parser_conf):
        self.lexer_conf = lexer_conf
        d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()}
        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
        always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks)
        self.lexer = ContextualLexer(lexer_conf.tokens, states, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks)

    def lex(self, text):
        stream = self.lexer.lex(text)
@@ -145,16 +145,16 @@ class Column:
class Parser:
    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None):
        self.analysis = GrammarAnalyzer(parser_conf)
        analysis = GrammarAnalyzer(parser_conf)
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity

        self.FIRST = self.analysis.FIRST
        self.FIRST = analysis.FIRST
        self.postprocess = {}
        self.predictions = {}
        for rule in parser_conf.rules:
            self.postprocess[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
            self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
            self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

        self.term_matcher = term_matcher
@@ -5,6 +5,8 @@ from ..grammar import Rule

class RulePtr(object):
    __slots__ = ('rule', 'index')

    def __init__(self, rule, index):
        assert isinstance(rule, Rule)
        assert index <= len(rule.expansion)
@@ -134,7 +136,8 @@ class GrammarAnalyzer(object):
                    if not is_terminal(new_r):
                        yield new_r

        _ = list(bfs([rule], _expand_rule))
        for _ in bfs([rule], _expand_rule):
            pass

        return fzset(init_ptrs)
@@ -2,7 +2,6 @@
"""
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com

from ..common import UnexpectedToken
from .lalr_analysis import LALR_Analyzer, Shift
@@ -11,11 +10,12 @@ class Parser:
    def __init__(self, parser_conf):
        assert all(r.options is None or r.options.priority is None
                   for r in parser_conf.rules), "LALR doesn't yet support prioritization"

        self.analysis = analysis = LALR_Analyzer(parser_conf)
        analysis = LALR_Analyzer(parser_conf)
        analysis.compute_lookahead()
        callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
                     for rule in parser_conf.rules}

        self._parse_table = analysis.parse_table
        self.parser_conf = parser_conf
        self.parser = _Parser(analysis.parse_table, callbacks)
        self.parse = self.parser.parse
@@ -46,8 +46,7 @@ class _Parser:
                return states[state][key]
            except KeyError:
                expected = states[state].keys()
                raise UnexpectedToken(token, expected, seq, i)
                raise UnexpectedToken(token, expected, seq, i, state=state)

        def reduce(rule):
            size = len(rule.expansion)
@@ -9,11 +9,7 @@ from ..tree import Tree, Visitor_NoRecurse
# Author: Erez Sh

def _compare_rules(rule1, rule2):
    c = -compare( len(rule1.expansion), len(rule2.expansion))
    if rule1.origin.startswith('__'):   # XXX hack! We should set priority in parser, not here
        c = -c
    return c
    return -compare( len(rule1.expansion), len(rule2.expansion))

def _sum_priority(tree):
    p = 0
@@ -126,7 +126,7 @@ def _get_token_type(token_type):
class ParserAtoms:
    def __init__(self, parser):
        self.parse_table = parser.analysis.parse_table
        self.parse_table = parser._parse_table

    def print_python(self):
        print('class ParseTable: pass')
@@ -99,6 +99,8 @@ class Tree(object):
        self.data = data
        self.children = children

class SlottedTree(Tree):
    __slots__ = 'data', 'children', 'rule'

###{standalone
@@ -172,6 +174,30 @@ class Visitor_NoRecurse(Visitor):
        return tree

from functools import wraps

def visit_children_decor(func):
    @wraps(func)
    def inner(cls, tree):
        values = cls.visit_children(tree)
        return func(cls, values)
    return inner

class Interpreter(object):
    def visit(self, tree):
        return getattr(self, tree.data)(tree)

    def visit_children(self, tree):
        return [self.visit(child) if isinstance(child, Tree) else child
                for child in tree.children]

    def __getattr__(self, name):
        return self.__default__

    def __default__(self, tree):
        return self.visit_children(tree)

class Transformer_NoRecurse(Transformer):
    def transform(self, tree):
        subtrees = list(tree.iter_subtrees())
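A minimal sketch of the new `Interpreter` and `visit_children_decor` pair; it mirrors the `test_interp` test added to `tests/test_trees.py` at the bottom of this patch:

```python
from lark.tree import Tree, Interpreter, visit_children_decor

class Demo(Interpreter):
    @visit_children_decor
    def a(self, values):        # receives the already-visited children
        return values + ['e']

    def b(self, tree):
        return 'B'

tree = Tree('a', [Tree('b', []), 'd'])
print(Demo().visit(tree))       # -> ['B', 'd', 'e']
```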
@@ -187,17 +187,22 @@ def _make_full_earley_test(LEXER):
            l.parse(program)

        def test_earley_scanless3(self):
            "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
        # XXX Fails for scanless mode
        # XXX Decided not to fix, because
        #     a) It's a subtle bug
        #     b) Scanless is intended for deprecation
        #
        # def test_earley_scanless3(self):
        #     "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"

            grammar = """
            start: A A
            A: "a"+
            """
        #     grammar = """
        #     start: A A
        #     A: "a"+
        #     """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aa', 'a'])
        #     l = Lark(grammar, parser='earley', lexer=LEXER)
        #     res = l.parse("aaa")
        #     self.assertEqual(res.children, ['aa', 'a'])

        def test_earley_scanless4(self):
            grammar = """
@@ -293,6 +298,39 @@ def _make_full_earley_test(LEXER):
            self.assertEqual(res, expected)

        def test_explicit_ambiguity2(self):
            grammar = r"""
            start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """cat"""

            parser = Lark(grammar, start='start', ambiguity='explicit')
            tree = parser.parse(text)
            self.assertEqual(tree.data, '_ambig')

            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
            self.assertEqual(combinations, {
                ('cat',),
                ('ca', 't'),
                ('c', 'at'),
                ('c', 'a', 't')
            })

        def test_term_ambig_resolve(self):
            grammar = r"""
            !start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """foo bar"""

            parser = Lark(grammar)
            tree = parser.parse(text)
            self.assertEqual(tree.children, ['foo', 'bar'])
@@ -822,6 +860,12 @@ def _make_parser_test(LEXER, PARSER):
            """
            self.assertRaises( GrammarError, _Lark, g)

        def test_alias_in_terminal(self):
            g = """start: TERM
            TERM: "a" -> alias
            """
            self.assertRaises( GrammarError, _Lark, g)

        @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it")   # TODO
        def test_line_and_column(self):
            g = r"""!start: "A" bc "D"
@@ -1129,6 +1173,18 @@ def _make_parser_test(LEXER, PARSER):
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

        @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now")   # TODO XXX
        def test_priority_vs_embedded(self):
            g = """
            A.2: "a"
            WORD: ("a".."z")+

            start: (A | WORD)+
            """
            l = _Lark(g)
            t = l.parse('abc')
            self.assertEqual(t.children, ['a', 'bc'])
            self.assertEqual(t.children[0].type, 'A')
@@ -5,7 +5,7 @@ from unittest import TestCase
import copy
import pickle

from lark.tree import Tree
from lark.tree import Tree, Interpreter, visit_children_decor

class TestTrees(TestCase):
@@ -21,6 +21,45 @@ class TestTrees(TestCase):
        assert pickle.loads(data) == s

    def test_interp(self):
        t = Tree('a', [Tree('b', []), Tree('c', []), 'd'])

        class Interp1(Interpreter):
            def a(self, tree):
                return self.visit_children(tree) + ['e']

            def b(self, tree):
                return 'B'

            def c(self, tree):
                return 'C'

        self.assertEqual(Interp1().visit(t), list('BCde'))

        class Interp2(Interpreter):
            @visit_children_decor
            def a(self, values):
                return values + ['e']

            def b(self, tree):
                return 'B'

            def c(self, tree):
                return 'C'

        self.assertEqual(Interp2().visit(t), list('BCde'))

        class Interp3(Interpreter):
            def b(self, tree):
                return 'B'

            def c(self, tree):
                return 'C'

        self.assertEqual(Interp3().visit(t), list('BCd'))

if __name__ == '__main__':
    unittest.main()