- Merging updated upstream into the branch for the file-extension changes.
- Will push so the Pull Request has no remaining conflicts.
- Also will change the file type of the Lark example grammar.
@@ -165,3 +165,5 @@ If you're interested in taking one of these on, let me know and I will provide m
 If you have any questions or want my assistance, you can email me at erezshin at gmail com.
 I'm also available for contract work.
+
+ -- [Erez](https://github.com/erezsh)
@@ -7,9 +7,11 @@
 - [indented\_tree.py](indented\_tree.py) - A demonstration of parsing indentation ("whitespace significant" language)
 - [fruitflies.py](fruitflies.py) - A demonstration of ambiguity
 - [turtle\_dsl.py](turtle_dsl.py) - Implements a LOGO-like toy language for Python's turtle, with interpreter.
+- [lark\_grammar.py](lark_grammar.py) + [lark.g](lark.g) - A reference implementation of the Lark grammar (using LALR(1) + standard lexer)
 
 ### Advanced
 
+- [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser
 - [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!)
 - [conf.py](conf.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language
 - [reconstruct\_json.py](reconstruct_json.py) - Demonstrates the experimental text-reconstruction feature
@@ -0,0 +1,81 @@
+#
+# This demonstrates example-driven error reporting with the LALR parser
+#
+
+from lark import Lark, UnexpectedToken
+
+from .json_parser import json_grammar   # Using the grammar from the json_parser example
+
+json_parser = Lark(json_grammar, parser='lalr')
+
+class JsonSyntaxError(SyntaxError):
+    def __str__(self):
+        context, line, column = self.args
+        return '%s at line %s, column %s.\n\n%s' % (self.label, line, column, context)
+
+class JsonMissingValue(JsonSyntaxError):
+    label = 'Missing Value'
+
+class JsonMissingOpening(JsonSyntaxError):
+    label = 'Missing Opening'
+
+class JsonMissingClosing(JsonSyntaxError):
+    label = 'Missing Closing'
+
+class JsonMissingComma(JsonSyntaxError):
+    label = 'Missing Comma'
+
+class JsonTrailingComma(JsonSyntaxError):
+    label = 'Trailing Comma'
+
+def parse(json_text):
+    try:
+        j = json_parser.parse(json_text)
+    except UnexpectedToken as ut:
+        exc_class = ut.match_examples(json_parser.parse, {
+            JsonMissingValue: ['{"foo": }'],
+            JsonMissingOpening: ['{"foo": ]}',
+                                 '{"foor": }}'],
+            JsonMissingClosing: ['{"foo": [}',
+                                 '{',
+                                 '{"a": 1',
+                                 '[1'],
+            JsonMissingComma: ['[1 2]',
+                               '[false 1]',
+                               '["b" 1]',
+                               '{"a":true 1:4}',
+                               '{"a":1 1:4}',
+                               '{"a":"b" 1:4}'],
+            JsonTrailingComma: ['[,]',
+                                '[1,]',
+                                '[1,2,]',
+                                '{"foo":1,}',
+                                '{"foo":false,"bar":true,}']
+        })
+        if not exc_class:
+            raise
+        raise exc_class(ut.get_context(json_text), ut.line, ut.column)
+
+def test():
+    try:
+        parse('{"key":')
+    except JsonMissingValue:
+        pass
+
+    try:
+        parse('{"key": "value"')
+    except JsonMissingClosing:
+        pass
+
+    try:
+        parse('{"key": ] ')
+    except JsonMissingOpening:
+        pass
+
+if __name__ == '__main__':
+    test()
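For readers skimming the diff, here is a small sketch of what the example above reports. It is not part of the change; it assumes the `parse` function and the `Json*` exception classes defined in error_reporting_lalr.py are in scope, and the inputs are taken straight from its `match_examples` table.

# Sketch only: exercise the custom error classes defined above.
for text, expected in [('{"foo": }', JsonMissingValue),
                       ('[1 2]',     JsonMissingComma),
                       ('[1,2,]',    JsonTrailingComma)]:
    try:
        parse(text)
    except expected as e:
        # JsonSyntaxError.__str__ formats '<label> at line <l>, column <c>.' plus the context
        print('%s: %s' % (type(e).__name__, e))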
@@ -0,0 +1,49 @@
+start: (_item | _NL)*
+
+_item: rule
+     | token
+     | statement
+
+rule: RULE priority? ":" expansions _NL
+token: TOKEN priority? ":" expansions _NL
+
+priority: "." NUMBER
+
+statement: "%ignore" expansions _NL -> ignore
+         | "%import" import_args ["->" TOKEN] _NL -> import
+
+import_args: name ("." name)*
+
+?expansions: alias (_VBAR alias)*
+
+?alias: expansion ["->" RULE]
+
+?expansion: expr*
+
+?expr: atom [OP | "~" NUMBER [".." NUMBER]]
+
+?atom: "(" expansions ")"
+     | "[" expansions "]" -> maybe
+     | STRING ".." STRING -> literal_range
+     | name
+     | (REGEXP | STRING) -> literal
+
+name: RULE
+    | TOKEN
+
+_VBAR: _NL? "|"
+OP: /[+*][?]?|[?](?![a-z])/
+RULE: /!?[_?]?[a-z][_a-z0-9]*/
+TOKEN: /_?[A-Z][_A-Z0-9]*/
+STRING: _STRING "i"?
+REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/\n])*?\/[imslux]*/
+_NL: /(\r?\n)+\s*/
+
+%import common.ESCAPED_STRING -> _STRING
+%import common.INT -> NUMBER
+%import common.WS_INLINE
+
+COMMENT: "//" /[^\n]/*
+
+%ignore WS_INLINE
+%ignore COMMENT
@@ -0,0 +1,18 @@
+from lark import Lark
+
+parser = Lark(open('examples/lark.g'), parser="lalr")
+
+grammar_files = [
+    'examples/python2.g',
+    'examples/python3.g',
+    'examples/lark.g',
+    'lark/grammars/common.g',
+]
+
+def test():
+    for grammar_file in grammar_files:
+        tree = parser.parse(open(grammar_file).read())
+    print("All grammars parsed successfully")
+
+if __name__ == '__main__':
+    test()
@@ -4,4 +4,4 @@ from .lexer import UnexpectedInput, LexError
 from .lark import Lark
 from .utils import inline_args
 
-__version__ = "0.5.5"
+__version__ = "0.5.6"
@@ -1,7 +1,7 @@
 import re
 import sys
 
-from .utils import get_regexp_width
+from .utils import get_regexp_width, STRING_TYPE
 
 Py36 = (sys.version_info[:2] >= (3, 6))
@@ -17,12 +17,13 @@ class ParseError(Exception):
     pass
 
 class UnexpectedToken(ParseError):
-    def __init__(self, token, expected, seq, index, considered_rules=None):
+    def __init__(self, token, expected, seq, index, considered_rules=None, state=None):
         self.token = token
         self.expected = expected
        self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
         self.considered_rules = considered_rules
+        self.state = state
 
         try:
             context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
@@ -36,7 +37,36 @@ class UnexpectedToken(ParseError):
 
         super(UnexpectedToken, self).__init__(message)
 
+    def match_examples(self, parse_fn, examples):
+        """ Given a parser instance and a dictionary mapping labels to lists of
+            malformed syntax examples, return the label whose example best
+            matches the current error.
+        """
+        assert self.state, "Not supported for this exception"
+
+        candidate = None
+        for label, example in examples.items():
+            assert not isinstance(example, STRING_TYPE)
+
+            for malformed in example:
+                try:
+                    parse_fn(malformed)
+                except UnexpectedToken as ut:
+                    if ut.state == self.state:
+                        if ut.token == self.token:  # Try exact match first
+                            return label
+                        elif not candidate:
+                            candidate = label
+
+        return candidate
+
+    def get_context(self, text, span=10):
+        pos = self.token.pos_in_stream
+        start = max(pos - span, 0)
+        end = pos + span
+        before = text[start:pos].rsplit('\n', 1)[-1]
+        after = text[pos:end].split('\n', 1)[0]
+        return before + after + '\n' + ' ' * len(before) + '^\n'
+
 ###}
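As a rough illustration of the two methods added here (a sketch, not part of the diff, reusing the `json_parser` built in the error_reporting_lalr.py example above): `get_context()` returns the text around the failing position with a caret underneath, while `match_examples()` re-parses each malformed example and compares the resulting parser state against the current error.

# Sketch: json_parser is the LALR JSON parser from the example above.
text = '{"foo": ]}'
try:
    json_parser.parse(text)
except UnexpectedToken as e:
    print(e.get_context(text))
    # {"foo": ]}
    #         ^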
@@ -20,6 +20,7 @@ SIGNED_NUMBER: ["+"|"-"] NUMBER
 //
 // Strings
 //
+//STRING: /"(\\\"|\\\\|[^"\n])*?"i?/
 STRING_INNER: ("\\\""|/[^"]/)
 ESCAPED_STRING: "\"" STRING_INNER* "\""
@@ -172,7 +172,7 @@ class Lark:
     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
 
-        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
+        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr')
         callback = self._parse_tree_builder.create_callback(self.options.transformer)
         if self.profiler:
             for f in dir(callback):
@@ -25,6 +25,8 @@ class UnexpectedInput(LexError):
         self.considered_rules = considered_rules
 
 class Token(Str):
+    __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')
+
     def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
         self = super(Token, cls).__new__(cls, value)
         self.type = type_
@@ -39,7 +41,7 @@ class Token(Str):
         return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)
 
     def __reduce__(self):
-        return (self.__class__, (self.type, self.pos_in_stream, self.value, self.line, self.column, ))
+        return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))
 
     def __repr__(self):
         return 'Token(%s, %r)' % (self.type, self.value)
@@ -141,6 +143,8 @@ def _create_unless(tokens):
     for retok in tokens_by_type.get(PatternRE, []):
         unless = [] # {}
         for strtok in tokens_by_type.get(PatternStr, []):
+            if strtok.priority > retok.priority:
+                continue
             s = strtok.pattern.value
             m = re.match(retok.pattern.to_regexp(), s)
             if m and m.group(0) == s:
@@ -14,7 +14,7 @@ from .parsers.lalr_parser import UnexpectedToken
 from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
 from .grammar import RuleOptions, Rule
 
-from .tree import Tree as T, Transformer, InlineTransformer, Visitor
+from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST
 
 __path__ = os.path.dirname(__file__)
 IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
@@ -122,7 +122,7 @@ RULES = {
     'statement': ['ignore', 'import'],
     'ignore': ['_IGNORE expansions _NL'],
     'import': ['_IMPORT import_args _NL',
-               '_IMPORT import_args _TO TOKEN'],
+               '_IMPORT import_args _TO TOKEN _NL'],
     'import_args': ['_import_args'],
     '_import_args': ['name', '_import_args _DOT name'],
@@ -145,14 +145,14 @@ class EBNF_to_BNF(InlineTransformer):
         new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
         self.i += 1
         t = Token('RULE', new_name, -1)
-        tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
+        tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
         self.new_rules.append((new_name, tree, self.rule_options))
         self.rules_by_expr[expr] = t
         return t
 
     def expr(self, rule, op, *args):
         if op.value == '?':
-            return T('expansions', [rule, T('expansion', [])])
+            return ST('expansions', [rule, ST('expansion', [])])
         elif op.value == '+':
             # a : b c+ d
             #   -->
@@ -165,7 +165,7 @@ class EBNF_to_BNF(InlineTransformer):
             # a : b _c? d
             #   _c : _c c | c;
             new_name = self._add_recurse_rule('star', rule)
-            return T('expansions', [new_name, T('expansion', [])])
+            return ST('expansions', [new_name, ST('expansion', [])])
         elif op.value == '~':
             if len(args) == 1:
                 mn = mx = int(args[0])
@@ -173,7 +173,7 @@ class EBNF_to_BNF(InlineTransformer):
                 mn, mx = map(int, args)
                 if mx < mn:
                     raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
-            return T('expansions', [T('expansion', [rule] * n) for n in range(mn, mx+1)])
+            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
         assert False, op
@@ -183,7 +183,7 @@ class SimplifyRule_Visitor(Visitor):
     def _flatten(tree):
         while True:
             to_expand = [i for i, child in enumerate(tree.children)
-                         if isinstance(child, T) and child.data == tree.data]
+                         if isinstance(child, Tree) and child.data == tree.data]
             if not to_expand:
                 break
             tree.expand_kids_by_index(*to_expand)
@@ -203,9 +203,9 @@ class SimplifyRule_Visitor(Visitor):
         self._flatten(tree)
 
         for i, child in enumerate(tree.children):
-            if isinstance(child, T) and child.data == 'expansions':
+            if isinstance(child, Tree) and child.data == 'expansions':
                 tree.data = 'expansions'
-                tree.children = [self.visit(T('expansion', [option if i==j else other
+                tree.children = [self.visit(ST('expansion', [option if i==j else other
                                                             for j, other in enumerate(tree.children)]))
                                  for option in set(child.children)]
                 break
@@ -217,7 +217,7 @@ class SimplifyRule_Visitor(Visitor):
         if rule.data == 'expansions':
             aliases = []
             for child in tree.children[0].children:
-                aliases.append(T('alias', [child, alias_name]))
+                aliases.append(ST('alias', [child, alias_name]))
             tree.data = 'expansions'
             tree.children = aliases
@@ -239,7 +239,7 @@ class RuleTreeToText(Transformer):
 class CanonizeTree(InlineTransformer):
     def maybe(self, expr):
-        return T('expr', [expr, Token('OP', '?', -1)])
+        return ST('expr', [expr, Token('OP', '?', -1)])
 
     def tokenmods(self, *args):
         if len(args) == 1:
@@ -353,7 +353,7 @@ def _literal_to_pattern(literal):
 class PrepareLiterals(InlineTransformer):
     def literal(self, literal):
-        return T('pattern', [_literal_to_pattern(literal)])
+        return ST('pattern', [_literal_to_pattern(literal)])
 
     def range(self, start, end):
         assert start.type == end.type == 'STRING'
@@ -361,13 +361,13 @@ class PrepareLiterals(InlineTransformer):
         end = end.value[1:-1]
         assert len(start) == len(end) == 1, (start, end, len(start), len(end))
         regexp = '[%s-%s]' % (start, end)
-        return T('pattern', [PatternRE(regexp)])
+        return ST('pattern', [PatternRE(regexp)])
 
 class SplitLiterals(InlineTransformer):
     def pattern(self, p):
         if isinstance(p, PatternStr) and len(p.value)>1:
-            return T('expansion', [T('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value])
-        return T('pattern', [p])
+            return ST('expansion', [ST('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value])
+        return ST('pattern', [p])
 
 class TokenTreeToPattern(Transformer):
     def pattern(self, ps):
@@ -375,6 +375,7 @@ class TokenTreeToPattern(Transformer):
         return p
 
     def expansion(self, items):
+        assert items
         if len(items) == 1:
             return items[0]
         if len({i.flags for i in items}) > 1:
@@ -402,18 +403,20 @@ class TokenTreeToPattern(Transformer):
         assert len(args) == 2
         return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)
 
+    def alias(self, t):
+        raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")
+
 def _interleave(l, item):
     for e in l:
         yield e
-        if isinstance(e, T):
+        if isinstance(e, Tree):
             if e.data in ('literal', 'range'):
                 yield item
         elif is_terminal(e):
             yield item
 
 def _choice_of_rules(rules):
-    return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])
+    return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])
 
 class Grammar:
     def __init__(self, rule_defs, token_defs, ignore):
@@ -440,9 +443,9 @@ class Grammar:
                     if r == start:
                         exp.children = [expr] + exp.children
                 for exp in tree.find_data('expr'):
-                    exp.children[0] = T('expansion', list(_interleave(exp.children[:1], expr)))
+                    exp.children[0] = ST('expansion', list(_interleave(exp.children[:1], expr)))
 
-            _ignore_tree = T('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')])
+            _ignore_tree = ST('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')])
             rule_defs.append(('__ignore', _ignore_tree, None))
 
         # Convert all tokens to rules
@@ -455,6 +458,9 @@ class Grammar:
                     exp.children[i] = Token(sym.type, new_terminal_names[sym])
 
         for name, (tree, priority) in term_defs: # TODO transfer priority to rule?
+            if any(tree.find_data('alias')):
+                raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")
+
             if name.startswith('_'):
                 options = RuleOptions(filter_out=True, priority=-priority)
             else:
@@ -481,6 +487,11 @@ class Grammar:
         # Convert token-trees to strings/regexps
         transformer = PrepareLiterals() * TokenTreeToPattern()
+        for name, (token_tree, priority) in token_defs:
+            for t in token_tree.find_data('expansion'):
+                if not t.children:
+                    raise GrammarError("Tokens cannot be empty (%s)" % name)
+
         tokens = [TokenDef(name, transformer.transform(token_tree), priority)
                   for name, (token_tree, priority) in token_defs]
@@ -516,7 +527,7 @@ class Grammar:
             for expansion, alias in expansions:
                 if alias and name.startswith('_'):
-                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
+                    raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
 
                 rule = Rule(name, expansion, alias, options)
                 compiled_rules.append(rule)
@@ -579,7 +590,7 @@ class GrammarLoader:
         rules = [options_from_rule(name, x) for name, x in RULES.items()]
         rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs]
 
-        callback = ParseTreeBuilder(rules, T).create_callback()
+        callback = ParseTreeBuilder(rules, ST).create_callback()
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
 
         parser_conf = ParserConf(rules, callback, 'start')
@@ -595,14 +606,22 @@ class GrammarLoader:
         except UnexpectedInput as e:
             raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name))
         except UnexpectedToken as e:
-            if e.expected == ['_COLON']:
-                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
-            elif e.expected == ['RULE']:
-                raise GrammarError("Missing alias at line %s column %s" % (e.line, e.column))
+            context = e.get_context(grammar_text)
+            error = e.match_examples(self.parser.parse, {
+                'Unclosed parenthesis': ['a: (\n'],
+                'Unmatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'],
+                'Expecting rule or token definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'],
+                'Alias expects lowercase name': ['a: -> "a"\n'],
+                'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'],
+                'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'],
+                'Expecting option ("|") or a new rule or token definition': ['a:a\n()\n'],
+                '%import expects a name': ['%import "a"\n'],
+                '%ignore expects a value': ['%ignore %import\n'],
+            })
+            if error:
+                raise GrammarError("%s at line %s column %s\n\n%s" % (error, e.line, e.column, context))
             elif 'STRING' in e.expected:
-                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
-            elif e.expected == ['_OR']:
-                raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
+                raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
             raise
 
         # Extract grammar items
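The user-visible effect of this hunk is that a malformed grammar now produces a targeted message chosen by `match_examples`, together with the offending context. A minimal sketch of the new behaviour (not part of the diff; the exact wording comes from the examples table above, and the precise line/column in the message will vary):

from lark import Lark
from lark.common import GrammarError

try:
    Lark("a\n")       # a rule with no colon
except GrammarError as e:
    print(e)          # e.g. 'Expecting rule or token definition (missing colon) at line ...'
                      # followed by the grammar context with a caret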
@@ -57,6 +57,19 @@ class ChildFilter:
         self.node_builder = node_builder
         self.to_include = to_include
 
+    def __call__(self, children):
+        filtered = []
+        for i, to_expand in self.to_include:
+            if to_expand:
+                filtered += children[i].children
+            else:
+                filtered.append(children[i])
+        return self.node_builder(filtered)
+
+
+class ChildFilterLALR(ChildFilter):
+    "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"
+
     def __call__(self, children):
         filtered = []
         for i, to_expand in self.to_include:
@@ -73,21 +86,22 @@ class ChildFilter:
 def _should_expand(sym):
     return not is_terminal(sym) and sym.startswith('_')
 
-def maybe_create_child_filter(expansion, filter_out):
+def maybe_create_child_filter(expansion, filter_out, ambiguous):
     to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out]
 
     if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
-        return partial(ChildFilter, to_include)
+        return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include)
 
 class Callback(object):
     pass
 
 class ParseTreeBuilder:
-    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False):
+    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False):
         self.tree_class = tree_class
         self.propagate_positions = propagate_positions
         self.always_keep_all_tokens = keep_all_tokens
+        self.ambiguous = ambiguous
 
         self.rule_builders = list(self._init_builders(rules))
@@ -107,7 +121,7 @@ class ParseTreeBuilder:
             wrapper_chain = filter(None, [
                 create_token and partial(CreateToken, create_token),
                 (expand_single_child and not rule.alias) and ExpandSingleChild,
-                maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out),
+                maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out, self.ambiguous),
                 self.propagate_positions and PropagatePositions,
             ])
@@ -15,9 +15,9 @@ class WithLexer:
     def init_contextual_lexer(self, lexer_conf, parser_conf):
         self.lexer_conf = lexer_conf
-        d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()}
+        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
         always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
-        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks)
+        self.lexer = ContextualLexer(lexer_conf.tokens, states, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks)
 
     def lex(self, text):
         stream = self.lexer.lex(text)
@@ -145,16 +145,16 @@ class Column:
 class Parser:
     def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None):
-        self.analysis = GrammarAnalyzer(parser_conf)
+        analysis = GrammarAnalyzer(parser_conf)
         self.parser_conf = parser_conf
         self.resolve_ambiguity = resolve_ambiguity
 
-        self.FIRST = self.analysis.FIRST
+        self.FIRST = analysis.FIRST
 
         self.postprocess = {}
         self.predictions = {}
         for rule in parser_conf.rules:
             self.postprocess[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
-            self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
+            self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]
 
         self.term_matcher = term_matcher
@@ -5,6 +5,8 @@ from ..grammar import Rule
 
 class RulePtr(object):
+    __slots__ = ('rule', 'index')
+
     def __init__(self, rule, index):
         assert isinstance(rule, Rule)
         assert index <= len(rule.expansion)
@@ -134,7 +136,8 @@ class GrammarAnalyzer(object):
                     if not is_terminal(new_r):
                         yield new_r
 
-        _ = list(bfs([rule], _expand_rule))
+        for _ in bfs([rule], _expand_rule):
+            pass
 
         return fzset(init_ptrs)
@@ -2,7 +2,6 @@
 """
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
 
 from ..common import UnexpectedToken
 from .lalr_analysis import LALR_Analyzer, Shift
@@ -11,11 +10,12 @@ class Parser:
     def __init__(self, parser_conf):
         assert all(r.options is None or r.options.priority is None
                    for r in parser_conf.rules), "LALR doesn't yet support prioritization"
-        self.analysis = analysis = LALR_Analyzer(parser_conf)
+        analysis = LALR_Analyzer(parser_conf)
         analysis.compute_lookahead()
         callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
                      for rule in parser_conf.rules}
 
+        self._parse_table = analysis.parse_table
         self.parser_conf = parser_conf
         self.parser = _Parser(analysis.parse_table, callbacks)
         self.parse = self.parser.parse
@@ -46,8 +46,7 @@ class _Parser:
                 return states[state][key]
             except KeyError:
                 expected = states[state].keys()
-                raise UnexpectedToken(token, expected, seq, i)
+                raise UnexpectedToken(token, expected, seq, i, state=state)
 
         def reduce(rule):
             size = len(rule.expansion)
@@ -9,11 +9,7 @@ from ..tree import Tree, Visitor_NoRecurse
 # Author: Erez Sh
 
 def _compare_rules(rule1, rule2):
-    c = -compare( len(rule1.expansion), len(rule2.expansion))
-    if rule1.origin.startswith('__'):   # XXX hack! We should set priority in parser, not here
-        c = -c
-    return c
+    return -compare( len(rule1.expansion), len(rule2.expansion))
 
 def _sum_priority(tree):
     p = 0
@@ -126,7 +126,7 @@ def _get_token_type(token_type):
 class ParserAtoms:
     def __init__(self, parser):
-        self.parse_table = parser.analysis.parse_table
+        self.parse_table = parser._parse_table
 
     def print_python(self):
         print('class ParseTable: pass')
@@ -99,6 +99,8 @@ class Tree(object):
         self.data = data
         self.children = children
 
+class SlottedTree(Tree):
+    __slots__ = 'data', 'children', 'rule'
 
 ###{standalone
@@ -172,6 +174,30 @@ class Visitor_NoRecurse(Visitor):
 
         return tree
 
+from functools import wraps
+def visit_children_decor(func):
+    @wraps(func)
+    def inner(cls, tree):
+        values = cls.visit_children(tree)
+        return func(cls, values)
+    return inner
+
+class Interpreter(object):
+
+    def visit(self, tree):
+        return getattr(self, tree.data)(tree)
+
+    def visit_children(self, tree):
+        return [self.visit(child) if isinstance(child, Tree) else child
+                for child in tree.children]
+
+    def __getattr__(self, name):
+        return self.__default__
+
+    def __default__(self, tree):
+        return self.visit_children(tree)
+
+
 class Transformer_NoRecurse(Transformer):
     def transform(self, tree):
         subtrees = list(tree.iter_subtrees())
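The new `Interpreter` is a top-down visitor: `visit` dispatches on `tree.data`, and children are only visited when the handler asks for them via `visit_children` (or the `visit_children_decor` wrapper). A small sketch of how it might be used (not part of the diff; the toy grammar is made up for illustration, and `NUMBER`/`WS_INLINE` are assumed to come from the bundled common.g):

from lark import Lark
from lark.tree import Interpreter

calc = Lark("""
    start: NUMBER "+" NUMBER
    %import common.NUMBER
    %import common.WS_INLINE
    %ignore WS_INLINE
""", parser='lalr')

class AddUp(Interpreter):
    def start(self, tree):
        # Children of `start` are Tokens here; sum the NUMBER tokens.
        return sum(int(tok) for tok in tree.children if tok.type == 'NUMBER')

print(AddUp().visit(calc.parse("1 + 2")))   # -> 3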
@@ -187,17 +187,22 @@ def _make_full_earley_test(LEXER):
 
         l.parse(program)
 
-    def test_earley_scanless3(self):
-        "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
+    # XXX Fails for scanless mode
+    # XXX Decided not to fix, because
+    #     a) It's a subtle bug
+    #     b) Scanless is intended for deprecation
+    #
+    # def test_earley_scanless3(self):
+    #     "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
 
-        grammar = """
-        start: A A
-        A: "a"+
-        """
+    #     grammar = """
+    #     start: A A
+    #     A: "a"+
+    #     """
 
-        l = Lark(grammar, parser='earley', lexer=LEXER)
-        res = l.parse("aaa")
-        self.assertEqual(res.children, ['aa', 'a'])
+    #     l = Lark(grammar, parser='earley', lexer=LEXER)
+    #     res = l.parse("aaa")
+    #     self.assertEqual(res.children, ['aa', 'a'])
 
     def test_earley_scanless4(self):
         grammar = """
@@ -293,6 +298,39 @@ def _make_full_earley_test(LEXER):
 
         self.assertEqual(res, expected)
 
+    def test_explicit_ambiguity2(self):
+        grammar = r"""
+        start: NAME+
+        NAME: /\w+/
+        %ignore " "
+        """
+        text = """cat"""
+
+        parser = Lark(grammar, start='start', ambiguity='explicit')
+        tree = parser.parse(text)
+        self.assertEqual(tree.data, '_ambig')
+
+        combinations = {tuple(str(s) for s in t.children) for t in tree.children}
+        self.assertEqual(combinations, {
+            ('cat',),
+            ('ca', 't'),
+            ('c', 'at'),
+            ('c', 'a' ,'t')
+        })
+
+    def test_term_ambig_resolve(self):
+        grammar = r"""
+        !start: NAME+
+        NAME: /\w+/
+        %ignore " "
+        """
+        text = """foo bar"""
+
+        parser = Lark(grammar)
+        tree = parser.parse(text)
+        self.assertEqual(tree.children, ['foo', 'bar'])
@@ -822,6 +860,12 @@ def _make_parser_test(LEXER, PARSER):
             """
             self.assertRaises( GrammarError, _Lark, g)
 
+        def test_alias_in_terminal(self):
+            g = """start: TERM
+                   TERM: "a" -> alias
+                   """
+            self.assertRaises( GrammarError, _Lark, g)
+
         @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO
         def test_line_and_column(self):
             g = r"""!start: "A" bc "D"
@@ -1129,6 +1173,18 @@ def _make_parser_test(LEXER, PARSER):
             self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
             self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
 
+        @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
+        def test_priority_vs_embedded(self):
+            g = """
+            A.2: "a"
+            WORD: ("a".."z")+
+
+            start: (A | WORD)+
+            """
+            l = _Lark(g)
+            t = l.parse('abc')
+            self.assertEqual(t.children, ['a', 'bc'])
+            self.assertEqual(t.children[0].type, 'A')
@@ -5,7 +5,7 @@ from unittest import TestCase
 import copy
 import pickle
 
-from lark.tree import Tree
+from lark.tree import Tree, Interpreter, visit_children_decor
 
 class TestTrees(TestCase):
@@ -21,6 +21,45 @@ class TestTrees(TestCase):
 
         assert pickle.loads(data) == s
 
+    def test_interp(self):
+        t = Tree('a', [Tree('b', []), Tree('c', []), 'd'])
+
+        class Interp1(Interpreter):
+            def a(self, tree):
+                return self.visit_children(tree) + ['e']
+
+            def b(self, tree):
+                return 'B'
+
+            def c(self, tree):
+                return 'C'
+
+        self.assertEqual(Interp1().visit(t), list('BCde'))
+
+        class Interp2(Interpreter):
+            @visit_children_decor
+            def a(self, values):
+                return values + ['e']
+
+            def b(self, tree):
+                return 'B'
+
+            def c(self, tree):
+                return 'C'
+
+        self.assertEqual(Interp2().visit(t), list('BCde'))
+
+        class Interp3(Interpreter):
+            def b(self, tree):
+                return 'B'
+
+            def c(self, tree):
+                return 'C'
+
+        self.assertEqual(Interp3().visit(t), list('BCd'))
+
 if __name__ == '__main__':
     unittest.main()