@@ -125,7 +125,8 @@ Lark has no dependencies.
 ### Projects using Lark

-- [mappyfile](https://github.com/geographika/mappyfile) - A pure Python MapFile parser for working with MapServer
+- [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration
+- [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer

 Using Lark? Send me a message and I'll add your project!
@@ -251,6 +252,22 @@ Lark offers both Earley and LALR(1), which means you can choose between the most
 Lark uses the [MIT license](LICENSE).

+## Contribute
+
+Lark is currently accepting pull-requests.
+
+There are many ways you can help the project:
+
+* Improve the performance of Lark's parsing algorithm
+* Implement macros for grammars (important for grammar composition)
+* Write new grammars for Lark's library
+* Write & improve the documentation
+* Write a blog post introducing Lark to your audience
+
+If you're interested in taking one of these on, let me know and I will provide more details and assist you in the process.
+
 ## Contact
-If you have any questions or want to contribute, you can email me at erezshin at gmail com.
+If you have any questions or want my assistance, you can email me at erezshin at gmail com.
+
+I'm also available for contract work.
@@ -116,11 +116,12 @@ AWAIT: "await"
           | atom_expr "." NAME -> getattr
           | atom

-?atom: "(" [yield_expr|testlist_comp] ")"
-     | "[" [testlist_comp] "]"
-     | "{" [dictorsetmaker] "}"
+?atom: "(" [yield_expr|testlist_comp] ")" -> tuple
+     | "[" [testlist_comp] "]" -> list
+     | "{" [dictorsetmaker] "}" -> dict
      | NAME -> var
-     | number | string+ | "..."
+     | number | string+
+     | "..." -> ellipsis
      | "None" -> const_none
      | "True" -> const_true
      | "False" -> const_false
@@ -3,4 +3,4 @@ from .common import ParseError, GrammarError
 from .lark import Lark
 from .utils import inline_args

-__version__ = "0.2.7"
+__version__ = "0.2.10"
@@ -1,4 +1,5 @@
 import re
+import sre_parse

 class GrammarError(Exception):
     pass
@@ -40,7 +41,7 @@ class LexerConf:
 class ParserConf:
     def __init__(self, rules, callback, start):
-        assert all(len(r)==3 for r in rules)
+        assert all(len(r) == 4 for r in rules)
         self.rules = rules
         self.callback = callback
         self.start = start
@@ -57,9 +58,9 @@ class Pattern(object):
     # Pattern Hashing assumes all subclasses have a different priority!
     def __hash__(self):
-        return hash((self.priority, self.value))
+        return hash((type(self), self.value))
     def __eq__(self, other):
-        return self.priority == other.priority and self.value == other.value
+        return type(self) == type(other) and self.value == other.value

     def _get_flags(self):
         if self.flags:
@@ -71,13 +72,21 @@ class PatternStr(Pattern):
     def to_regexp(self):
         return self._get_flags() + re.escape(self.value)

-    priority = 0
+    @property
+    def min_width(self):
+        return len(self.value)
+    max_width = min_width

 class PatternRE(Pattern):
     def to_regexp(self):
         return self._get_flags() + self.value

-    priority = 1
+    @property
+    def min_width(self):
+        return sre_parse.parse(self.to_regexp()).getwidth()[0]
+    @property
+    def max_width(self):
+        return sre_parse.parse(self.to_regexp()).getwidth()[1]

 class TokenDef(object):
     def __init__(self, name, pattern):
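
The new `min_width`/`max_width` properties replace the old integer `priority` by asking the stdlib's `sre_parse` for a pattern's possible match widths. What `getwidth()` returns, for reference (stdlib behavior, not part of the diff; `sre_parse` is deprecated in recent Pythons):

```python
import sre_parse

# getwidth() reports the (min, max) number of characters a regex can match.
print(sre_parse.parse(r'ab?c').getwidth())   # (2, 3)
print(sre_parse.parse(r'a+').getwidth())     # (1, <large sentinel>) - unbounded
print(sre_parse.parse(r'x*').getwidth())     # min width 0: a zero-width
                                             # pattern, which the lexer
                                             # change below rejects
```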
@@ -39,6 +39,7 @@ class LarkOptions(object):
         postlex - Lexer post-processing (Default: None)
         start - The start symbol (Default: start)
         profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False)
+        propagate_positions - Experimental. Don't use yet.
     """
     __doc__ += OPTIONS_DOC

     def __init__(self, options_dict):
@@ -55,14 +56,13 @@ class LarkOptions(object):
         self.start = o.pop('start', 'start')
         self.profile = o.pop('profile', False)
         self.ambiguity = o.pop('ambiguity', 'auto')
+        self.propagate_positions = o.pop('propagate_positions', False)

         assert self.parser in ('earley', 'lalr', None)

         if self.parser == 'earley' and self.transformer:
             raise ValueError('Cannot specify an auto-transformer when using the Earley algorithm.'
                              'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. lalr)')
-        if self.keep_all_tokens:
-            raise NotImplementedError("keep_all_tokens: Not implemented yet!")

         if o:
             raise ValueError("Unknown options: %s" % o.keys())
@@ -119,7 +119,7 @@ class Lark:
         assert isinstance(grammar, STRING_TYPE)

-        if self.options.cache_grammar or self.options.keep_all_tokens:
+        if self.options.cache_grammar:
             raise NotImplementedError("Not available yet")

         assert not self.options.profile, "Feature temporarily disabled"
@@ -142,8 +142,12 @@ class Lark:
             assert self.options.parser == 'earley'
             assert self.options.ambiguity in ('resolve', 'explicit', 'auto')

+        # Parse the grammar file and compose the grammars (TODO)
         self.grammar = load_grammar(grammar, source)
+
+        # Compile the EBNF grammar into BNF
         tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer), start=self.options.start)
+
         self.ignore_tokens = self.grammar.extra['ignore']
         self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)
@@ -162,7 +166,7 @@ class Lark:
     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
-        self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
+        self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
         rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
         if self.profiler:
             for f in dir(callback):
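
Together, the `lark.py` hunks wire the new experimental `propagate_positions` option (and the now-working `keep_all_tokens`) through to `ParseTreeBuilder`. A hedged sketch of what position propagation gives you, using the attribute names from this diff (`line`, `column`, `end_line`, `end_col`; later versions reorganized these):

```python
from lark import Lark

# With propagate_positions=True, inner tree nodes inherit position info
# from their first and last children (see propagate_positions_wrapper below).
parser = Lark(r'''
    start: pair+
    pair: NAME "=" NAME
    NAME: /\w+/
    WS: /\s+/
    %ignore WS
''', propagate_positions=True)

pair = parser.parse("a=b c=d").children[1]
print(pair.line, pair.column)   # where the second pair starts in the input
```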
@@ -1,7 +1,6 @@
 ## Lexer Implementation

 import re
-import sre_parse

 from .utils import Str, classify
 from .common import is_terminal, PatternStr, PatternRE, TokenDef
@@ -120,8 +119,7 @@ class Lexer(object):
             except:
                 raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

-            width = sre_parse.parse(t.pattern.to_regexp()).getwidth()
-            if width[0] == 0:
+            if t.pattern.min_width == 0:
                 raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))

         token_names = {t.name for t in tokens}
@@ -133,7 +131,7 @@ class Lexer(object):
         self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
         self.ignore_types = [t for t in ignore]

-        tokens.sort(key=lambda x:(x.pattern.priority, len(x.pattern.value)), reverse=True)
+        tokens.sort(key=lambda x:x.pattern.max_width, reverse=True)

         tokens, self.callback = _create_unless(tokens)
         assert all(self.callback.values())
@@ -155,17 +153,27 @@ class Lexer(object):
                 if m:
                     value = m.group(0)
                     type_ = type_from_index[m.lastindex]
-                    if type_ not in ignore_types:
+                    to_yield = type_ not in ignore_types
+                    if to_yield:
                         t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
+                        end_col = t.column + len(value)
                         if t.type in self.callback:
                             t = self.callback[t.type](t)
-                        yield t
+
                     if type_ in newline_types:
                         newlines = value.count(self.newline_char)
                         if newlines:
                             line += newlines
-                            col_start_pos = lex_pos + value.rindex(self.newline_char)
+                            last_newline_index = value.rindex(self.newline_char) + 1
+                            col_start_pos = lex_pos + last_newline_index
+                            end_col = len(value) - last_newline_index
+
+                    if to_yield:
+                        t.end_line = line
+                        t.end_col = end_col
+                        yield t
+
                     lex_pos += len(value)
                     break
             else:
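
The end-position bookkeeping above is the subtle part of this hunk, so here is the same arithmetic in isolation (a standalone sketch, not code from the diff):

```python
# A token's end column extends from its start column, unless the token
# contains newlines - then it is measured from the character just after
# the last newline (matching the lexer logic above).
def token_end_col(value, start_col):
    if '\n' in value:
        last_newline_index = value.rindex('\n') + 1
        return len(value) - last_newline_index
    return start_col + len(value)

assert token_end_col('abc', 4) == 7      # ends 3 columns past its start
assert token_end_col('ab\ncde', 4) == 3  # 3 chars after the last newline
```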
@@ -75,6 +75,7 @@ TOKENS = {
     '_TO': '->',
     '_IGNORE': r'%ignore',
     '_IMPORT': r'%import',
+    'NUMBER': '\d+',
 }

 RULES = {
@@ -82,7 +83,8 @@ RULES = {
     '_list': ['_item', '_list _item'],
     '_item': ['rule', 'token', 'statement', '_NL'],

-    'rule': ['RULE _COLON expansions _NL'],
+    'rule': ['RULE _COLON expansions _NL',
+             'RULE _DOT NUMBER _COLON expansions _NL'],
     'expansions': ['alias',
                    'expansions _OR alias',
                    'expansions _NL _OR alias'],
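
The second `rule` production admits the new `name.NUMBER:` priority syntax. A sketch of a grammar that uses it, mirroring `test_earley_prioritization` at the bottom of this diff (option names as of this version):

```python
from lark import Lark

# b.2 outranks a.1, so the ambiguous input "a" resolves to rule b
# (see _compare_rules in the earley changes below).
grammar = """
start: a | b
a.1: "a"
b.2: "a"
"""
parser = Lark(grammar, parser='earley', lexer='standard')
print(parser.parse("a").children[0].data)   # b
```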
@@ -313,7 +315,7 @@ class PrepareLiterals(InlineTransformer):
 class SplitLiterals(InlineTransformer):
     def pattern(self, p):
         if isinstance(p, PatternStr) and len(p.value)>1:
-            return T('expansion', [T('pattern', [PatternStr(ch)]) for ch in p.value])
+            return T('expansion', [T('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value])
         return T('pattern', [p])

 class TokenTreeToPattern(Transformer):
@@ -470,21 +472,29 @@ class Grammar:
 class RuleOptions:
-    def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False):
+    def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None):
         self.keep_all_tokens = keep_all_tokens
         self.expand1 = expand1
         self.create_token = create_token    # used for scanless postprocessing
+        self.priority = priority

         self.filter_out = filter_out    # remove this rule from the tree
                                         # used for "token"-rules in scanless

     @classmethod
-    def from_rule(cls, name, expansions):
+    def from_rule(cls, name, *x):
+        if len(x) > 1:
+            priority, expansions = x
+            priority = int(priority)
+        else:
+            expansions ,= x
+            priority = None
+
         keep_all_tokens = name.startswith('!')
         name = name.lstrip('!')
         expand1 = name.startswith('?')
         name = name.lstrip('?')

-        return name, expansions, cls(keep_all_tokens, expand1)
+        return name, expansions, cls(keep_all_tokens, expand1, priority=priority)
@@ -605,7 +615,7 @@ class GrammarLoader:
                 raise GrammarError("Token '%s' defined more than once" % name)
             token_names.add(name)

-        rules = [RuleOptions.from_rule(name, x) for name, x in rule_defs]
+        rules = [RuleOptions.from_rule(*x) for x in rule_defs]

         rule_names = set()
         for name, _x, _o in rules:
@@ -1,4 +1,5 @@
 from .common import is_terminal, GrammarError
+from .utils import suppress
 from .lexer import Token

 class Callback(object):
@@ -42,10 +43,32 @@ def create_rule_handler(expansion, usermethod, keep_all_tokens, filter_out):
     # else, if no filtering required..
     return usermethod

+def propagate_positions_wrapper(f):
+    def _f(args):
+        res = f(args)
+
+        if args:
+            for a in args:
+                with suppress(AttributeError):
+                    res.line = a.line
+                    res.column = a.column
+                    break
+
+            for a in reversed(args):
+                with suppress(AttributeError):
+                    res.end_line = a.end_line
+                    res.end_col = a.end_col
+                    break
+
+        return res
+    return _f
+
 class ParseTreeBuilder:
-    def __init__(self, tree_class):
+    def __init__(self, tree_class, propagate_positions=False, keep_all_tokens=False):
         self.tree_class = tree_class
+        self.propagate_positions = propagate_positions
+        self.always_keep_all_tokens = keep_all_tokens

     def _create_tree_builder_function(self, name):
         tree_class = self.tree_class
@@ -66,7 +89,7 @@ class ParseTreeBuilder:
             filter_out.add(origin)

         for origin, (expansions, options) in rules.items():
-            keep_all_tokens = options.keep_all_tokens if options else False
+            keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
             expand1 = options.expand1 if options else False
             create_token = options.create_token if options else False
@@ -92,11 +115,14 @@ class ParseTreeBuilder:
                 alias_handler = create_rule_handler(expansion, f, keep_all_tokens, filter_out)

+                if self.propagate_positions:
+                    alias_handler = propagate_positions_wrapper(alias_handler)
+
                 callback_name = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))
                 if hasattr(callback, callback_name):
                     raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin))
                 setattr(callback, callback_name, alias_handler)

-                new_rules.append(( _origin, expansion, callback_name ))
+                new_rules.append(( _origin, expansion, callback_name, options ))

         return new_rules, callback
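
With `always_keep_all_tokens` threaded through the builder, the `keep_all_tokens` option stops raising `NotImplementedError` and actually works. Its effect, mirroring `test_keep_all_tokens` in the test changes below:

```python
from lark import Lark

# Anonymous string tokens are normally filtered out of the tree;
# keep_all_tokens=True keeps them as children.
parser = Lark('start: "a"+ ', keep_all_tokens=True)
print(parser.parse('aaa').children)   # three 'a' tokens instead of none
```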
@@ -131,7 +131,7 @@ class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}

-        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]
+        rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]

         resolve_ambiguity = (options.ambiguity=='resolve') if options else True
         self.parser = earley.Parser(rules,
@@ -158,7 +158,7 @@ class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         WithLexer.__init__(self, lexer_conf)

-        rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]
+        rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules]

         resolve_ambiguity = (options.ambiguity=='resolve') if options else True
         self.parser = earley.Parser(rules,
@@ -29,6 +29,9 @@ class Derivation(Tree):
         Tree.__init__(self, 'drv', items or [])
         self.rule = rule

+    def _pretty_label(self):    # Nicer pretty for debugging the parser
+        return self.rule.origin if self.rule else self.data
+
 END_TOKEN = EndToken()

 class Item(object):
@@ -106,8 +109,11 @@ class Column:
                     new_tree = old_tree.copy()
                     new_tree.rule = old_tree.rule
                     old_tree.set('_ambig', [new_tree])
+                    old_tree.rule = None    # No longer a 'drv' node

                 if item.tree.children[0] is old_tree:   # XXX a little hacky!
-                    raise ParseError("Infinite recursion in grammar!")
+                    raise ParseError("Infinite recursion in grammar! (Rule %s)" % item.rule)

                 old_tree.children.append(item.tree)
             else:
                 self.completed[item] = item
@@ -218,7 +224,13 @@ class ApplyCallbacks(Transformer_NoRecurse):
             return Tree(rule.origin, children)

 def _compare_rules(rule1, rule2):
-    assert rule1.origin == rule2.origin
+    if rule1.options and rule2.options:
+        if rule1.options.priority is not None and rule2.options.priority is not None:
+            assert rule1.options.priority != rule2.options.priority, "Priority is the same between both rules: %s == %s" % (rule1, rule2)
+            return -compare(rule1.options.priority, rule2.options.priority)
+
+    if rule1.origin != rule2.origin:
+        return 0
+
     c = compare( len(rule1.expansion), len(rule2.expansion))
     if rule1.origin.startswith('__'):   # XXX hack! We need to set priority in parser, not here
         c = -c
@@ -228,6 +240,20 @@ def _compare_drv(tree1, tree2):
     if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
         return compare(tree1, tree2)

+    try:
+        rule1, rule2 = tree1.rule, tree2.rule
+    except AttributeError:
+        # Probably trees that don't take part in this parse (better way to distinguish?)
+        return compare(tree1, tree2)
+
+    # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse,
+    #     when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be
+    #     computationally inefficient. So we handle it here.
+    if tree1.data == '_ambig':
+        _resolve_ambig(tree1)
+    if tree2.data == '_ambig':
+        _resolve_ambig(tree2)
+
     c = _compare_rules(tree1.rule, tree2.rule)
     if c:
         return c
@@ -241,12 +267,19 @@ def _compare_drv(tree1, tree2):
     return compare(len(tree1.children), len(tree2.children))

+def _resolve_ambig(tree):
+    assert tree.data == '_ambig'
+
+    best = min(tree.children, key=cmp_to_key(_compare_drv))
+    assert best.data == 'drv'
+    tree.set('drv', best.children)
+    tree.rule = best.rule   # needed for applying callbacks
+
+    assert tree.data != '_ambig'
+
 class ResolveAmbig(Visitor_NoRecurse):
     def _ambig(self, tree):
-        best = min(tree.children, key=cmp_to_key(_compare_drv))
-        assert best.data == 'drv'
-        tree.set('drv', best.children)
-        tree.rule = best.rule   # needed for applying callbacks
+        _resolve_ambig(tree)

 # RULES = [
@@ -7,10 +7,11 @@ class Rule(object):
         origin : a symbol
         expansion : a list of symbols
     """
-    def __init__(self, origin, expansion, alias=None):
+    def __init__(self, origin, expansion, alias=None, options=None):
         self.origin = origin
         self.expansion = expansion
         self.alias = alias
+        self.options = options

     def __repr__(self):
         return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))
@@ -111,12 +112,12 @@ class GrammarAnalyzer(object):
         self.debug = debug
         rule_tuples = list(rule_tuples)
         rule_tuples.append(('$root', [start_symbol, '$end']))
-        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]
+        rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples]

         self.rules = set()
-        self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples}
-        for origin, exp, alias in rule_tuples:
-            r = Rule( origin, exp, alias )
+        self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples}
+        for origin, exp, alias, options in rule_tuples:
+            r = Rule( origin, exp, alias, options )
             self.rules.add(r)
             self.rules_by_origin[origin].append(r)
@@ -9,6 +9,7 @@ from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT

 class Parser(object):
     def __init__(self, parser_conf):
+        assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization"
         self.analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start)
         self.analysis.compute_lookahead()
         self.callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
@@ -34,7 +35,7 @@ class Parser(object):
             raise UnexpectedToken(token, expected, seq, i)

-        def reduce(rule, size):
+        def reduce(rule, size, end=False):
             if size:
                 s = value_stack[-size:]
                 del state_stack[-size:]
@@ -44,7 +45,7 @@ class Parser(object):
             res = self.callbacks[rule](s)

-            if len(state_stack) == 1 and rule.origin == self.analysis.start_symbol:
+            if end and len(state_stack) == 1 and rule.origin == self.analysis.start_symbol:
                 return res

             _action, new_state = get_action(rule.origin)
@@ -73,7 +74,7 @@ class Parser(object):
         while True:
             _action, rule = get_action('$end')
             assert _action == 'reduce'
-            res = reduce(*rule)
+            res = reduce(*rule, end=True)
             if res:
                 assert state_stack == [self.analysis.init_state_idx] and not value_stack, len(state_stack)
                 return res
@@ -10,11 +10,14 @@ class Tree(object):
     def __repr__(self):
         return 'Tree(%s, %s)' % (self.data, self.children)

+    def _pretty_label(self):
+        return self.data
+
     def _pretty(self, level, indent_str):
         if len(self.children) == 1 and not isinstance(self.children[0], Tree):
             return [ indent_str*level, self.data, '\t', '%s' % self.children[0], '\n']

-        l = [ indent_str*level, self.data, '\n' ]
+        l = [ indent_str*level, self._pretty_label(), '\n' ]
         for n in self.children:
             if isinstance(n, Tree):
                 l += n._pretty(level+1, indent_str)
@@ -62,10 +65,14 @@ class Tree(object):
             yield c

     def iter_subtrees(self):
+        visited = set()
         q = [self]

         while q:
             subtree = q.pop()
+            if id(subtree) in visited:
+                continue    # already been here from another branch
+            visited.add(id(subtree))
             yield subtree
             q += [c for c in subtree.children if isinstance(c, Tree)]
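
The `visited` set matters because ambiguity resolution can leave the same `Tree` object reachable through several parents. A small standalone demonstration:

```python
from lark import Tree

# One subtree shared by two parents: iter_subtrees now yields each
# distinct node exactly once instead of visiting the shared node twice.
shared = Tree('leaf', [])
root = Tree('root', [shared, Tree('mid', [shared])])
print(sum(1 for _ in root.iter_subtrees()))   # 3 nodes, not 4
```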
@@ -1,6 +1,7 @@
 import functools
 import types
 from collections import deque
+from contextlib import contextmanager

 class fzset(frozenset):
     def __repr__(self):
@@ -63,11 +64,17 @@ def inline_args(f):
         def _f_builtin(_self, args):
             return f(*args)
         return _f_builtin
-    else:
-        @functools.wraps(f)
+    elif isinstance(f, types.MethodType):
+        @functools.wraps(f.__func__)
         def _f(self, args):
             return f.__func__(self, *args)
         return _f
+    else:
+        @functools.wraps(f.__call__.__func__)
+        def _f(self, args):
+            return f.__call__.__func__(self, *args)
+        return _f

 try:
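
The rewritten branches let `inline_args` wrap bound methods and callable objects in addition to plain functions. Roughly how the decorator is used (a sketch; it relies on the `FunctionType` branch earlier in the function, which is not shown in this hunk but unpacks the args list the same way):

```python
from lark import inline_args  # exported in lark/__init__.py in this version

# A wrapped callback receives its children unpacked as positional
# arguments instead of as a single list.
class Calc:
    @inline_args
    def add(self, a, b):
        return int(a) + int(b)

print(Calc().add(['2', '3']))   # 5
```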
@@ -82,5 +89,24 @@ except NameError:
         return -1

+try:
+    from contextlib import suppress     # Python 3
+except ImportError:
+    @contextmanager
+    def suppress(*excs):
+        '''Catch and dismiss the provided exception
+
+        >>> x = 'hello'
+        >>> with suppress(IndexError):
+        ...     x = x[10]
+        >>> x
+        'hello'
+        '''
+        try:
+            yield
+        except excs:
+            pass
@@ -380,6 +380,20 @@ def _make_parser_test(LEXER, PARSER):
             x = g.parse('Hello HelloWorld')
             self.assertSequenceEqual(x.children, ['HelloWorld'])

+        def test_token_collision2(self):
+            # NOTE: This test reveals a bug in token reconstruction in Scanless Earley
+            #       I probably need to re-write grammar transformation
+            g = _Lark("""
+                    !start: "starts"
+
+                    %import common.LCASE_LETTER
+                    """)
+
+            x = g.parse("starts")
+            self.assertSequenceEqual(x.children, ['starts'])
+
         # def test_string_priority(self):
         #     g = _Lark("""start: (A | /a?bb/)+
         #                  A: "a" """)
@@ -539,6 +553,12 @@ def _make_parser_test(LEXER, PARSER):
             g.parse("+2e-9")
             self.assertRaises(ParseError, g.parse, "+2e-9e")

+        def test_keep_all_tokens(self):
+            l = _Lark("""start: "a"+ """, keep_all_tokens=True)
+            tree = l.parse('aaa')
+            self.assertEqual(tree.children, ['a', 'a', 'a'])
+
         def test_token_flags(self):
             l = _Lark("""!start: "a"i+
                       """
@@ -569,6 +589,14 @@ def _make_parser_test(LEXER, PARSER):
             tree = l.parse('AB,a')
             self.assertEqual(tree.children, ['AB'])

+        def test_token_flags3(self):
+            l = _Lark("""!start: ABC+
+                         ABC: "abc"i
+                         """
+                      )
+            tree = l.parse('aBcAbC')
+            self.assertEqual(tree.children, ['aBc', 'AbC'])
+
         def test_token_flags2(self):
             g = """!start: ("a"i | /a/ /b/?)+
                """
@@ -577,6 +605,46 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(tree.children, ['a', 'A'])

+        def test_reduce_cycle(self):
+            """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
+            It seems that the correct solution is to explicitly distinguish finalization in the reduce() function.
+            """
+            l = _Lark("""
+                term: A
+                    | term term
+
+                A: "a"
+                """, start='term')
+            tree = l.parse("aa")
+            self.assertEqual(len(tree.children), 2)
+
+        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
+        def test_earley_prioritization(self):
+            "Tests effect of priority on result"
+
+            grammar = """
+            start: a | b
+            a.1: "a"
+            b.2: "a"
+            """
+
+            l = Lark(grammar, parser='earley', lexer='standard')
+            res = l.parse("a")
+            self.assertEqual(res.children[0].data, 'b')
+
+            grammar = """
+            start: a | b
+            a.2: "a"
+            b.1: "a"
+            """
+
+            l = Lark(grammar, parser='earley', lexer='standard')
+            res = l.parse("a")
+            self.assertEqual(res.children[0].data, 'a')