It now knows how to resolve ambiguity! And in a memory-efficient way!
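In practice, an ambiguous scanless grammar now yields a single, disambiguated tree. A minimal sketch of the new behavior, modeled on test_earley_scanless3 below (the grammar and expected split come from the new tests; the print lines are illustrative):

    from lark import Lark

    parser = Lark("""
        start: A A
        A: "a"+
    """, parser='earley', lexer=None)

    tree = parser.parse("aaa")     # "aaa" could split as aa|a, a|aa, or a|a|a
    print(tree.children)           # one disambiguated result: ['aa', 'a']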
@@ -134,7 +134,7 @@ These features may be implemented some day:
 - You can work with parse-trees instead of state-machines
 - The grammar is simple to read and write
 - There are no restrictions on grammar structure. Any grammar you write can be parsed.
-- Some structures are faster than others. If you care about speed, you can learn them gradually while the parser is already working.
+- Some structures are faster than others. If you care about speed, you can learn them gradually while the parser is already working
 - A well-written grammar is very fast
 - Note: Nondeterministic grammars will run a little slower
 - Note: Ambiguous grammars (grammars that can be parsed in more than one way) are supported, but may cause significant slowdown if the ambiguity is too big
@@ -25,7 +25,7 @@ parser = Lark(r"""
         %import common.WS_INLINE
         %ignore WS_INLINE
-        """)
+        """, lexer=None)

 def test():
     sample_conf = """
@@ -29,7 +29,7 @@ class UnexpectedToken(ParseError):

 def is_terminal(sym):
-    return isinstance(sym, tuple) or sym.isupper() or sym[0] == '$'
+    return isinstance(sym, Terminal) or sym.isupper() or sym[0] == '$'

 class LexerConf:
@@ -81,3 +81,26 @@ class TokenDef(object):
     def __repr__(self):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
+
+class Terminal:
+    def __init__(self, data):
+        self.data = data
+
+    def __repr__(self):
+        return '%r' % self.data
+
+    def __eq__(self, other):
+        return isinstance(other, type(self)) and self.data == other.data
+    def __hash__(self):
+        return hash(self.data)
+
+
+class Terminal_Regexp(Terminal):
+    def __init__(self, data):
+        Terminal.__init__(self, data)
+        self.match = re.compile(data).match
+
+
+class Terminal_Token(Terminal):
+    def match(self, other):
+        return self.data == other.type
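The two subclasses give every expected symbol a uniform match() interface: Terminal_Regexp matches raw characters (scanless mode), Terminal_Token matches by token type. A quick sketch under that assumption (Tok is a hypothetical stand-in for a lexer Token):

    t = Terminal_Regexp('a+')
    assert t.match('aaa')              # bound re.compile('a+').match

    class Tok:                         # hypothetical stand-in for a Token
        type = 'NAME'

    assert Terminal_Token('NAME').match(Tok())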
@@ -159,6 +159,8 @@ class Lark:

     def lex(self, text):
+        if not hasattr(self, 'lexer'):
+            self.lexer = self._build_lexer()
         stream = self.lexer.lex(text)
         if self.options.postlex:
             return self.options.postlex.process(stream)
@@ -67,8 +67,8 @@ TOKENS = {
     '_DOT': r'\.',
     'RULE': '!?[_?]?[a-z][_a-z0-9]*',
     'TOKEN': '_?[A-Z][_A-Z0-9]*',
-    'STRING': r'"(\\"|\\\\|[^"])*?"',
-    'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/',
+    'STRING': r'"(\\"|\\\\|[^"\n])*?"',
+    'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/',
     '_NL': r'(\r?\n)+\s*',
     'WS': r'[ \t]+',
     'COMMENT': r'//[^\n]*',
@@ -377,11 +377,15 @@ class Grammar:
                 else:
                     options = RuleOptions.new_from(options, create_token=name)
                     name = tokens_to_convert[name]
+                    inner = Token('RULE', name + '_inner')
+                    new_rule_defs.append((name, T('expansions', [T('expansion', [inner])]), None))
+                    name = inner
-                for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
-                    for i, sym in enumerate(exp.children):
-                        if sym in tokens_to_convert:
-                            exp.children[i] = Token(sym.type, tokens_to_convert[sym])
+            else:
+                for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
+                    for i, sym in enumerate(exp.children):
+                        if sym in tokens_to_convert:
+                            exp.children[i] = Token(sym.type, tokens_to_convert[sym])

             new_rule_defs.append((name, tree, options))
@@ -3,8 +3,8 @@ import sre_parse

 from .lexer import Lexer, ContextualLexer, Token
-from .common import is_terminal, GrammarError, ParserConf
-from .parsers import lalr_parser, earley, nearley
+from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token
+from .parsers import lalr_parser, old_earley, nearley, earley
 from .tree import Transformer

 class WithLexer:
@@ -70,13 +70,13 @@ class Nearley(WithLexer):
         return res[0]


-class Earley(WithLexer):
+class OldEarley(WithLexer):
     def __init__(self, lexer_conf, parser_conf):
         WithLexer.__init__(self, lexer_conf)

         rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]

-        self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
+        self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))

     def _prepare_expansion(self, expansion):
         return [(sym,) if is_terminal(sym) else sym for sym in expansion]
@@ -100,13 +100,13 @@ def tokenize_text(text):
     return new_text

-class Earley_NoLex:
+class OldEarley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}

         rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]

-        self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
+        self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))

     def _prepare_expansion(self, expansion):
         for sym in expansion:
@@ -125,6 +125,43 @@ class Earley_NoLex:
         assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
         return res[0]

+
+class Earley_NoLex:
+    def __init__(self, lexer_conf, parser_conf):
+        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
+
+        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]
+
+        self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback)
+
+    def _prepare_expansion(self, expansion):
+        for sym in expansion:
+            if is_terminal(sym):
+                regexp = self.token_by_name[sym].pattern.to_regexp()
+                width = sre_parse.parse(regexp).getwidth()
+                if width != (1,1):
+                    raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
+                yield Terminal_Regexp(regexp)
+            else:
+                yield sym
+
+    def parse(self, text):
+        new_text = tokenize_text(text)
+        return self.parser.parse(new_text)
+
+
+class Earley(WithLexer):
+    def __init__(self, lexer_conf, parser_conf):
+        WithLexer.__init__(self, lexer_conf)
+
+        rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]
+
+        self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback)
+
+    def _prepare_expansion(self, expansion):
+        return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]
+
+    def parse(self, text):
+        tokens = list(self.lex(text))
+        return self.parser.parse(tokens)
+

 def get_frontend(parser, lexer):
     if parser=='lalr':
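The width restriction above exists because scanless Earley advances one character per column, so every terminal regexp must match exactly one character. A standalone illustration of the getwidth() test, using only the stdlib sre_parse module the frontend itself uses:

    import sre_parse

    assert sre_parse.parse('a|b').getwidth() == (1, 1)   # accepted
    assert sre_parse.parse('ab').getwidth() == (2, 2)    # rejected by Earley_NoLex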
@@ -1,25 +1,42 @@
 "This module implements an Earley Parser"

+# The parser uses a parse-forest to keep track of derivations and ambiguities.
+# When the parse ends successfully, a disambiguation stage resolves all ambiguity
+# (right now ambiguity resolution is not developed beyond the needs of lark).
+# Afterwards the parse tree is reduced (transformed) according to user callbacks.
+# I use the no-recursion versions of Transformer and Visitor, because the tree might be
+# deeper than Python's recursion limit (a bit absurd, but that's life)
+#
 # The algorithm keeps track of each state set, using a corresponding Column instance.
 # Column keeps track of new items using NewsList instances.
 #
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com

-from ..common import ParseError, UnexpectedToken, is_terminal
+from functools import cmp_to_key
+
+from ..utils import compare
+from ..common import ParseError, UnexpectedToken, Terminal
 from .grammar_analysis import GrammarAnalyzer
+from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse

 class EndToken:
     type = '$end'

+class Derivation(Tree):
+    def __init__(self, rule, items=None):
+        Tree.__init__(self, 'drv', items or [])
+        self.rule = rule
+
 END_TOKEN = EndToken()

 class Item(object):
-    def __init__(self, rule, ptr, start, data):
+    def __init__(self, rule, ptr, start, tree):
         self.rule = rule
         self.ptr = ptr
         self.start = start
-        self.data = data
+        self.tree = tree if tree is not None else Derivation(self.rule)

     @property
     def expect(self):
@@ -29,8 +46,10 @@ class Item(object):
     def is_complete(self):
         return self.ptr == len(self.rule.expansion)

-    def advance(self, data):
-        return Item(self.rule, self.ptr+1, self.start, self.data + [data])
+    def advance(self, tree):
+        assert self.tree.data == 'drv'
+        new_tree = Derivation(self.rule, self.tree.children + [tree])
+        return Item(self.rule, self.ptr+1, self.start, new_tree)

     def __eq__(self, other):
         return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
@@ -38,8 +57,8 @@ class Item(object):
         return hash((self.rule, self.ptr, id(self.start)))

     def __repr__(self):
-        before = map(str, self.rule.expansion[:self.ptr])
-        after = map(str, self.rule.expansion[self.ptr:])
+        before = list(map(str, self.rule.expansion[:self.ptr]))
+        after = list(map(str, self.rule.expansion[self.ptr:]))
         return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
@@ -56,15 +75,18 @@ class NewsList(list):
         return self[i:]


 class Column:
     "An entry in the table, aka Earley Chart"
-    def __init__(self):
+    def __init__(self, i):
+        self.i = i
         self.to_reduce = NewsList()
         self.to_predict = NewsList()
         self.to_scan = NewsList()
         self.item_count = 0

         self.added = set()
+        self.completed = {}

     def add(self, items):
         """Sort items into scan/predict/reduce newslists
@@ -76,29 +98,24 @@ class Column:
         for item in items:
             if item.is_complete:
-                # (We must allow repetition of empty rules)
-                if item.rule.expansion:
-                    # This is an important test to avoid infinite-loops,
-                    # For example for the rule:
-                    #    a: a | "b"
-                    # If we can detect these cases statically, we can remove
-                    # this test and gain a tiny performance boost
-                    #
-                    if item in added:
-                        continue
-                    added.add(item)
-                self.to_reduce.append(item)
-            else:
-                if is_terminal(item.expect):
-                    self.to_scan.append(item)
+                # XXX TODO Potential bug: What happens if there's ambiguity in an empty rule?
+                if item.rule.expansion and item in self.completed:
+                    old_tree = self.completed[item].tree
+                    if old_tree.data != 'ambig':
+                        new_tree = old_tree.copy()
+                        new_tree.rule = old_tree.rule
+                        old_tree.set('ambig', [new_tree])
+                    old_tree.children.append(item.tree)
                 else:
-                    if item in added:
-                        continue
+                    self.completed[item] = item
+                    self.to_reduce.append(item)
+            else:
+                if item not in added:
                     added.add(item)
-                    self.to_predict.append(item)
+                    if isinstance(item.expect, Terminal):
+                        self.to_scan.append(item)
+                    else:
+                        self.to_predict.append(item)

             self.item_count += 1    # Only count if actually added
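The merge above is the memory-saving trick from the commit message: the first derivation for a completed item is stored in self.completed, and any competing derivation is folded into it in place by turning the stored 'drv' node into an 'ambig' node, so parent items that already reference it see the ambiguity for free. A standalone sketch of that in-place rewrite, using the Tree.copy/Tree.set helpers this commit adds (the 'x'/'y' children are illustrative):

    from lark.tree import Tree

    old_tree = Tree('drv', ['x'])                 # first derivation, already referenced by parents
    new_tree = old_tree.copy()                    # shallow copy keeps the original children
    old_tree.set('ambig', [new_tree])             # rewrite in place; parents now see 'ambig'
    old_tree.children.append(Tree('drv', ['y']))  # competing derivation joins the same node
    assert old_tree.data == 'ambig' and len(old_tree.children) == 2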
@@ -106,17 +123,16 @@ class Column:
         return bool(self.item_count)

 class Parser:
-    def __init__(self, parser_conf):
-        self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
-        self.start = parser_conf.start
+    def __init__(self, rules, start, callback):
+        self.analysis = GrammarAnalyzer(rules, start)
+        self.start = start

         self.postprocess = {}
         self.predictions = {}
         for rule in self.analysis.rules:
             if rule.origin != '$root':  # XXX kinda ugly
                 a = rule.alias
-                self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a)
+                self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
                 self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]

     def parse(self, stream, start=None):
@@ -124,16 +140,15 @@ class Parser:
         start = start or self.start

         def predict(nonterm, i):
-            assert not is_terminal(nonterm), nonterm
-            return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]]
+            assert not isinstance(nonterm, Terminal), nonterm
+            return [Item(rule, 0, i, None) for rule in self.predictions[nonterm]]

         def complete(item):
             name = item.rule.origin
-            item.data = self.postprocess[item.rule](item.data)
-            return [i.advance(item.data) for i in item.start.to_predict if i.expect == name]
+            return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]

         def process_column(i, token, cur_set):
-            next_set = Column()
+            next_set = Column(i)

             while True:
                 to_predict = {x.expect for x in cur_set.to_predict.get_news()
@@ -147,21 +162,20 @@ class Parser:
                 for item in to_reduce:
                     cur_set.add( complete(item) )

             if token is not END_TOKEN:
-                for item in cur_set.to_scan.get_news():
-                    match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type
-                    if match:
+                to_scan = cur_set.to_scan.get_news()
+                for item in to_scan:
+                    if item.expect.match(token):
                         next_set.add([item.advance(stream[i])])

             if not next_set and token is not END_TOKEN:
-                expect = {i.expect[-1] for i in cur_set.to_scan}
+                expect = {i.expect for i in cur_set.to_scan}
                 raise UnexpectedToken(token, expect, stream, i)

             return cur_set, next_set

         # Main loop starts
-        column0 = Column()
+        column0 = Column(0)
         column0.add(predict(start, column0))

         cur_set = column0
@@ -171,10 +185,83 @@ class Parser:
         last_set, _ = process_column(len(stream), END_TOKEN, cur_set)

         # Parse ended. Now build a parse tree
-        solutions = [n.data for n in last_set.to_reduce
+        solutions = [n.tree for n in last_set.to_reduce
                      if n.rule.origin==start and n.start is column0]

         if not solutions:
             raise ParseError('Incomplete parse: Could not find a solution to input')
-
-        return solutions
+        elif len(solutions) == 1:
+            tree = solutions[0]
+        else:
+            tree = Tree('ambig', solutions)
+
+        ResolveAmbig().visit(tree)
+
+        return ApplyCallbacks(self.postprocess).transform(tree)
+
+
+class ApplyCallbacks(Transformer_NoRecurse):
+    def __init__(self, postprocess):
+        self.postprocess = postprocess
+
+    def drv(self, tree):
+        children = tree.children
+        callback = self.postprocess[tree.rule]
+        if callback:
+            return callback(children)
+        else:
+            return Tree(tree.rule.origin, children)
+
+
+def _compare_rules(rule1, rule2):
+    assert rule1.origin == rule2.origin
+    c = compare( len(rule1.expansion), len(rule2.expansion) )
+    if rule1.origin.startswith('__'):   # XXX hack! We need to set priority in parser, not here
+        return c
+    else:
+        return -c
+
+def _compare_drv(tree1, tree2):
+    if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
+        return compare(tree1, tree2)
+
+    c = _compare_rules(tree1.rule, tree2.rule)
+    if c:
+        return c
+
+    # rules are "equal", so compare trees
+    for t1, t2 in zip(tree1.children, tree2.children):
+        c = _compare_drv(t1, t2)
+        if c:
+            return c
+
+    return compare(len(tree1.children), len(tree2.children))
+
+
+class ResolveAmbig(Visitor_NoRecurse):
+    def ambig(self, tree):
+        best = max(tree.children, key=cmp_to_key(_compare_drv))
+        assert best.data == 'drv'
+        tree.set('drv', best.children)
+        tree.rule = best.rule   # needed for applying callbacks
+
+
+# RULES = [
+#     ('a', ['d']),
+#     ('d', ['b']),
+#     ('b', ['C']),
+#     ('b', ['b', 'C']),
+#     ('b', ['C', 'b']),
+# ]
+# p = Parser(RULES, 'a')
+# for x in p.parse('CC'):
+#     print x.pretty()
+
+#---------------
+# RULES = [
+#     ('s', ['a', 'a']),
+#     ('a', ['b', 'b']),
+#     ('b', ['C'], lambda (x,): x),
+#     ('b', ['b', 'C']),
+# ]
+# p = Parser(RULES, 's', {})
+# print p.parse('CCCCC').pretty()
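A note on the disambiguation heuristic: for ordinary rules _compare_rules returns -c, so max() favors the derivation whose rule has the shorter expansion, while for generated '__' helper rules (such as the expansion of "a"+) the sign flips, which is what makes repetition greedy in the new tests. A minimal check of that ordering (R is a hypothetical stand-in for Rule; only origin and expansion are used by the comparator):

    from functools import cmp_to_key

    class R:                          # hypothetical stand-in for Rule
        def __init__(self, origin, expansion):
            self.origin, self.expansion = origin, expansion

    greedy, lazy = R('__plus', ['A', 'A']), R('__plus', ['A'])
    assert max([greedy, lazy], key=cmp_to_key(_compare_rules)) is greedy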
@@ -13,7 +13,7 @@ class Rule(object):
         self.alias = alias

     def __repr__(self):
-        return '<%s : %s>' % (self.origin, ' '.join(map(unicode,self.expansion)))
+        return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))

 class RulePtr(object):
     def __init__(self, rule, index):
@@ -0,0 +1,180 @@
+"This module implements an Earley Parser"
+
+# The algorithm keeps track of each state set, using a corresponding Column instance.
+# Column keeps track of new items using NewsList instances.
+#
+# Author: Erez Shinan (2017)
+# Email : erezshin@gmail.com
+
+from ..common import ParseError, UnexpectedToken, is_terminal
+from .grammar_analysis import GrammarAnalyzer
+
+
+class EndToken:
+    type = '$end'
+
+END_TOKEN = EndToken()
+
+class Item(object):
+    def __init__(self, rule, ptr, start, data):
+        self.rule = rule
+        self.ptr = ptr
+        self.start = start
+        self.data = data
+
+    @property
+    def expect(self):
+        return self.rule.expansion[self.ptr]
+
+    @property
+    def is_complete(self):
+        return self.ptr == len(self.rule.expansion)
+
+    def advance(self, data):
+        return Item(self.rule, self.ptr+1, self.start, self.data + [data])
+
+    def __eq__(self, other):
+        return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
+    def __hash__(self):
+        return hash((self.rule, self.ptr, id(self.start)))
+
+    def __repr__(self):
+        before = map(str, self.rule.expansion[:self.ptr])
+        after = map(str, self.rule.expansion[self.ptr:])
+        return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
+
+
+class NewsList(list):
+    "Keeps track of newly added items (append-only)"
+    def __init__(self, initial=None):
+        list.__init__(self, initial or [])
+        self.last_iter = 0
+
+    def get_news(self):
+        i = self.last_iter
+        self.last_iter = len(self)
+        return self[i:]
+
+
+class Column:
+    "An entry in the table, aka Earley Chart"
+    def __init__(self):
+        self.to_reduce = NewsList()
+        self.to_predict = NewsList()
+        self.to_scan = NewsList()
+        self.item_count = 0
+
+        self.added = set()
+
+    def add(self, items):
+        """Sort items into scan/predict/reduce newslists
+
+        Makes sure only unique items are added.
+        """
+        added = self.added
+        for item in items:
+            if item.is_complete:
+                # (We must allow repetition of empty rules)
+                # if item.rule.expansion:
+
+                    # This is an important test to avoid infinite-loops,
+                    # For example for the rule:
+                    #    a: a | "b"
+                    # If we can detect these cases statically, we can remove
+                    # this test and gain a tiny performance boost
+                    #
+                    # if item in added:
+                    #     continue
+                    # added.add(item)
+
+                self.to_reduce.append(item)
+            else:
+                if is_terminal(item.expect):
+                    self.to_scan.append(item)
+                else:
+                    if item in added:
+                        continue
+                    added.add(item)
+                    self.to_predict.append(item)
+
+            self.item_count += 1    # Only count if actually added
+
+    def __nonzero__(self):
+        return bool(self.item_count)
+
+class Parser:
+    def __init__(self, parser_conf):
+        self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
+        self.start = parser_conf.start
+
+        self.postprocess = {}
+        self.predictions = {}
+        for rule in self.analysis.rules:
+            if rule.origin != '$root':  # XXX kinda ugly
+                a = rule.alias
+                self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a)
+                self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
+
+    def parse(self, stream, start=None):
+        # Define parser functions
+        start = start or self.start
+
+        def predict(nonterm, i):
+            assert not is_terminal(nonterm), nonterm
+            return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]]
+
+        def complete(item):
+            name = item.rule.origin
+            item.data = self.postprocess[item.rule](item.data)
+            return [i.advance(item.data) for i in item.start.to_predict if i.expect == name]
+
+        def process_column(i, token, cur_set):
+            next_set = Column()
+
+            while True:
+                to_predict = {x.expect for x in cur_set.to_predict.get_news()
+                              if x.ptr}  # if not part of an already predicted batch
+                to_reduce = cur_set.to_reduce.get_news()
+                if not (to_predict or to_reduce):
+                    break
+
+                for nonterm in to_predict:
+                    cur_set.add( predict(nonterm, cur_set) )
+                for item in to_reduce:
+                    cur_set.add( complete(item) )
+
+            if token is not END_TOKEN:
+                for item in cur_set.to_scan.get_news():
+                    match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type
+                    if match:
+                        next_set.add([item.advance(stream[i])])
+
+            if not next_set and token is not END_TOKEN:
+                expect = {i.expect[-1] for i in cur_set.to_scan}
+                raise UnexpectedToken(token, expect, stream, i)

+            return cur_set, next_set
+
+        # Main loop starts
+        column0 = Column()
+        column0.add(predict(start, column0))
+
+        cur_set = column0
+        for i, char in enumerate(stream):
+            _, cur_set = process_column(i, char, cur_set)
+
+        last_set, _ = process_column(len(stream), END_TOKEN, cur_set)
+
+        # Parse ended. Now build a parse tree
+        solutions = [n.data for n in last_set.to_reduce
+                     if n.rule.origin==start and n.start is column0]
+
+        if not solutions:
+            raise ParseError('Incomplete parse: Could not find a solution to input')
+
+        return solutions
@@ -2,7 +2,7 @@ import re
 from collections import defaultdict

 from .tree import Tree
-from .common import is_terminal, ParserConf, PatternStr
+from .common import is_terminal, ParserConf, PatternStr, Terminal
 from .lexer import Token
 from .parsers import earley
@@ -26,21 +26,14 @@ class Reconstructor:
         token_res = {t.name:re.compile(t.pattern.to_regexp()) for t in _tokens}

-        class MatchData(object):
-            def __init__(self, data):
-                self.data = data
-
-            def __repr__(self):
-                return '%s(%r)' % (type(self).__name__, self.data)
-
-        class MatchTerminal(MatchData):
-            def __call__(self, other):
+        class MatchTerminal(Terminal):
+            def match(self, other):
                 if isinstance(other, Tree):
                     return False
                 return token_res[self.data].match(other) is not None

-        class MatchTree(MatchData):
-            def __call__(self, other):
+        class MatchTree(Terminal):
+            def match(self, other):
                 try:
                     return self.data == other.data
                 except AttributeError:
@@ -90,7 +83,7 @@ class Reconstructor:
         for name, expansions in d.items():
             for expansion in expansions:
                 reduced = [sym if sym.startswith('_') or sym in expand1s else
-                           (MatchTerminal(sym) if is_terminal(sym) else MatchTree(sym),)
+                           MatchTerminal(sym) if is_terminal(sym) else MatchTree(sym)
                            for sym in expansion if not is_discarded_terminal(sym)]

                 rules.append((name, reduced, WriteTokens(name, expansion).f))
@@ -98,9 +91,9 @@ class Reconstructor:

     def _reconstruct(self, tree):
-        parser = earley.Parser(ParserConf(self.rules, {}, tree.data))
-
-        res ,= parser.parse(tree.children)    # XXX ambiguity?
+        # TODO: ambiguity?
+        parser = earley.Parser(self.rules, tree.data, {})
+        res = parser.parse(tree.children)
         for item in res:
             if isinstance(item, Tree):
                 for x in self._reconstruct(item):
@@ -32,7 +32,11 @@ class Tree(object):
             self.children[i:i+1] = kid.children

     def __eq__(self, other):
-        return self.data == other.data and self.children == other.children
+        try:
+            return self.data == other.data and self.children == other.children
+        except AttributeError:
+            return False

     def __hash__(self):
         return hash((self.data, tuple(self.children)))
@@ -57,10 +61,24 @@ class Tree(object):
             if pred(c):
                 yield c

+    def iter_subtrees(self):
+        q = [self]
+        while q:
+            subtree = q.pop()
+            yield subtree
+            q += [c for c in subtree.children if isinstance(c, Tree)]
+
     def __deepcopy__(self, memo):
         return type(self)(self.data, deepcopy(self.children, memo))

+    def copy(self):
+        return type(self)(self.data, self.children)
+
+    def set(self, data, children):
+        self.data = data
+        self.children = children
+

 class Transformer(object):
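iter_subtrees() always yields a node before any of its descendants (a stack-based depth-first walk), so the NoRecurse visitors below can iterate reversed(list(...)) to get bottom-up order without recursion. A small check, assuming Tree is importable from lark.tree as in this file:

    from lark.tree import Tree

    t = Tree('a', [Tree('b', []), Tree('c', [Tree('d', [])])])
    order = [s.data for s in t.iter_subtrees()]
    assert order[0] == 'a'                        # root first
    assert order.index('c') < order.index('d')    # parents before children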
@@ -81,7 +99,7 @@ class Transformer(object):


 class InlineTransformer(Transformer):
-    def _get_func(self, name):
+    def _get_func(self, name):  # use super()._get_func
         return inline_args(getattr(self, name)).__get__(self)
@@ -97,3 +115,35 @@ class Visitor(object):

     def __default__(self, tree):
         pass
+
+
+class Visitor_NoRecurse(Visitor):
+    def visit(self, tree):
+        subtrees = list(tree.iter_subtrees())
+
+        for subtree in reversed(subtrees):
+            getattr(self, subtree.data, self.__default__)(subtree)
+        return tree
+
+
+class Transformer_NoRecurse(Transformer):
+    def transform(self, tree):
+        subtrees = list(tree.iter_subtrees())
+
+        def _t(t):
+            # Assumes t is already transformed
+            try:
+                f = self._get_func(t.data)
+            except AttributeError:
+                return self.__default__(t)
+            else:
+                return f(t)
+
+        for subtree in reversed(subtrees):
+            subtree.children = [_t(c) if isinstance(c, Tree) else c for c in subtree.children]
+
+        return _t(tree)
+
+    def __default__(self, t):
+        return t
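These classes exist because Earley derivation trees can exceed Python's recursion limit (see the comment at the top of earley.py). A hypothetical usage sketch: an evaluator that works on a tree nested far deeper than the default recursion limit, because both iter_subtrees() and transform() are iterative:

    from lark.tree import Tree, Transformer_NoRecurse

    class Eval(Transformer_NoRecurse):
        def num(self, t):
            return int(t.children[0])
        def add(self, t):
            return sum(t.children)    # children are already transformed to ints

    deep = Tree('num', ['0'])
    for _ in range(10000):            # deeper than the default recursion limit
        deep = Tree('add', [deep, Tree('num', ['1'])])

    assert Eval().transform(deep) == 10000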
@@ -69,3 +69,14 @@ def inline_args(f):
             return f.__func__(self, *args)
     return _f
+
+
+try:
+    compare = cmp
+except NameError:
+    def compare(a, b):
+        if a == b:
+            return 0
+        elif a > b:
+            return 1
+        else:
+            return -1
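This is a Python 2/3 shim: on Python 2 it reuses the builtin cmp, and on Python 3 (where cmp was removed) it defines the equivalent three-way comparison, which earley.py feeds to functools.cmp_to_key. A quick check of the contract:

    from functools import cmp_to_key

    assert compare(1, 2) == -1 and compare(2, 2) == 0 and compare(3, 2) == 1
    assert sorted([3, 1, 2], key=cmp_to_key(compare)) == [1, 2, 3]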
@@ -73,6 +73,28 @@ class TestEarley(unittest.TestCase):
         l = Lark(grammar, parser='earley', lexer=None)
         l.parse(program)

+    def test_earley_scanless3(self):
+        "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
+
+        grammar = """
+        start: A A
+        A: "a"+
+        """
+
+        l = Lark(grammar, parser='earley', lexer=None)
+        res = l.parse("aaa")
+        self.assertEqual(res.children, ['aa', 'a'])
+
+    def test_earley_scanless4(self):
+        grammar = """
+        start: A A?
+        A: "a"+
+        """
+
+        l = Lark(grammar, parser='earley', lexer=None)
+        res = l.parse("aaa")
+        self.assertEqual(res.children, ['aaa'])
+

 def _make_parser_test(LEXER, PARSER):
     def _Lark(grammar, **kwargs):
         return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)