@@ -47,12 +47,12 @@ class TreeToJson(Transformer):
    true = lambda self, _: True
    false = lambda self, _: False

json_parser = Lark(json_grammar, parser='earley', lexer='dynamic')

def parse(x):
    return TreeToJson().transform(json_parser.parse(x))

# json_parser = Lark(json_grammar, parser='earley', lexer='standard')
# def parse(x):
#     return TreeToJson().transform(json_parser.parse(x))

# json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
# parse = json_parser.parse

json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
parse = json_parser.parse
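# A minimal sketch (not part of the diff) contrasting the two configurations
# above, assuming `json_grammar` and `TreeToJson` from earlier in this file:
# with parser='earley' the transformer runs as a separate pass over the
# finished parse tree, while parser='lalr' with transformer=TreeToJson()
# applies the callbacks during parsing, so no intermediate tree is built.
earley_parser = Lark(json_grammar, parser='earley', lexer='standard')
parse_two_pass = lambda s: TreeToJson().transform(earley_parser.parse(s))

lalr_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
parse_one_pass = lalr_parser.parse   # callbacks fire during the parse itself

assert parse_one_pass('{"k": [1, true]}') == parse_two_pass('{"k": [1, true]}')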
def test():
    test_json = '''
@@ -4,7 +4,7 @@ import sre_parse
from .lexer import Lexer, ContextualLexer, Token

from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token
from .parsers import lalr_parser, old_earley, nearley, earley
from .parsers import lalr_parser, earley
from .tree import Transformer

from .parsers import xearley
@@ -49,47 +49,6 @@ class LALR_ContextualLexer:
        tokens = self.lexer_conf.postlex.process(tokens)
        return self.parser.parse(tokens, self.lexer.set_parser_state)

class Nearley(WithLexer):
    def __init__(self, lexer_conf, parser_conf):
        WithLexer.__init__(self, lexer_conf)

        rules = [{'name': n,
                  'symbols': self._prepare_expansion(x),
                  'postprocess': getattr(parser_conf.callback, a)}
                 for n, x, a in parser_conf.rules]

        self.parser = nearley.Parser(rules, parser_conf.start)

    def _prepare_expansion(self, expansion):
        return [(sym, None) if is_terminal(sym) else sym for sym in expansion]

    def parse(self, text):
        tokens = list(self.lex(text))
        res = self.parser.parse(tokens)
        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
        return res[0]

class OldEarley(WithLexer):
    def __init__(self, lexer_conf, parser_conf):
        WithLexer.__init__(self, lexer_conf)

        rules = [(n, self._prepare_expansion(x), a) for n, x, a in parser_conf.rules]

        self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))

    def _prepare_expansion(self, expansion):
        return [(sym,) if is_terminal(sym) else sym for sym in expansion]

    def parse(self, text):
        tokens = list(self.lex(text))
        res = self.parser.parse(tokens)
        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
        return res[0]

def tokenize_text(text):
    new_text = []
    line = 1
@@ -101,32 +60,6 @@ def tokenize_text(text):
        new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos))
    return new_text

class OldEarley_NoLex:
    def __init__(self, lexer_conf, parser_conf):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        rules = [(n, list(self._prepare_expansion(x)), a) for n, x, a in parser_conf.rules]

        self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))

    def _prepare_expansion(self, expansion):
        for sym in expansion:
            if is_terminal(sym):
                regexp = self.token_by_name[sym].pattern.to_regexp()
                width = sre_parse.parse(regexp).getwidth()
                if width != (1, 1):
                    raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
                yield (re.compile(regexp).match, regexp)
            else:
                yield sym

    def parse(self, text):
        new_text = tokenize_text(text)
        res = self.parser.parse(new_text)
        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
        return res[0]

class Earley_NoLex:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}
@@ -178,7 +111,7 @@ class XEarley:
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        rules = [(n, list(self._prepare_expansion(x)), a) for n, x, a in parser_conf.rules]
        rules = [(n, list(self._prepare_expansion(x)), a, o) for n, x, a, o in parser_conf.rules]

        resolve_ambiguity = (options.ambiguity == 'resolve') if options else True
        ignore = [Terminal_Regexp(self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]
@@ -195,6 +128,7 @@ class XEarley:
            if is_terminal(sym):
                regexp = self.token_by_name[sym].pattern.to_regexp()
                width = sre_parse.parse(regexp).getwidth()
                assert width
                yield Terminal_Regexp(regexp)
            else:
                yield sym
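# A minimal sketch (not part of the diff) of the terminal-width checks used in
# this file: the stdlib's sre_parse.parse(...).getwidth() returns a (min, max)
# pair of possible match lengths. The scanless NoLex classes above demand
# exactly (1, 1); XEarley's dynamic lexer accepts any width. Note that a
# (min, max) tuple is always truthy, so the bare `assert width` above is
# effectively a no-op; `assert width[0] > 0` would be the stricter
# (hypothetical) check.
import sre_parse

assert sre_parse.parse('[a-z]').getwidth() == (1, 1)   # single char: scanless-safe
lo, hi = sre_parse.parse('ab*').getwidth()
assert (lo, hi) != (1, 1)                              # variable width: dynamic lexer only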
@@ -238,13 +238,13 @@ def _compare_rules(rule1, rule2):

def _compare_drv(tree1, tree2):
    if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
        return compare(tree1, tree2)
        return -compare(tree1, tree2)

    try:
        rule1, rule2 = tree1.rule, tree2.rule
    except AttributeError:
        # Probably trees that don't take part in this parse (better way to distinguish?)
        return compare(tree1, tree2)
        return -compare(tree1, tree2)

    # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse,
    # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be
@@ -264,7 +264,7 @@ def _compare_drv(tree1, tree2):
    if c:
        return c
    return compare(len(tree1.children), len(tree2.children))
    return -compare(len(tree1.children), len(tree2.children))

def _resolve_ambig(tree):
@@ -1,4 +1,4 @@
"This module implements an Earley Parser"
"This module implements an experimental Earley Parser with a dynamic lexer"
# The parser uses a parse-forest to keep track of derivations and ambiguations.
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
@@ -10,121 +10,21 @@
# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
# Instead of running a lexer beforehand, or using a costly char-by-char method, this parser
# uses regular expressions by necessity, achieving high performance while maintaining all of
# Earley's power in parsing any CFG.
#
#
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
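# A minimal sketch (not part of the diff) of the idea described above: every
# terminal becomes a compiled regexp that is matched directly against the raw
# input at the current position, so no token stream is ever produced.
# `TerminalSketch` is an illustrative stand-in for lark's Terminal_Regexp,
# not its real API.
import re

class TerminalSketch:
    def __init__(self, pattern):
        self.match = re.compile(pattern).match   # match(text, pos) scans in place

NUMBER = TerminalSketch(r'[0-9]+')
m = NUMBER.match('foo 123', 4)   # try to scan a NUMBER at position 4
assert m.group(0) == '123'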
from functools import cmp_to_key
from collections import defaultdict

from ..utils import compare
from ..common import ParseError, UnexpectedToken, Terminal
from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse
from ..tree import Tree
from .grammar_analysis import GrammarAnalyzer

class EndToken:
    type = '$end'

class Derivation(Tree):
    def __init__(self, rule, items=None):
        Tree.__init__(self, 'drv', items or [])
        self.rule = rule

END_TOKEN = EndToken()

class Item(object):
    "An Earley Item, the atom of the algorithm."

    def __init__(self, rule, ptr, start, tree):
        self.rule = rule
        self.ptr = ptr
        self.start = start
        self.tree = tree if tree is not None else Derivation(self.rule)

    @property
    def expect(self):
        return self.rule.expansion[self.ptr]

    @property
    def is_complete(self):
        return self.ptr == len(self.rule.expansion)

    def advance(self, tree):
        assert self.tree.data == 'drv'
        new_tree = Derivation(self.rule, self.tree.children + [tree])
        return Item(self.rule, self.ptr + 1, self.start, new_tree)

    def __eq__(self, other):
        return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule

    def __hash__(self):
        return hash((self.rule, self.ptr, id(self.start)))

    def __repr__(self):
        before = list(map(str, self.rule.expansion[:self.ptr]))
        after = list(map(str, self.rule.expansion[self.ptr:]))
        return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))

class NewsList(list):
    "Keeps track of newly added items (append-only)"

    def __init__(self, initial=None):
        list.__init__(self, initial or [])
        self.last_iter = 0

    def get_news(self):
        i = self.last_iter
        self.last_iter = len(self)
        return self[i:]

class Column:
    "An entry in the table, aka Earley Chart. Contains lists of items."

    def __init__(self, i):
        self.i = i
        self.to_reduce = NewsList()
        self.to_predict = NewsList()
        self.to_scan = NewsList()
        self.item_count = 0

        self.added = set()
        self.completed = {}

    def add(self, items):
        """Sort items into scan/predict/reduce newslists

        Makes sure only unique items are added.
        """
        for item in items:
            if item.is_complete:
                # XXX Potential bug: What happens if there's ambiguity in an empty rule?
                if item.rule.expansion and item in self.completed:
                    old_tree = self.completed[item].tree
                    if old_tree.data != '_ambig':
                        new_tree = old_tree.copy()
                        new_tree.rule = old_tree.rule
                        old_tree.set('_ambig', [new_tree])
                    if item.tree.children[0] is old_tree:   # XXX a little hacky!
                        raise ParseError("Infinite recursion in grammar!")
                    old_tree.children.append(item.tree)
                else:
                    self.completed[item] = item
                    self.to_reduce.append(item)
            else:
                if item not in self.added:
                    self.added.add(item)
                    if isinstance(item.expect, Terminal):
                        self.to_scan.append(item)
                    else:
                        self.to_predict.append(item)
                    self.item_count += 1   # Only count if actually added

    def __nonzero__(self):
        return bool(self.item_count)
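# A minimal sketch (not part of the diff) of the ambiguity packing performed by
# Column.add above, using lark's Tree directly (the import path is assumed):
# the first duplicate completion turns the stored derivation into an '_ambig'
# node, and every later duplicate is appended as an alternative child.
from lark.tree import Tree

old_drv = Tree('drv', ['first derivation'])
new_drv = Tree('drv', ['second derivation'])

if old_drv.data != '_ambig':
    # Wrap the existing derivation in an '_ambig' node, preserving its children
    old_drv.set('_ambig', [Tree('drv', list(old_drv.children))])
old_drv.children.append(new_drv)

assert old_drv.data == '_ambig' and len(old_drv.children) == 2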
from .earley import ResolveAmbig, ApplyCallbacks, Item, NewsList, Derivation, END_TOKEN, Column   # relative import: earley is a sibling module in lark.parsers
class Parser:
    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=True, ignore=()):
@@ -144,7 +44,7 @@ class Parser:
    def parse(self, stream, start_symbol=None):
        # Define parser functions
        start_symbol = start_symbol or self.start_symbol

        matched_terminals = defaultdict(list)
        delayed_matches = defaultdict(list)

        def predict(nonterm, column):
            assert not isinstance(nonterm, Terminal), nonterm
@@ -178,16 +78,17 @@ class Parser:
            for item in to_scan:
                m = item.expect.match(stream, i)
                if m:
                    matched_terminals[m.end()].append(item.advance(m.group(0)))
                    delayed_matches[m.end()].append(item.advance(m.group(0)))
                    s = m.group(0)
                    for j in range(1, len(s)):
                        m = item.expect.match(s[:-j])
                        if m:
                            matched_terminals[m.end()].append(item.advance(m.group(0)))
                            delayed_matches[m.end()].append(item.advance(m.group(0)))

            next_set = Column(i + 1)
            next_set.add(matched_terminals[i+1])
            next_set.add(delayed_matches[i+1])
            del delayed_matches[i+1]   # No longer needed, so unburden memory

            return next_set
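# A minimal sketch (not part of the diff) of the prefix re-matching above: one
# regexp match of length n also yields a candidate token for every shorter
# prefix the terminal accepts, each filed under the input position where that
# candidate would end, so the corresponding later column can pick it up.
import re
from collections import defaultdict

delayed = defaultdict(list)
expect = re.compile(r'a+')

s = expect.match('aaa', 0).group(0)    # longest match at position 0: 'aaa'
delayed[len(s)].append(s)
for j in range(1, len(s)):
    shorter = expect.match(s[:-j])     # also accept 'aa', then 'a'
    if shorter:
        delayed[shorter.end()].append(shorter.group(0))

assert dict(delayed) == {3: ['aaa'], 2: ['aa'], 1: ['a']}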
@@ -220,73 +121,3 @@ class Parser:
        return ApplyCallbacks(self.postprocess).transform(tree)

class ApplyCallbacks(Transformer_NoRecurse):
    def __init__(self, postprocess):
        self.postprocess = postprocess

    def drv(self, tree):
        children = tree.children
        callback = self.postprocess[tree.rule]
        if callback:
            return callback(children)
        else:
            return Tree(rule.origin, children)

def _compare_rules(rule1, rule2):
    assert rule1.origin == rule2.origin
    c = compare(len(rule1.expansion), len(rule2.expansion))
    if rule1.origin.startswith('__'):   # XXX hack! We need to set priority in parser, not here
        c = -c
    return c

def _compare_drv(tree1, tree2):
    if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
        return -compare(tree1, tree2)

    c = _compare_rules(tree1.rule, tree2.rule)
    if c:
        return c

    # rules are "equal", so compare trees
    for t1, t2 in zip(tree1.children, tree2.children):
        c = _compare_drv(t1, t2)
        if c:
            return c

    return -compare(len(tree1.children), len(tree2.children))

class ResolveAmbig(Visitor_NoRecurse):
    """Resolves ambiguity in resulting parse tree.

    Minimizes rule length, maximizes match length.
    """

    def _ambig(self, tree):
        best = min(tree.children, key=cmp_to_key(_compare_drv))
        assert best.data == 'drv'
        tree.set('drv', best.children)
        tree.rule = best.rule   # needed for applying callbacks
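# A minimal sketch (not part of the diff) of the comparator-based selection
# used by ResolveAmbig: a cmp-style function returning -1/0/1 is adapted with
# functools.cmp_to_key so min() picks the preferred derivation (here simply
# the one with the shortest expansion; the fake rules are illustrative only).
from functools import cmp_to_key

def cmp(a, b):
    return (a > b) - (a < b)   # same contract as lark's utils.compare

rules = [('expr', 3), ('expr', 1), ('expr', 2)]   # (origin, expansion length)
best = min(rules, key=cmp_to_key(lambda r1, r2: cmp(r1[1], r2[1])))
assert best == ('expr', 1)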
# RULES = [
#     ('a', ['d']),
#     ('d', ['b']),
#     ('b', ['C']),
#     ('b', ['b', 'C']),
#     ('b', ['C', 'b']),
# ]
# p = Parser(RULES, 'a')
# for x in p.parse('CC'):
#     print x.pretty()

#---------------

# RULES = [
#     ('s', ['a', 'a']),
#     ('a', ['b', 'b']),
#     ('b', ['C'], lambda (x,): x),
#     ('b', ['b', 'C']),
# ]
# p = Parser(RULES, 's', {})
# print p.parse('CCCCC').pretty()
@@ -4,10 +4,23 @@ import unittest
import logging

from .test_trees import TestTrees
# from .test_selectors import TestSelectors
from .test_parser import TestLalrStandard, TestEarleyStandard, TestLalrContextual, TestParsers, TestEarleyScanless, TestEarley, TestEarleyDynamic
# from .test_grammars import TestPythonG, TestConfigG
from .test_parser import (
    TestLalrStandard,
    TestEarleyStandard,
    TestLalrContextual,
    TestEarleyScanless,
    TestEarleyDynamic,
    TestFullEarleyScanless,
    TestFullEarleyDynamic,
    TestParsers,
)

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
@@ -51,90 +51,95 @@ class TestParsers(unittest.TestCase):
        self.assertRaises(ParseError, l.parse, 'a')

class TestEarley(unittest.TestCase):
    def test_anon_in_scanless(self):
        # Fails an Earley implementation without special handling for empty rules,
        # or re-processing of already completed rules.
        g = Lark(r"""start: B
                     B: ("ab"|/[^b]/)*
                     """, lexer='dynamic')

def _make_full_earley_test(LEXER):
    class _TestFullEarley(unittest.TestCase):
        def test_anon_in_scanless(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = Lark(r"""start: B
                         B: ("ab"|/[^b]/)*
                         """, lexer=LEXER)

        self.assertEqual(g.parse('abc').children[0], 'abc')
            self.assertEqual(g.parse('abc').children[0], 'abc')

    def test_earley_scanless(self):
        g = Lark("""start: A "b" c
                    A: "a"+
                    c: "abc"
                    """, parser="earley", lexer='dynamic')
        x = g.parse('aaaababc')

        def test_earley_scanless(self):
            g = Lark("""start: A "b" c
                        A: "a"+
                        c: "abc"
                        """, parser="earley", lexer=LEXER)
            x = g.parse('aaaababc')

    def test_earley_scanless2(self):
        grammar = """
            start: statement+

        def test_earley_scanless2(self):
            grammar = """
                start: statement+

            statement: "r"
                     | "c" /[a-z]/+

                statement: "r"
                         | "c" /[a-z]/+

            %ignore " "
            """

                %ignore " "
                """

        program = """c b r"""

            program = """c b r"""

        l = Lark(grammar, parser='earley', lexer='dynamic')
        l.parse(program)

            l = Lark(grammar, parser='earley', lexer=LEXER)
            l.parse(program)

    def test_earley_scanless3(self):
        "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"

        def test_earley_scanless3(self):
            "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"

        grammar = """
            start: A A
            A: "a"+
            """

            grammar = """
                start: A A
                A: "a"+
                """

        l = Lark(grammar, parser='earley', lexer='dynamic')
        res = l.parse("aaa")
        self.assertEqual(res.children, ['aa', 'a'])

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aa', 'a'])

    def test_earley_scanless4(self):
        grammar = """
            start: A A?
            A: "a"+
            """

        def test_earley_scanless4(self):
            grammar = """
                start: A A?
                A: "a"+
                """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            self.assertEqual(res.children, ['aaa'])

        l = Lark(grammar, parser='earley', lexer='dynamic')
        res = l.parse("aaa")
        self.assertEqual(res.children, ['aaa'])

    def test_earley_repeating_empty(self):
        # This was a sneaky bug!

        def test_earley_repeating_empty(self):
            # This was a sneaky bug!

        grammar = """
            !start: "a" empty empty "b"
            empty: empty2
            empty2:
            """

            grammar = """
                !start: "a" empty empty "b"
                empty: empty2
                empty2:
                """

            parser = Lark(grammar, parser='earley', lexer=LEXER)
            res = parser.parse('ab')

        parser = Lark(grammar, parser='earley', lexer='dynamic')
        res = parser.parse('ab')

        empty_tree = Tree('empty', [Tree('empty2', [])])
        self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

            empty_tree = Tree('empty', [Tree('empty2', [])])
            self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

    def test_earley_explicit_ambiguity(self):
        # This was a sneaky bug!

        def test_earley_explicit_ambiguity(self):
            # This was a sneaky bug!

        grammar = """
            start: a b | ab
            a: "a"
            b: "b"
            ab: "ab"
            """

            grammar = """
                start: a b | ab
                a: "a"
                b: "b"
                ab: "ab"
                """

            parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
            res = parser.parse('ab')

        parser = Lark(grammar, parser='earley', lexer='dynamic', ambiguity='explicit')
        res = parser.parse('ab')

        self.assertEqual(res.data, '_ambig')
        self.assertEqual(len(res.children), 2)

            self.assertEqual(res.data, '_ambig')
            self.assertEqual(len(res.children), 2)

    _NAME = "TestFullEarley" + (LEXER or 'Scanless').capitalize()
    _TestFullEarley.__name__ = _NAME
    globals()[_NAME] = _TestFullEarley
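# A minimal sketch (not part of the diff) of the class-factory pattern used by
# _make_full_earley_test above: one TestCase template is specialized per
# configuration and published under a unique name in globals(), which is how
# unittest's collector finds each generated variant.
import unittest

def _make_demo_test(LEXER):
    class _Demo(unittest.TestCase):
        def test_lexer_is_known(self):
            self.assertIn(LEXER, (None, 'dynamic'))
    _Demo.__name__ = 'TestDemo' + (LEXER or 'scanless').capitalize()
    globals()[_Demo.__name__] = _Demo

for _lexer in (None, 'dynamic'):
    _make_demo_test(_lexer)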
def _make_parser_test(LEXER, PARSER):
@@ -444,7 +449,7 @@ def _make_parser_test(LEXER, PARSER):
            """)
        x = g.parse('aababc')

    @unittest.skipIf(LEXER is None, "Known bug with scanless parsing")  # TODO
    @unittest.skipIf(LEXER in (None, 'dynamic'), "Known bug with scanless parsing")  # TODO
    def test_token_not_anon(self):
        """Tests that "a" is matched as A, rather than an anonymous token.
@@ -664,6 +669,8 @@ _TO_TEST = [

for _LEXER, _PARSER in _TO_TEST:
    _make_parser_test(_LEXER, _PARSER)

for _LEXER in (None, 'dynamic'):
    _make_full_earley_test(_LEXER)

if __name__ == '__main__':
    unittest.main()