From dda0719375a63613039c2d7fe0a216577911ac0d Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 4 Aug 2017 15:32:39 +0300 Subject: [PATCH] Post-merge cleanup --- examples/json_parser.py | 10 +- lark/parser_frontends.py | 72 +-------------- lark/parsers/earley.py | 6 +- lark/parsers/xearley.py | 195 +++------------------------------------ tests/__main__.py | 15 ++- tests/test_parser.py | 137 ++++++++++++++------------- 6 files changed, 110 insertions(+), 325 deletions(-) diff --git a/examples/json_parser.py b/examples/json_parser.py index 56d6a0b..4f5feaf 100644 --- a/examples/json_parser.py +++ b/examples/json_parser.py @@ -47,12 +47,12 @@ class TreeToJson(Transformer): true = lambda self, _: True false = lambda self, _: False -json_parser = Lark(json_grammar, parser='earley', lexer='dynamic') -def parse(x): - return TreeToJson().transform(json_parser.parse(x)) +# json_parser = Lark(json_grammar, parser='earley', lexer='standard') +# def parse(x): +# return TreeToJson().transform(json_parser.parse(x)) -# json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson()) -# parse = json_parser.parse +json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson()) +parse = json_parser.parse def test(): test_json = ''' diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index e0a7578..46f75b4 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -4,7 +4,7 @@ import sre_parse from .lexer import Lexer, ContextualLexer, Token from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token -from .parsers import lalr_parser, old_earley, nearley, earley +from .parsers import lalr_parser, earley from .tree import Transformer from .parsers import xearley @@ -49,47 +49,6 @@ class LALR_ContextualLexer: tokens = self.lexer_conf.postlex.process(tokens) return self.parser.parse(tokens, self.lexer.set_parser_state) - - -class Nearley(WithLexer): - def __init__(self, lexer_conf, parser_conf): - WithLexer.__init__(self, lexer_conf) - - rules = [{'name':n, - 'symbols': self._prepare_expansion(x), - 'postprocess': getattr(parser_conf.callback, a)} - for n,x,a in parser_conf.rules] - - self.parser = nearley.Parser(rules, parser_conf.start) - - def _prepare_expansion(self, expansion): - return [(sym, None) if is_terminal(sym) else sym for sym in expansion] - - def parse(self, text): - tokens = list(self.lex(text)) - res = self.parser.parse(tokens) - assert len(res) ==1 , 'Ambiguious Parse! Not handled yet' - return res[0] - - -class OldEarley(WithLexer): - def __init__(self, lexer_conf, parser_conf): - WithLexer.__init__(self, lexer_conf) - - rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules] - - self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start)) - - def _prepare_expansion(self, expansion): - return [(sym,) if is_terminal(sym) else sym for sym in expansion] - - def parse(self, text): - tokens = list(self.lex(text)) - res = self.parser.parse(tokens) - assert len(res) ==1 , 'Ambiguious Parse! 
Not handled yet' - return res[0] - - def tokenize_text(text): new_text = [] line = 1 @@ -101,32 +60,6 @@ def tokenize_text(text): new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos)) return new_text - -class OldEarley_NoLex: - def __init__(self, lexer_conf, parser_conf): - self.token_by_name = {t.name:t for t in lexer_conf.tokens} - - rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules] - - self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start)) - - def _prepare_expansion(self, expansion): - for sym in expansion: - if is_terminal(sym): - regexp = self.token_by_name[sym].pattern.to_regexp() - width = sre_parse.parse(regexp).getwidth() - if width != (1,1): - raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) - yield (re.compile(regexp).match, regexp) - else: - yield sym - - def parse(self, text): - new_text = tokenize_text(text) - res = self.parser.parse(new_text) - assert len(res) ==1 , 'Ambiguious Parse! Not handled yet' - return res[0] - class Earley_NoLex: def __init__(self, lexer_conf, parser_conf, options=None): self.token_by_name = {t.name:t for t in lexer_conf.tokens} @@ -178,7 +111,7 @@ class XEarley: def __init__(self, lexer_conf, parser_conf, options=None): self.token_by_name = {t.name:t for t in lexer_conf.tokens} - rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules] + rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] resolve_ambiguity = (options.ambiguity=='resolve') if options else True ignore = [Terminal_Regexp(self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore] @@ -195,6 +128,7 @@ class XEarley: if is_terminal(sym): regexp = self.token_by_name[sym].pattern.to_regexp() width = sre_parse.parse(regexp).getwidth() + assert width yield Terminal_Regexp(regexp) else: yield sym diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index dbcbda3..9755b99 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -238,13 +238,13 @@ def _compare_rules(rule1, rule2): def _compare_drv(tree1, tree2): if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)): - return compare(tree1, tree2) + return -compare(tree1, tree2) try: rule1, rule2 = tree1.rule, tree2.rule except AttributeError: # Probably trees that don't take part in this parse (better way to distinguish?) - return compare(tree1, tree2) + return -compare(tree1, tree2) # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse, # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be @@ -264,7 +264,7 @@ def _compare_drv(tree1, tree2): if c: return c - return compare(len(tree1.children), len(tree2.children)) + return -compare(len(tree1.children), len(tree2.children)) def _resolve_ambig(tree): diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index cd34bfc..7bffbb7 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -1,4 +1,4 @@ -"This module implements an Earley Parser" +"This module implements an experimental Earley Parser with a dynamic lexer" # The parser uses a parse-forest to keep track of derivations and ambiguations. # When the parse ends successfully, a disambiguation stage resolves all ambiguity @@ -10,121 +10,21 @@ # The algorithm keeps track of each state set, using a corresponding Column instance. 
 # Column keeps track of new items using NewsList instances.
 #
+# Instead of running a lexer beforehand, or using a costly char-by-char method, this parser
+# uses regular expressions as needed, achieving high performance while maintaining all of
+# Earley's power in parsing any CFG.
+#
+#
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
 
-from functools import cmp_to_key
 from collections import defaultdict
 
-from ..utils import compare
 from ..common import ParseError, UnexpectedToken, Terminal
-from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse
+from ..tree import Tree
 from .grammar_analysis import GrammarAnalyzer
 
-
-class EndToken:
-    type = '$end'
-
-class Derivation(Tree):
-    def __init__(self, rule, items=None):
-        Tree.__init__(self, 'drv', items or [])
-        self.rule = rule
-
-END_TOKEN = EndToken()
-
-class Item(object):
-    "An Earley Item, the atom of the algorithm."
-
-    def __init__(self, rule, ptr, start, tree):
-        self.rule = rule
-        self.ptr = ptr
-        self.start = start
-        self.tree = tree if tree is not None else Derivation(self.rule)
-
-    @property
-    def expect(self):
-        return self.rule.expansion[self.ptr]
-
-    @property
-    def is_complete(self):
-        return self.ptr == len(self.rule.expansion)
-
-    def advance(self, tree):
-        assert self.tree.data == 'drv'
-        new_tree = Derivation(self.rule, self.tree.children + [tree])
-        return Item(self.rule, self.ptr+1, self.start, new_tree)
-
-    def __eq__(self, other):
-        return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
-    def __hash__(self):
-        return hash((self.rule, self.ptr, id(self.start)))
-
-    def __repr__(self):
-        before = list(map(str, self.rule.expansion[:self.ptr]))
-        after = list(map(str, self.rule.expansion[self.ptr:]))
-        return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
-
-
-class NewsList(list):
-    "Keeps track of newly added items (append-only)"
-
-    def __init__(self, initial=None):
-        list.__init__(self, initial or [])
-        self.last_iter = 0
-
-    def get_news(self):
-        i = self.last_iter
-        self.last_iter = len(self)
-        return self[i:]
-
-
-
-class Column:
-    "An entry in the table, aka Earley Chart. Contains lists of items."
-    def __init__(self, i):
-        self.i = i
-        self.to_reduce = NewsList()
-        self.to_predict = NewsList()
-        self.to_scan = NewsList()
-        self.item_count = 0
-
-        self.added = set()
-        self.completed = {}
-
-    def add(self, items):
-        """Sort items into scan/predict/reduce newslists
-
-        Makes sure only unique items are added.
-        """
-
-        for item in items:
-
-            if item.is_complete:
-                # XXX Potential bug: What happens if there's ambiguity in an empty rule?
-                if item.rule.expansion and item in self.completed:
-                    old_tree = self.completed[item].tree
-                    if old_tree.data != '_ambig':
-                        new_tree = old_tree.copy()
-                        new_tree.rule = old_tree.rule
-                        old_tree.set('_ambig', [new_tree])
-                    if item.tree.children[0] is old_tree:   # XXX a little hacky!
-                        raise ParseError("Infinite recursion in grammar!")
-                    old_tree.children.append(item.tree)
-                else:
-                    self.completed[item] = item
-                self.to_reduce.append(item)
-            else:
-                if item not in self.added:
-                    self.added.add(item)
-                    if isinstance(item.expect, Terminal):
-                        self.to_scan.append(item)
-                    else:
-                        self.to_predict.append(item)
-
-            self.item_count += 1    # Only count if actually added
-
-    def __nonzero__(self):
-        return bool(self.item_count)
+from .earley import ResolveAmbig, ApplyCallbacks, Item, NewsList, Derivation, END_TOKEN, Column
 
 class Parser:
     def __init__(self, rules, start_symbol, callback, resolve_ambiguity=True, ignore=()):
@@ -144,7 +44,7 @@ class Parser:
     def parse(self, stream, start_symbol=None):
         # Define parser functions
        start_symbol = start_symbol or self.start_symbol
-        matched_terminals = defaultdict(list)
+        delayed_matches = defaultdict(list)
 
         def predict(nonterm, column):
             assert not isinstance(nonterm, Terminal), nonterm
@@ -178,16 +78,17 @@ class Parser:
             for item in to_scan:
                 m = item.expect.match(stream, i)
                 if m:
-                    matched_terminals[m.end()].append(item.advance(m.group(0)))
+                    delayed_matches[m.end()].append(item.advance(m.group(0)))
                     s = m.group(0)
                     for j in range(1, len(s)):
                         m = item.expect.match(s[:-j])
                         if m:
-                            matched_terminals[m.end()].append(item.advance(m.group(0)))
+                            delayed_matches[m.end()].append(item.advance(m.group(0)))
 
             next_set = Column(i+1)
-            next_set.add(matched_terminals[i+1])
+            next_set.add(delayed_matches[i+1])
+            del delayed_matches[i+1]    # No longer needed; free the memory
 
             return next_set
 
@@ -220,73 +121,3 @@ class Parser:
 
         return ApplyCallbacks(self.postprocess).transform(tree)
 
-
-class ApplyCallbacks(Transformer_NoRecurse):
-    def __init__(self, postprocess):
-        self.postprocess = postprocess
-
-    def drv(self, tree):
-        children = tree.children
-        callback = self.postprocess[tree.rule]
-        if callback:
-            return callback(children)
-        else:
-            return Tree(rule.origin, children)
-
-def _compare_rules(rule1, rule2):
-    assert rule1.origin == rule2.origin
-    c = compare( len(rule1.expansion), len(rule2.expansion))
-    if rule1.origin.startswith('__'):   # XXX hack! We need to set priority in parser, not here
-        c = -c
-    return c
-
-def _compare_drv(tree1, tree2):
-    if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
-        return -compare(tree1, tree2)
-
-    c = _compare_rules(tree1.rule, tree2.rule)
-    if c:
-        return c
-
-    # rules are "equal", so compare trees
-    for t1, t2 in zip(tree1.children, tree2.children):
-        c = _compare_drv(t1, t2)
-        if c:
-            return c
-
-    return -compare(len(tree1.children), len(tree2.children))
-
-
-class ResolveAmbig(Visitor_NoRecurse):
-    """Resolves ambiguity in resulting parse tree.
-
-    Minimizes rule length, maximizes match length.
- """ - - def _ambig(self, tree): - best = min(tree.children, key=cmp_to_key(_compare_drv)) - assert best.data == 'drv' - tree.set('drv', best.children) - tree.rule = best.rule # needed for applying callbacks - - -# RULES = [ -# ('a', ['d']), -# ('d', ['b']), -# ('b', ['C']), -# ('b', ['b', 'C']), -# ('b', ['C', 'b']), -# ] -# p = Parser(RULES, 'a') -# for x in p.parse('CC'): -# print x.pretty() - -#--------------- -# RULES = [ -# ('s', ['a', 'a']), -# ('a', ['b', 'b']), -# ('b', ['C'], lambda (x,): x), -# ('b', ['b', 'C']), -# ] -# p = Parser(RULES, 's', {}) -# print p.parse('CCCCC').pretty() diff --git a/tests/__main__.py b/tests/__main__.py index 1811f81..df1c65e 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -4,10 +4,23 @@ import unittest import logging from .test_trees import TestTrees + # from .test_selectors import TestSelectors -from .test_parser import TestLalrStandard, TestEarleyStandard, TestLalrContextual, TestParsers, TestEarleyScanless, TestEarley, TestEarleyDynamic # from .test_grammars import TestPythonG, TestConfigG +from .test_parser import ( + TestLalrStandard, + TestEarleyStandard, + TestLalrContextual, + TestEarleyScanless, + TestEarleyDynamic, + + TestFullEarleyScanless, + TestFullEarleyDynamic, + + TestParsers, + ) + logging.basicConfig(level=logging.INFO) if __name__ == '__main__': diff --git a/tests/test_parser.py b/tests/test_parser.py index 0097b92..363e042 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -51,90 +51,95 @@ class TestParsers(unittest.TestCase): self.assertRaises(ParseError, l.parse, 'a') -class TestEarley(unittest.TestCase): - def test_anon_in_scanless(self): - # Fails an Earley implementation without special handling for empty rules, - # or re-processing of already completed rules. - g = Lark(r"""start: B - B: ("ab"|/[^b]/)* - """, lexer='dynamic') +def _make_full_earley_test(LEXER): + class _TestFullEarley(unittest.TestCase): + def test_anon_in_scanless(self): + # Fails an Earley implementation without special handling for empty rules, + # or re-processing of already completed rules. 
+ g = Lark(r"""start: B + B: ("ab"|/[^b]/)* + """, lexer=LEXER) - self.assertEqual( g.parse('abc').children[0], 'abc') + self.assertEqual( g.parse('abc').children[0], 'abc') - def test_earley_scanless(self): - g = Lark("""start: A "b" c - A: "a"+ - c: "abc" - """, parser="earley", lexer='dynamic') - x = g.parse('aaaababc') + def test_earley_scanless(self): + g = Lark("""start: A "b" c + A: "a"+ + c: "abc" + """, parser="earley", lexer=LEXER) + x = g.parse('aaaababc') - def test_earley_scanless2(self): - grammar = """ - start: statement+ + def test_earley_scanless2(self): + grammar = """ + start: statement+ - statement: "r" - | "c" /[a-z]/+ + statement: "r" + | "c" /[a-z]/+ - %ignore " " - """ + %ignore " " + """ - program = """c b r""" + program = """c b r""" - l = Lark(grammar, parser='earley', lexer='dynamic') - l.parse(program) + l = Lark(grammar, parser='earley', lexer=LEXER) + l.parse(program) - def test_earley_scanless3(self): - "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" + def test_earley_scanless3(self): + "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" - grammar = """ - start: A A - A: "a"+ - """ + grammar = """ + start: A A + A: "a"+ + """ - l = Lark(grammar, parser='earley', lexer='dynamic') - res = l.parse("aaa") - self.assertEqual(res.children, ['aa', 'a']) + l = Lark(grammar, parser='earley', lexer=LEXER) + res = l.parse("aaa") + self.assertEqual(res.children, ['aa', 'a']) - def test_earley_scanless4(self): - grammar = """ - start: A A? - A: "a"+ - """ + def test_earley_scanless4(self): + grammar = """ + start: A A? + A: "a"+ + """ + + l = Lark(grammar, parser='earley', lexer=LEXER) + res = l.parse("aaa") + self.assertEqual(res.children, ['aaa']) - l = Lark(grammar, parser='earley', lexer='dynamic') - res = l.parse("aaa") - self.assertEqual(res.children, ['aaa']) + def test_earley_repeating_empty(self): + # This was a sneaky bug! - def test_earley_repeating_empty(self): - # This was a sneaky bug! + grammar = """ + !start: "a" empty empty "b" + empty: empty2 + empty2: + """ - grammar = """ - !start: "a" empty empty "b" - empty: empty2 - empty2: - """ + parser = Lark(grammar, parser='earley', lexer=LEXER) + res = parser.parse('ab') - parser = Lark(grammar, parser='earley', lexer='dynamic') - res = parser.parse('ab') + empty_tree = Tree('empty', [Tree('empty2', [])]) + self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b']) - empty_tree = Tree('empty', [Tree('empty2', [])]) - self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b']) + def test_earley_explicit_ambiguity(self): + # This was a sneaky bug! - def test_earley_explicit_ambiguity(self): - # This was a sneaky bug! 
+ grammar = """ + start: a b | ab + a: "a" + b: "b" + ab: "ab" + """ - grammar = """ - start: a b | ab - a: "a" - b: "b" - ab: "ab" - """ + parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit') + res = parser.parse('ab') - parser = Lark(grammar, parser='earley', lexer='dynamic', ambiguity='explicit') - res = parser.parse('ab') + self.assertEqual( res.data, '_ambig') + self.assertEqual( len(res.children), 2) - self.assertEqual( res.data, '_ambig') - self.assertEqual( len(res.children), 2) + _NAME = "TestFullEarley" + (LEXER or 'Scanless').capitalize() + _TestFullEarley.__name__ = _NAME + globals()[_NAME] = _TestFullEarley def _make_parser_test(LEXER, PARSER): @@ -444,7 +449,7 @@ def _make_parser_test(LEXER, PARSER): """) x = g.parse('aababc') - @unittest.skipIf(LEXER is None, "Known bug with scanless parsing") # TODO + @unittest.skipIf(LEXER in (None, 'dynamic'), "Known bug with scanless parsing") # TODO def test_token_not_anon(self): """Tests that "a" is matched as A, rather than an anonymous token. @@ -664,6 +669,8 @@ _TO_TEST = [ for _LEXER, _PARSER in _TO_TEST: _make_parser_test(_LEXER, _PARSER) +for _LEXER in (None, 'dynamic'): + _make_full_earley_test(_LEXER) if __name__ == '__main__': unittest.main()
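
A minimal sketch (not part of the patch) of the dynamic-lexer idea described in xearley.py's new header comment: instead of tokenizing up front, the parser matches each expected terminal's regexp directly against the input at the current position, and files every match under the column where it ends. That is the role of the dict this patch renames from matched_terminals to delayed_matches, which also lets each column's entry be deleted once consumed. The helper below uses invented names (scan, expected_regexps) for illustration only:

    import re
    from collections import defaultdict

    def scan(stream, i, expected_regexps):
        # Match every terminal the parser currently expects at position i.
        # Successful matches are filed under their end position, so the main
        # loop can collect them on reaching that column, then free the entry.
        delayed_matches = defaultdict(list)
        for regexp in expected_regexps:
            m = re.compile(regexp).match(stream, i)
            if m:
                delayed_matches[m.end()].append(m.group(0))
        return delayed_matches

    # At position 0 of 'aaab', expecting A: "a"+ and B: "b":
    # scan('aaab', 0, [r'a+', r'b']) == {3: ['aaa']}

The scan step in the patch additionally re-matches every proper prefix of a successful match (the for j in range(1, len(s)) loop), so that shorter terminal matches remain available to Earley's ambiguity resolution.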