diff --git a/examples/json_parser.py b/examples/json_parser.py new file mode 100644 index 0000000..485ea52 --- /dev/null +++ b/examples/json_parser.py @@ -0,0 +1,64 @@ +import sys +from lark.lark import Lark, inline_args +from lark.tree import Transformer + +json_grammar = r""" + ?start: value + + ?value: object + | array + | string + | number + | "true" -> true + | "false" -> false + | "null" -> null + + array : "[" [value ("," value)*] "]" + object : "{" [pair ("," pair)*] "}" + pair : string ":" value + + number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/ + string : /".*?(?' : 'MORETHAN', '=' : 'EQUAL', - '.' : 'DOT', + '.' : '_DOT', '%' : 'PERCENT', '`' : 'BACKQUOTE', '^' : 'CIRCUMFLEX', @@ -34,8 +36,8 @@ _TOKEN_NAMES = { '\'' : 'QUOTE', '~' : 'TILDE', '@' : 'AT', - '(' : 'LPAR', - ')' : 'RPAR', + '(' : '_LPAR', + ')' : '_RPAR', '{' : 'LBRACE', '}' : 'RBRACE', '[' : 'LSQB', @@ -44,151 +46,58 @@ _TOKEN_NAMES = { # Grammar Parser TOKENS = { - 'LPAR': '\(', - 'RPAR': '\)', - 'LBRA': '\[', - 'RBRA': '\]', + '_LPAR': '\(', + '_RPAR': '\)', + '_LBRA': '\[', + '_RBRA': '\]', 'OP': '[+*?]', - 'COLON': ':', - 'OR': '\|', - 'DOT': '\.', + '_COLON': ':', + '_OR': '\|', + '_DOT': '\.', 'RULE': '[_?*]?[a-z][_a-z0-9]*', 'TOKEN': '_?[A-Z][_A-Z0-9]*', 'STRING': r'".*?[^\\]"', 'REGEXP': r"/(.|\n)*?[^\\]/", - 'NL': r'(\r?\n)+\s*', + '_NL': r'(\r?\n)+\s*', 'WS': r'[ \t]+', 'COMMENT': r'//[^\n]*\n', - 'TO': '->' + '_TO': '->' } -RULES = [ - ('start', ['list']), - ('list', ['item']), - ('list', ['list', 'item']), - ('item', ['rule']), - ('item', ['token']), - ('item', ['NL']), - - ('rule', ['RULE', 'COLON', 'expansions', 'NL']), - ('expansions', ['expansion']), - ('expansions', ['expansions', 'OR', 'expansion']), - ('expansions', ['expansions', 'NL', 'OR', 'expansion']), - - ('expansion', ['_expansion']), - ('expansion', ['_expansion', 'TO', 'RULE']), +RULES = { + 'start': ['list'], + 'list': ['item', 'list item'], + 'item': ['rule', 'token', '_NL'], - ('_expansion', []), - ('_expansion', ['_expansion', 'expr']), + 'rule': ['RULE _COLON expansions _NL'], + 'expansions': ['expansion', + 'expansions _OR expansion', + 'expansions _NL _OR expansion'], - ('expr', ['atom']), - ('expr', ['atom', 'OP']), + 'expansion': ['_expansion', + '_expansion _TO RULE'], - ('atom', ['LPAR', 'expansions', 'RPAR']), - ('atom', ['maybe']), + '_expansion': ['', '_expansion expr'], - ('atom', ['RULE']), - ('atom', ['TOKEN']), - ('atom', ['anontoken']), + '?expr': ['atom', + 'atom OP'], - ('anontoken', ['tokenvalue']), + '?atom': ['_LPAR expansions _RPAR', + 'maybe', + 'RULE', + 'TOKEN', + 'anontoken'], - ('maybe', ['LBRA', 'expansions', 'RBRA']), + 'anontoken': ['tokenvalue'], - ('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']), - ('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']), - ('tokenvalue', ['REGEXP']), - ('tokenvalue', ['STRING']), - ('tokenmods', ['DOT', 'RULE']), - ('tokenmods', ['tokenmods', 'DOT', 'RULE']), -] - -class SaveDefinitions(object): - def __init__(self): - self.rules = {} - self.token_set = set() - self.tokens = [] - self.i = 0 + 'maybe': ['_LBRA expansions _RBRA'], + 'token': ['TOKEN _COLON tokenvalue _NL', + 'TOKEN tokenmods _COLON tokenvalue _NL'], - def atom__3(self, _1, value, _2): - return value - def atom__1(self, value): - return value - - def expr__1(self, expr): - return expr - def expr(self, *x): - return T('expr', x) - - def expansion__1(self, expansion): - return expansion - def expansion__3(self, expansion, _, alias): - return T('alias', [expansion, alias]) - def _expansion(self, *x): - 
return T('expansion', x) - - def expansions(self, *x): - items = [i for i in x if isinstance(i, T)] - return T('expansions', items) - - def maybe(self, _1, expr, _2): - return T('expr', [expr, Token('OP', '?', -1)]) - - def rule(self, name, _1, expansion, _2): - name = name.value - if name in self.rules: - raise ValueError("Rule '%s' defined more than once" % name) - - self.rules[name] = expansion - - def token(self, *x): - name = x[0].value - if name in self.token_set: - raise ValueError("Token '%s' defined more than once" % name) - self.token_set.add(name) - - if len(x) == 4: - self.tokens.append((name, x[2], [])) - else: - self.tokens.append((name, x[3], x[1].children)) - - def tokenvalue(self, tokenvalue): - return tokenvalue - - def anontoken(self, token): - if token.type == 'STRING': - value = token.value[1:-1] - try: - token_name = _TOKEN_NAMES[value] - except KeyError: - if value.isalnum() and value[0].isalpha(): - token_name = value.upper() - else: - token_name = 'ANONSTR_%d' % self.i - self.i += 1 - token_name = '__' + token_name - - elif token.type == 'REGEXP': - token_name = 'ANONRE_%d' % self.i - self.i += 1 - else: - assert False, x - - if token_name not in self.token_set: - self.token_set.add(token_name) - self.tokens.append((token_name, token, [])) - - return Token('TOKEN', token_name, -1) - - def tokenmods__2(self, _, rule): - return T('tokenmods', [rule.value]) - def tokenmods__3(self, tokenmods, _, rule): - return T('tokenmods', tokenmods.children + [rule.value]) - - def start(self, *x): pass - def list(self, *x): pass - def item(self, *x): pass + '?tokenvalue': ['REGEXP', 'STRING'], + 'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'], +} class EBNF_to_BNF(InlineTransformer): @@ -281,46 +190,110 @@ def dict_update_safe(d1, d2): d1[k] = v -def generate_aliases(): - sd = SaveDefinitions() - for name, expansion in RULES: - try: - f = getattr(sd, "%s__%s" % (name, len(expansion))) - except AttributeError: - f = getattr(sd, name) - yield name, expansion, f.__name__ +class RuleTreeToText(Transformer): + def expansions(self, x): + return x + def expansion(self, symbols): + return [sym.value for sym in symbols], None + def alias(self, ((expansion, _alias), alias)): + assert _alias is None, (alias, expansion, '-', _alias) + return expansion, alias.value + + +class SimplifyTree(InlineTransformer): + def maybe(self, expr): + return T('expr', [expr, Token('OP', '?', -1)]) + + def tokenmods(self, *args): + if len(args) == 1: + return list(args) + tokenmods, value = args + return tokenmods + [value] + +def get_tokens(tree, token_set): + tokens = [] + for t in tree.find_data('token'): + x = t.children + name = x[0].value + assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name + if name in token_set: + raise ValueError("Token '%s' defined more than once" % name) + token_set.add(name) + + if len(x) == 2: + yield name, x[1], [] + else: + assert len(x) == 3 + yield name, x[2], x[1] + +class ExtractAnonTokens(InlineTransformer): + def __init__(self, tokens, token_set): + self.tokens = tokens + self.token_set = token_set + self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens} + + def anontoken(self, token): + if token.type == 'STRING': + value = token.value[1:-1] + try: + # If already defined, use the user-defined token name + token_name = self.token_reverse[value] + except KeyError: + # Try to assign an indicative anon-token name, otherwise use a numbered name + try: + token_name = _TOKEN_NAMES[value] + except 
KeyError: + if value.isalnum() and value[0].isalpha(): + token_name = value.upper() + else: + token_name = 'ANONSTR_%d' % self.i + self.i += 1 + token_name = '__' + token_name + + elif token.type == 'REGEXP': + token_name = 'ANONRE_%d' % self.i + self.i += 1 + else: + assert False, x + + if token_name not in self.token_set: + self.token_set.add(token_name) + self.tokens.append((token_name, token, [])) + + return Token('TOKEN', token_name, -1) -def inline_args(f): - def _f(self, args): - return f(*args) - return _f class GrammarLoader: def __init__(self): - self.rules = list(generate_aliases()) - self.ga = GrammarAnalyzer(self.rules) - self.ga.analyze() self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT']) - self.simplify_rule = SimplifyRule_Visitor() - def _generate_parser_callbacks(self, callbacks): - d = {alias: inline_args(getattr(callbacks, alias)) - for _n, _x, alias in self.rules} - return type('Callback', (), d)() + d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()} + rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None) + self.parser = LALR().build_parser(rules, callback, 'start') + + self.simplify_tree = SimplifyTree() + self.simplify_rule = SimplifyRule_Visitor() + self.rule_tree_to_text = RuleTreeToText() def load_grammar(self, grammar_text): - sd = SaveDefinitions() - c = self._generate_parser_callbacks(sd) - p = Parser(self.ga, c) - p.parse( list(self.lexer.lex(grammar_text+"\n")) ) + token_stream = list(self.lexer.lex(grammar_text+"\n")) + tree = self.simplify_tree.transform( self.parser.parse(token_stream) ) + + # ================= + # Process Tokens + # ================= + + token_set = set() + tokens = list(get_tokens(tree, token_set)) + extract_anon = ExtractAnonTokens(tokens, token_set) + tree = extract_anon.transform(tree) # Adds to tokens - # Tokens token_ref = {} re_tokens = [] str_tokens = [] - for name, token, flags in sd.tokens: + for name, token, flags in tokens: value = token.value[1:-1] if '\u' in value: # XXX for now, you can't mix unicode escaping and unicode characters at the same token @@ -343,43 +316,70 @@ class GrammarLoader: re_tokens.sort(key=lambda x:len(x[1]), reverse=True) tokens = str_tokens + re_tokens # Order is important! 
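
A quick illustration of why the loader sorts the collected token definitions before handing them to the lexer ("Order is important!"): in a first-match scanner — and in a single Python regexp built by joining the patterns with `|`, which is how such lexers are typically compiled — the first alternative that matches wins, so a literal must be tried before any literal that is a prefix of it. The sketch below is standalone and its names are made up; the token-collision test later in this patch (B: "12" vs A: "1") relies on exactly this ordering.

    import re

    # Two orderings of the same literal tokens: '12', '1', '2'.
    longest_first = re.compile('|'.join(['12', '1', '2']))
    prefix_first  = re.compile('|'.join(['1', '12', '2']))

    # Python's alternation is first-match, not longest-match, so only the
    # length-sorted ordering tokenizes '121' the way the grammar intends.
    assert longest_first.match('121').group() == '12'
    assert prefix_first.match('121').group() == '1'
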
- # Rules + # ================= + # Process Rules + # ================= + ebnf_to_bnf = EBNF_to_BNF() - rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()} + rules = {} + for rule in tree.find_data('rule'): + name, ebnf_tree = rule.children + name = name.value + if name in rules: + raise ValueError("Rule '%s' defined more than once" % name) + + rules[name] = ebnf_to_bnf.transform(ebnf_tree) + dict_update_safe(rules, ebnf_to_bnf.new_rules) for r in rules.values(): self.simplify_rule.visit(r) + rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()} + + + # ==================== + # Verify correctness + # ==================== + used_symbols = {symbol for expansions in rules.values() + for expansion, _alias in expansions + for symbol in expansion} + rule_set = {r.lstrip('?') for r in rules} + for sym in used_symbols: + if is_terminal(sym): + if sym not in token_set: + raise GrammarError("Token '%s' used but not defined" % sym) + else: + if sym not in rule_set: + raise GrammarError("Rule '%s' used but not defined" % sym) + return tokens, rules load_grammar = GrammarLoader().load_grammar + def test(): g = """ start: add - # Rules + // Rules add: mul | add _add_sym mul - mul: _atom - | mul _add_mul _atom + mul: [mul _add_mul] _atom - neg: "-" _atom - - _atom: neg - | number + _atom: "-" _atom -> neg + | NUMBER | "(" add ")" - # Tokens - number: /[\d.]+/ + // Tokens + NUMBER: /[\d.]+/ _add_sym: "+" | "-" _add_mul: "*" | "/" - WS.ignore: /\s+/ + WS.ignore.newline: /\s+/ """ g2 = """ @@ -389,7 +389,9 @@ def test(): c: "c" d: "+" | "-" """ - load_grammar(g) - + # print load_grammar(g) + print GrammarLoader().load_grammar2(g) +if __name__ == '__main__': + test() diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py new file mode 100644 index 0000000..0d3de49 --- /dev/null +++ b/lark/parse_tree_builder.py @@ -0,0 +1,76 @@ +from .grammar_analysis import is_terminal + +class Callback(object): + pass + + +def create_expand1_tree_builder_function(tree_builder): + def f(children): + if len(children) == 1: + return children[0] + else: + return tree_builder(children) + return f + +def create_rule_handler(expansion, usermethod): + to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion) + if not (is_terminal(sym) and sym.startswith('_'))] + + def _build_ast(match): + children = [] + for i, to_expand in to_include: + if to_expand: + children += match[i].children + else: + children.append(match[i]) + + return usermethod(children) + return _build_ast + + +class ParseTreeBuilder: + def __init__(self, tree_class): + self.tree_class = tree_class + + def _create_tree_builder_function(self, name): + tree_class = self.tree_class + def f(children): + return tree_class(name, children) + return f + + + def create_tree_builder(self, rules, transformer): + callback = Callback() + new_rules = [] + for origin, expansions in rules.items(): + expand1 = origin.startswith('?') + _origin = origin.lstrip('?*') + + for expansion, alias in expansions: + if alias and origin.startswith('_'): + raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin) + + if alias: + alias = alias.lstrip('*') + _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion)) + + try: + f = transformer._get_func(alias or _origin) + except AttributeError: + if alias: + f = self._create_tree_builder_function(alias) + else: + f = self._create_tree_builder_function(_origin) + if expand1: + f = 
create_expand1_tree_builder_function(f) + + alias_handler = create_rule_handler(expansion, f) + + assert not hasattr(callback, _alias) + setattr(callback, _alias, alias_handler) + + new_rules.append(( _origin, expansion, _alias )) + + return new_rules, callback + + diff --git a/lark/parser.py b/lark/parser.py index 6907386..37edf18 100644 --- a/lark/parser.py +++ b/lark/parser.py @@ -34,7 +34,7 @@ class Parser(object): res = self.callbacks[rule]([x[0] for x in s]) - if rule.origin == 'start': + if rule.origin == self.ga.start_symbol and len(stack) == 1: return res _action, new_state = get_action(rule.origin) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py new file mode 100644 index 0000000..5f713de --- /dev/null +++ b/lark/parser_frontends.py @@ -0,0 +1,31 @@ +from .grammar_analysis import GrammarAnalyzer +from common import is_terminal +from . import parser, earley + +class LALR: + def build_parser(self, rules, callback, start): + ga = GrammarAnalyzer(rules, start) + ga.analyze() + return parser.Parser(ga, callback) + +class Earley: + @staticmethod + def _process_expansion(x): + return [{'literal': s} if is_terminal(s) else s for s in x] + + def build_parser(self, rules, callback, start): + rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules] + return EarleyParser(earley.Parser(rules, start)) + +class EarleyParser: + def __init__(self, parser): + self.parser = parser + + def parse(self, text): + res = self.parser.parse(text) + assert len(res) ==1 , 'Ambiguious Parse! Not handled yet' + return res[0] + + +ENGINE_DICT = { 'lalr': LALR, 'earley': Earley } + diff --git a/lark/tests/__init__.py b/lark/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lark/tests/__main__.py b/lark/tests/__main__.py new file mode 100644 index 0000000..de20f7a --- /dev/null +++ b/lark/tests/__main__.py @@ -0,0 +1,14 @@ +from __future__ import absolute_import, print_function + +import unittest +import logging + +from .test_trees import TestTrees +# from .test_selectors import TestSelectors +from .test_parser import TestLalr +# from .test_grammars import TestPythonG, TestConfigG + +logging.basicConfig(level=logging.INFO) + +if __name__ == '__main__': + unittest.main() diff --git a/lark/tests/test_parser.py b/lark/tests/test_parser.py new file mode 100644 index 0000000..e9d6e01 --- /dev/null +++ b/lark/tests/test_parser.py @@ -0,0 +1,326 @@ +from __future__ import absolute_import + +import unittest +import logging +import os +import sys +try: + from cStringIO import StringIO as cStringIO +except ImportError: + # Available only in Python 2.x, 3.x only has io.StringIO from below + cStringIO = None +from io import ( + StringIO as uStringIO, + open, + ) + +logging.basicConfig(level=logging.INFO) + +from lark.lark import Lark +from lark.grammar_analysis import GrammarError +from lark.parser import ParseError + +__path__ = os.path.dirname(__file__) +def _read(n, *args): + with open(os.path.join(__path__, n), *args) as f: + return f.read() + + +class TestLalr(unittest.TestCase): + def test_basic1(self): + g = Lark("""start: a+ b a* "b" a* + b: "b" + a: "a" + """, parser='lalr') + r = g.parse('aaabaab') + self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' ) + r = g.parse('aaabaaba') + self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' ) + + self.assertRaises(ParseError, g.parse, 'aaabaa') + + def test_basic2(self): + # Multiple parsers and colliding tokens + g = Lark("""start: B A + B: "12" + A: 
"1" """) + g2 = Lark("""start: B A + B: "12" + A: "2" """) + x = g.parse('121') + assert x.data == 'start' and x.children == ['12', '1'], x + x = g2.parse('122') + assert x.data == 'start' and x.children == ['12', '2'], x + + def test_basic3(self): + "Tests that Earley and LALR parsers produce equal trees" + g = Lark("""start: "(" name_list ("," "*" NAME)? ")" + name_list: NAME | name_list "," NAME + NAME: /\w+/ """, parser='lalr') + l = g.parse('(a,b,c,*x)') + + g = Lark("""start: "(" name_list ("," "*" NAME)? ")" + name_list: NAME | name_list "," NAME + NAME: /\w+/ """) + l2 = g.parse('(a,b,c,*x)') + assert l == l2, '%s != %s' % (l.pretty(), l2.pretty()) + + @unittest.skipIf(cStringIO is None, "cStringIO not available") + def test_stringio_bytes(self): + """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object""" + Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" ')) + + def test_stringio_unicode(self): + """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object""" + Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" ')) + + def test_unicode(self): + g = Lark(u"""start: UNIA UNIB UNIA + UNIA: /\xa3/ + UNIB: /\u0101/ + """) + g.parse(u'\xa3\u0101\u00a3') + + def test_unicode2(self): + g = Lark(r"""start: UNIA UNIB UNIA UNIC + UNIA: /\xa3/ + UNIB: "a\u0101b\ " + UNIC: /a?\u0101c\n/ + """) + g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n') + + + def test_recurse_expansion(self): + """Verify that stack depth doesn't get exceeded on recursive rules marked for expansion.""" + g = Lark(r"""start: a | start a + a : "a" """) + + # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built + # STree data structures, which uses recursion). + g.parse("a" * (sys.getrecursionlimit() // 4)) + + def test_expand1_lists_with_one_item(self): + g = Lark(r"""start: list + ?list: item+ + item : A + A: "a" + """) + r = g.parse("a") + + # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item' + self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',)) + + # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule + self.assertEqual(len(r.children), 1) + + def test_expand1_lists_with_one_item_2(self): + g = Lark(r"""start: list + ?list: item+ "!" 
+ item : A + A: "a" + """) + r = g.parse("a!") + + # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item' + self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',)) + + # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule + self.assertEqual(len(r.children), 1) + + def test_dont_expand1_lists_with_multiple_items(self): + g = Lark(r"""start: list + ?list: item+ + item : A + A: "a" + """) + r = g.parse("aa") + + # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded + self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',)) + + # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule + self.assertEqual(len(r.children), 1) + + # Sanity check: verify that 'list' contains the two 'item's we've given it + [list] = r.children + self.assertSequenceEqual([item.data for item in list.children], ('item', 'item')) + + def test_dont_expand1_lists_with_multiple_items_2(self): + g = Lark(r"""start: list + ?list: item+ "!" + item : A + A: "a" + """) + r = g.parse("aa!") + + # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded + self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',)) + + # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule + self.assertEqual(len(r.children), 1) + + # Sanity check: verify that 'list' contains the two 'item's we've given it + [list] = r.children + self.assertSequenceEqual([item.data for item in list.children], ('item', 'item')) + + + + def test_empty_expand1_list(self): + g = Lark(r"""start: list + ?list: item* + item : A + A: "a" + """) + r = g.parse("") + + # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded + self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',)) + + # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule + self.assertEqual(len(r.children), 1) + + # Sanity check: verify that 'list' contains no 'item's as we've given it none + [list] = r.children + self.assertSequenceEqual([item.data for item in list.children], ()) + + def test_empty_expand1_list_2(self): + g = Lark(r"""start: list + ?list: item* "!"? + item : A + A: "a" + """) + r = g.parse("") + + # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. 
none) it should *not* have expanded + self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',)) + + # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule + self.assertEqual(len(r.children), 1) + + # Sanity check: verify that 'list' contains no 'item's as we've given it none + [list] = r.children + self.assertSequenceEqual([item.data for item in list.children], ()) + + + def test_empty_flatten_list(self): + g = Lark(r"""start: list + list: | item "," list + item : A + A: "a" + """) + r = g.parse("") + + # Because 'list' is a flatten rule it's top-level element should *never* be expanded + self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',)) + + # Sanity check: verify that 'list' contains no 'item's as we've given it none + [list] = r.children + self.assertSequenceEqual([item.data for item in list.children], ()) + + @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)") + def test_single_item_flatten_list(self): + g = Lark(r"""start: list + list: | item "," list + item : A + A: "a" + """) + r = g.parse("a,") + + # Because 'list' is a flatten rule it's top-level element should *never* be expanded + self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',)) + + # Sanity check: verify that 'list' contains exactly the one 'item' we've given it + [list] = r.children + self.assertSequenceEqual([item.data for item in list.children], ('item',)) + + @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)") + def test_multiple_item_flatten_list(self): + g = Lark(r"""start: list + #list: | item "," list + item : A + A: "a" + """) + r = g.parse("a,a,") + + # Because 'list' is a flatten rule it's top-level element should *never* be expanded + self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',)) + + # Sanity check: verify that 'list' contains exactly the two 'item's we've given it + [list] = r.children + self.assertSequenceEqual([item.data for item in list.children], ('item', 'item')) + + @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)") + def test_recurse_flatten(self): + """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening.""" + g = Lark(r"""start: a | start a + a : A + A : "a" """) + + # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built + # STree data structures, which uses recursion). + g.parse("a" * (sys.getrecursionlimit() // 4)) + + def test_token_collision(self): + g = Lark("""start: "Hello" NAME + NAME: /\w+/ + WS.ignore: /\s+/ + """, parser='lalr') + x = g.parse('Hello World') + self.assertSequenceEqual(x.children, ['World']) + + def test_undefined_rule(self): + self.assertRaises(GrammarError, Lark, """start: a""", parser='lalr') + + def test_undefined_token(self): + self.assertRaises(GrammarError, Lark, """start: A""", parser='lalr') + + def test_rule_collision(self): + g = Lark("""start: "a"+ "b" + | "a"+ """, parser='lalr') + x = g.parse('aaaa') + x = g.parse('aaaab') + + def test_rule_collision2(self): + g = Lark("""start: "a"* "b" + | "a"+ """, parser='lalr') + x = g.parse('aaaa') + x = g.parse('aaaab') + x = g.parse('b') + + def test_regex_embed(self): + g = Lark("""start: A B C + A: /a/ + B: /${A}b/ + C: /${B}c/ + """, parser='lalr') + x = g.parse('aababc') + + def test_token_not_anon(self): + """Tests that "a" is matched as A, rather than an anonymous token. 
+ + That means that "a" is not filtered out, despite being an 'immediate string'. + Whether or not this is the intuitive behavior, I'm not sure yet. + + -Erez + """ + + g = Lark("""start: "a" + A: "a" """, parser='lalr') + x = g.parse('a') + self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous') + self.assertEqual(x.children[0].type, "A") + + def test_maybe(self): + g = Lark("""start: ["a"] """, parser='lalr') + x = g.parse('a') + x = g.parse('') + + def test_start(self): + g = Lark("""a: "a" a? """, parser='lalr', start='a') + x = g.parse('a') + x = g.parse('aa') + x = g.parse('aaa') + +if __name__ == '__main__': + unittest.main() + diff --git a/lark/tests/test_trees.py b/lark/tests/test_trees.py new file mode 100644 index 0000000..6a8866f --- /dev/null +++ b/lark/tests/test_trees.py @@ -0,0 +1,26 @@ +from __future__ import absolute_import + +from unittest import TestCase +import logging +import copy +import pickle + +from lark.tree import Tree + + +class TestTrees(TestCase): + def setUp(self): + self.tree1 = Tree('a', [Tree(x, y) for x, y in zip('bcd', 'xyz')]) + + def test_deepcopy(self): + assert self.tree1 == copy.deepcopy(self.tree1) + + def test_pickle(self): + s = copy.deepcopy(self.tree1) + data = pickle.dumps(s) + assert pickle.loads(data) == s + + +if __name__ == '__main__': + unittest.main() + diff --git a/lark/tree.py b/lark/tree.py index 5f91605..d00ac2f 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -33,6 +33,19 @@ class Tree(object): def __eq__(self, other): return self.data == other.data and self.children == other.children + def find_pred(self, pred): + if pred(self): + yield self + else: + for i, c in enumerate(self.children): + if isinstance(c, Tree): + for t in c.find_pred(pred): + yield t + + def find_data(self, data): + return self.find_pred(lambda t: t.data == data) + + # def find_path(self, pred): # if pred(self): # yield []
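
A usage note for the new Tree.find_pred / find_data helpers added at the end of this patch: load_grammar above walks the meta-parse-tree with tree.find_data('token') and tree.find_data('rule') in just this way. A minimal sketch, assuming lark.tree.Tree is importable at this revision (as the new tests already do); the tree shape below is invented for illustration and is not the loader's real output.

    from lark.tree import Tree

    tree = Tree('start', [
        Tree('rule',  ['add', Tree('expansions', [])]),
        Tree('token', ['NUMBER', '/[0-9]+/']),
        Tree('rule',  ['mul', Tree('expansions', [])]),
    ])

    # find_data yields every subtree whose .data matches; note that find_pred
    # does not descend into a subtree it has already yielded.
    rule_names = [t.children[0] for t in tree.find_data('rule')]
    assert rule_names == ['add', 'mul']

    token_defs = [tuple(t.children) for t in tree.find_data('token')]
    assert token_defs == [('NUMBER', '/[0-9]+/')]
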