# coding=utf-8
"""Round-trip tests for ``lark.reconstruct.Reconstructor``.

Each test parses a code snippet with a grammar, reconstructs source text
from the resulting parse tree, and checks that the reconstruction matches
the original input modulo whitespace (whitespace is insignificant in all
the grammars used here).

Fix note: the ``NL`` terminal regexes previously mixed ``\\r?\\n`` with a
bare ``\s`` in non-raw strings; ``\s`` is an invalid string escape
(DeprecationWarning since Python 3.6, SyntaxWarning since 3.12). They now
consistently use ``\\s``, which produces the identical runtime value.
"""
import json
import sys
import unittest
from itertools import product
from unittest import TestCase

from lark import Lark
from lark.reconstruct import Reconstructor

# Shared grammar tail: common terminals plus inline-whitespace skipping.
common = """
%import common (WS_INLINE, NUMBER, WORD)
%ignore WS_INLINE
"""


def _remove_ws(s):
    """Strip all spaces and newlines so comparisons ignore layout."""
    return s.replace(' ', '').replace('\n', '')


class TestReconstructor(TestCase):

    def assert_reconstruct(self, grammar, code, **options):
        """Parse *code* with *grammar* (LALR), reconstruct text from the
        tree, and assert it equals the input ignoring whitespace."""
        parser = Lark(grammar, parser='lalr', maybe_placeholders=False, **options)
        tree = parser.parse(code)
        new = Reconstructor(parser).reconstruct(tree)
        self.assertEqual(_remove_ws(code), _remove_ws(new))

    def test_starred_rule(self):
        g = """
        start: item*
        item: NL
            | rule
        rule: WORD ":" NUMBER
        NL: /(\\r?\\n)+\\s*/
        """ + common

        code = """
        Elephants: 12
        """

        self.assert_reconstruct(g, code)

    def test_starred_group(self):
        g = """
        start: (rule | NL)*
        rule: WORD ":" NUMBER
        NL: /(\\r?\\n)+\\s*/
        """ + common

        code = """
        Elephants: 12
        """

        self.assert_reconstruct(g, code)

    def test_alias(self):
        g = """
        start: line*
        line: NL
            | rule
            | "hello" -> hi
        rule: WORD ":" NUMBER
        NL: /(\\r?\\n)+\\s*/
        """ + common

        code = """
        Elephants: 12
        hello
        """

        self.assert_reconstruct(g, code)

    def test_keep_tokens(self):
        # ``!op`` keeps the matched operator token in the tree so the
        # reconstructor can emit it again.
        g = """
        start: (NL | stmt)*
        stmt: var op var
        !op: ("+" | "-" | "*" | "/")
        var: WORD
        NL: /(\\r?\\n)+\\s*/
        """ + common

        code = """
        a+b
        """

        self.assert_reconstruct(g, code)

    def test_expand_rule(self):
        # ``?rule`` aliases are inlined when they have a single child;
        # exercise every combination of the optional operators.
        g = """
        ?start: (NL | mult_stmt)*
        ?mult_stmt: sum_stmt ["*" sum_stmt]
        ?sum_stmt: var ["+" var]
        var: WORD
        NL: /(\\r?\\n)+\\s*/
        """ + common

        code = ['a', 'a*b', 'a+b', 'a*b+c', 'a+b*c', 'a+b*c+d']

        for c in code:
            self.assert_reconstruct(g, c)

    def test_json_example(self):
        # Compare as parsed JSON rather than raw text, since the
        # reconstructor is free to change insignificant whitespace.
        test_json = '''
            {
                "empty_object" : {},
                "empty_array"  : [],
                "booleans"     : { "YES" : true, "NO" : false },
                "numbers"      : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
                "strings"      : [ "This", [ "And" , "That", "And a \\"b" ] ],
                "nothing"      : null
            }
        '''

        json_grammar = r"""
            ?start: value

            ?value: object
                  | array
                  | string
                  | SIGNED_NUMBER      -> number
                  | "true"             -> true
                  | "false"            -> false
                  | "null"             -> null

            array  : "[" [value ("," value)*] "]"
            object : "{" [pair ("," pair)*] "}"
            pair   : string ":" value

            string : ESCAPED_STRING

            %import common.ESCAPED_STRING
            %import common.SIGNED_NUMBER
            %import common.WS

            %ignore WS
        """

        json_parser = Lark(json_grammar, parser='lalr', maybe_placeholders=False)
        tree = json_parser.parse(test_json)

        new_json = Reconstructor(json_parser).reconstruct(tree)
        self.assertEqual(json.loads(new_json), json.loads(test_json))

    def test_keep_all_tokens(self):
        # Every subset of the four optional symbols, from "" to "abcd".
        g = """
        start: "a"? _B? c? _d?
        _B: "b"
        c: "c"
        _d: "d"
        """
        examples = list(map(''.join, product(('', 'a'),
                                             ('', 'b'),
                                             ('', 'c'),
                                             ('', 'd'),
                                             )))
        for code in examples:
            self.assert_reconstruct(g, code, keep_all_tokens=True)

    @unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.")
    def test_switch_grammar_unicode_terminal(self):
        """
        This test checks that a parse tree built with a grammar containing only ascii characters can be reconstructed
        with a grammar that has unicode rules (or vice versa). The original bug assigned ANON terminals to unicode
        keywords, which offsets the ANON terminal count in the unicode grammar and causes subsequent identical ANON
        tokens (e.g., `+=`) to mis-match between the two grammars.
        """

        g1 = """
        start: (NL | stmt)*
        stmt: "keyword" var op var
        !op: ("+=" | "-=" | "*=" | "/=")
        var: WORD
        NL: /(\\r?\\n)+\\s*/
        """ + common

        g2 = """
        start: (NL | stmt)*
        stmt: "குறிப்பு" var op var
        !op: ("+=" | "-=" | "*=" | "/=")
        var: WORD
        NL: /(\\r?\\n)+\\s*/
        """ + common

        code = """
        keyword x += y
        """

        l1 = Lark(g1, parser='lalr', maybe_placeholders=False)
        l2 = Lark(g2, parser='lalr', maybe_placeholders=False)
        r = Reconstructor(l2)

        tree = l1.parse(code)
        code2 = r.reconstruct(tree)
        assert l2.parse(code2) == tree


if __name__ == '__main__':
    unittest.main()