From b01c283d47d7194959d057bbaf18a25640acc92a Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Tue, 10 Nov 2020 10:33:53 -0500
Subject: [PATCH] Failing test

---
 tests/test_reconstructor.py | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py
index 93c64fe..6196f4a 100644
--- a/tests/test_reconstructor.py
+++ b/tests/test_reconstructor.py
@@ -140,6 +140,43 @@ class TestReconstructor(TestCase):
         new_json = Reconstructor(json_parser).reconstruct(tree)
         self.assertEqual(json.loads(new_json), json.loads(test_json))
 
+    def test_switch_grammar_unicode_terminal(self):
+        """
+        This test checks that a parse tree built with a grammar containing only ascii characters can be reconstructed
+        with a grammar that has unicode rules (or vice versa). The original bug assigned ANON terminals to unicode
+        keywords, which offsets the ANON terminal count in the unicode grammar and causes subsequent identical ANON
+        tokens (e.g., `+=`) to mis-match between the two grammars.
+        """
+
+        g1 = """
+        start: (NL | stmt)*
+        stmt: "keyword" var op var
+        !op: ("+=" | "-=" | "*=" | "/=")
+        var: WORD
+        NL: /(\\r?\\n)+\\s*/
+        """ + common
+
+        g2 = """
+        start: (NL | stmt)*
+        stmt: "குறிப்பு" var op var
+        !op: ("+=" | "-=" | "*=" | "/=")
+        var: WORD
+        NL: /(\\r?\\n)+\\s*/
+        """ + common
+
+        code = """
+        keyword x += y
+        """
+
+        l1 = Lark(g1, parser='lalr')  # grammar with the ASCII keyword
+        l2 = Lark(g2, parser='lalr')  # same grammar, non-ASCII keyword
+        r = Reconstructor(l2)  # reconstruct using the non-ASCII grammar
+
+        tree = l1.parse(code)  # tree produced by the ASCII grammar
+        code2 = r.reconstruct(tree)
+        self.assertEqual(l2.parse(code2), tree)
+
+
 
 if __name__ == '__main__':
     unittest.main()