
Merge pull request #786 from lark-parser/julienmalard-recons_unicode_terminals

Erez Shinan committed 4 years ago (committed by GitHub)
commit a201d6ff53
4 changed files with 74 additions and 15 deletions

  1. lark/load_grammar.py (+3, -5)
  2. lark/reconstruct.py (+2, -5)
  3. lark/utils.py (+25, -0)
  4. tests/test_reconstructor.py (+44, -5)

lark/load_grammar.py (+3, -5)

@@ -7,7 +7,7 @@ from io import open
 import pkgutil
 from ast import literal_eval

-from .utils import bfs, Py36, logger, classify_bool
+from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start
 from .lexer import Token, TerminalDef, PatternStr, PatternRE

 from .parse_tree_builder import ParseTreeBuilder
@@ -332,10 +332,8 @@ class PrepareAnonTerminals(Transformer_InPlace):
             try:
                 term_name = _TERMINAL_NAMES[value]
             except KeyError:
-                if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
-                    with suppress(UnicodeEncodeError):
-                        value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names
-                        term_name = value.upper()
+                if is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set:
+                    term_name = value.upper()

             if term_name in self.term_set:
                 term_name = None
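For context on the hunk above: the old branch only promoted a keyword to a named terminal if it survived an ASCII round-trip, so non-ASCII keywords silently fell through and became anonymous (__ANON_n) terminals. A minimal standalone sketch of the two guards (simplified; the real code also consults _TERMINAL_NAMES and self.term_set, and the category tuples below mirror the ones added to lark.utils in this commit):

import unicodedata

def old_guard(value):
    # Pre-patch behaviour: .isalnum() already fails for words containing combining
    # vowel signs, and the ASCII probe rejects any remaining non-ASCII keyword.
    if value.isalnum() and value[0].isalpha():
        try:
            value.upper().encode('ascii')
            return value.upper()
        except UnicodeEncodeError:
            pass
    return None

_ID_START = ('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc')
_ID_CONTINUE = _ID_START + ('Nd', 'Nl')

def new_guard(value):
    # Post-patch behaviour: accept anything that looks like a Unicode identifier.
    ok = all(c == '_' or unicodedata.category(c) in _ID_CONTINUE for c in value)
    if ok and (value[0] == '_' or unicodedata.category(value[0]) in _ID_START):
        return value.upper()
    return None

print(old_guard('keyword'))    # 'KEYWORD'  -> named terminal either way
print(old_guard('குறிப்பு'))   # None       -> used to fall back to an anonymous terminal
print(new_guard('குறிப்பு'))   # 'குறிப்பு' -> now gets a named terminal (upper() is a no-op for Tamil)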


lark/reconstruct.py (+2, -5)

@@ -8,6 +8,7 @@ from .lexer import Token, PatternStr
 from .grammar import Terminal, NonTerminal

 from .tree_matcher import TreeMatcher, is_discarded_terminal
+from .utils import is_id_continue

 def is_iter_empty(i):
     try:
@@ -56,10 +57,6 @@ class WriteTokensTransformer(Transformer_InPlace):
         return to_write


-def _isalnum(x):
-    # Categories defined here: https://www.python.org/dev/peps/pep-3131/
-    return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']
-
 class Reconstructor(TreeMatcher):
     """
     A Reconstructor that will, given a full parse Tree, generate source code.
@@ -97,7 +94,7 @@ class Reconstructor(TreeMatcher):
         y = []
         prev_item = ''
         for item in x:
-            if prev_item and item and _isalnum(prev_item[-1]) and _isalnum(item[0]):
+            if prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]):
                 y.append(' ')
             y.append(item)
             prev_item = item
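This hunk only swaps the module-local _isalnum for the shared is_id_continue helper; the spacing rule itself is unchanged: when gluing reconstructed tokens back together, a space is inserted wherever two adjacent tokens would otherwise fuse into one identifier. A standalone sketch of that loop (is_id_continue is imported from lark.utils, assuming a lark version that includes this commit):

from lark.utils import is_id_continue

def join_tokens(tokens):
    # Insert a space only where the previous token ends and the next token starts
    # with an identifier character, e.g. between 'keyword' and 'x' but not
    # between 'x' and '+='.
    out, prev = [], ''
    for item in tokens:
        if prev and item and is_id_continue(prev[-1]) and is_id_continue(item[0]):
            out.append(' ')
        out.append(item)
        prev = item
    return ''.join(out)

print(join_tokens(['keyword', 'x', '+=', 'y']))    # -> 'keyword x+=y'
print(join_tokens(['குறிப்பு', 'x', '+=', 'y']))   # -> 'குறிப்பு x+=y'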


lark/utils.py (+25, -0)

@@ -1,3 +1,4 @@
+import unicodedata
 import os
 from functools import reduce
 from collections import deque
@@ -15,6 +16,7 @@ Py36 = (sys.version_info[:2] >= (3, 6))

 NO_VALUE = object()

+
 def classify(seq, key=None, value=None):
     d = {}
     for item in seq:
@@ -169,6 +171,29 @@ def get_regexp_width(expr):
 ###}


+_ID_START = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'
+_ID_CONTINUE = _ID_START + ('Nd', 'Nl',)
+
+def _test_unicode_category(s, categories):
+    if len(s) != 1:
+        return all(_test_unicode_category(char, categories) for char in s)
+    return s == '_' or unicodedata.category(s) in categories
+
+def is_id_continue(s):
+    """
+    Checks if all characters in `s` are alphanumeric characters (Unicode standard, so diacritics, indian vowels, non-latin
+    numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. See PEP 3131 for details.
+    """
+    return _test_unicode_category(s, _ID_CONTINUE)
+
+def is_id_start(s):
+    """
+    Checks if all characters in `s` are alphabetic characters (Unicode standard, so diacritics, indian vowels, non-latin
+    numbers, etc. all pass). Synonymous with a Python `ID_START` identifier. See PEP 3131 for details.
+    """
+    return _test_unicode_category(s, _ID_START)
+
+
 def dedup_list(l):
     """Given a list (l) will removing duplicates from the list,
     preserving the original order of the list. Assumes that
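For reference, a quick check of what the new helpers accept (a hedged sketch; the imports assume a lark version that ships this commit):

from lark.utils import is_id_start, is_id_continue

print(is_id_start('foo'))        # True  - plain letters (Ll)
print(is_id_start('குறிப்பு'))   # True  - Lo letters plus Mn/Mc vowel signs are in _ID_START
print(is_id_start('2x'))         # False - digits (Nd) are only in _ID_CONTINUE
print(is_id_continue('x2_'))     # True  - letters, digits, and the special-cased '_'
print(is_id_continue('+='))      # False - operator characters are in no accepted category

Note that both helpers test every character of the string, not just the first, which is why load_grammar.py above calls is_id_start(value[0]) on a single character.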


tests/test_reconstructor.py (+44, -5)

@@ -1,17 +1,22 @@
+# coding=utf-8
+
 import json
+import sys
 import unittest
 from unittest import TestCase

 from lark import Lark
 from lark.reconstruct import Reconstructor

+
 common = """
 %import common (WS_INLINE, NUMBER, WORD)
 %ignore WS_INLINE
 """

+
 def _remove_ws(s):
-    return s.replace(' ', '').replace('\n','')
+    return s.replace(' ', '').replace('\n', '')


 class TestReconstructor(TestCase):

@@ -22,7 +27,6 @@ class TestReconstructor(TestCase):
         self.assertEqual(_remove_ws(code), _remove_ws(new))

     def test_starred_rule(self):
-
         g = """
         start: item*
         item: NL
@@ -38,7 +42,6 @@ class TestReconstructor(TestCase):
         self.assert_reconstruct(g, code)

     def test_starred_group(self):
-
         g = """
         start: (rule | NL)*
         rule: WORD ":" NUMBER
@@ -52,7 +55,6 @@ class TestReconstructor(TestCase):
         self.assert_reconstruct(g, code)

     def test_alias(self):
-
         g = """
         start: line*
         line: NL
@@ -140,6 +142,43 @@ class TestReconstructor(TestCase):
         new_json = Reconstructor(json_parser).reconstruct(tree)
         self.assertEqual(json.loads(new_json), json.loads(test_json))

+    @unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.")
+    def test_switch_grammar_unicode_terminal(self):
+        """
+        This test checks that a parse tree built with a grammar containing only ascii characters can be reconstructed
+        with a grammar that has unicode rules (or vice versa). The original bug assigned ANON terminals to unicode
+        keywords, which offsets the ANON terminal count in the unicode grammar and causes subsequent identical ANON
+        tokens (e.g., `+=`) to mis-match between the two grammars.
+        """
+
+        g1 = """
+        start: (NL | stmt)*
+        stmt: "keyword" var op var
+        !op: ("+=" | "-=" | "*=" | "/=")
+        var: WORD
+        NL: /(\\r?\\n)+\s*/
+        """ + common
+
+        g2 = """
+        start: (NL | stmt)*
+        stmt: "குறிப்பு" var op var
+        !op: ("+=" | "-=" | "*=" | "/=")
+        var: WORD
+        NL: /(\\r?\\n)+\s*/
+        """ + common
+
+        code = """
+        keyword x += y
+        """
+
+        l1 = Lark(g1, parser='lalr')
+        l2 = Lark(g2, parser='lalr')
+        r = Reconstructor(l2)
+
+        tree = l1.parse(code)
+        code2 = r.reconstruct(tree)
+        assert l2.parse(code2) == tree
+

 if __name__ == '__main__':
     unittest.main()
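To make the test's docstring concrete, a hedged illustration (hypothetical grammars, not part of the test; it assumes Lark's .terminals attribute, which lists the compiled terminal definitions):

from lark import Lark

ASCII_GRAMMAR = r"""
start: "keyword" NAME "+=" NAME
NAME: /\w+/
%import common.WS
%ignore WS
"""
TAMIL_GRAMMAR = ASCII_GRAMMAR.replace('"keyword"', '"குறிப்பு"')

for grammar in (ASCII_GRAMMAR, TAMIL_GRAMMAR):
    parser = Lark(grammar, parser='lalr')
    print(sorted(t.name for t in parser.terminals))

# With this change both keywords become named terminals ("keyword" -> KEYWORD,
# "குறிப்பு" -> its uppercased text), so the "+=" literal maps to the same
# anonymous terminal name in both grammars. Before the change the Tamil keyword
# itself consumed an __ANON_ slot, shifting "+=" to a different name and breaking
# cross-grammar reconstruction.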
