Pārlūkot izejas kodu

Merge branch 'recons_unicode_terminals' of https://github.com/julienmalard/lark into julienmalard-recons_unicode_terminals

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.2
Erez Sh pirms 3 gadiem
vecāks
revīzija
423cde3da3
5 mainītis faili ar 74 papildinājumiem un 13 dzēšanām
  1. +2
    -2
      lark/load_grammar.py
  2. +2
    -5
      lark/reconstruct.py
  3. +25
    -0
      lark/utils.py
  4. +1
    -1
      tests/test_nearley/nearley
  5. +44
    -5
      tests/test_reconstructor.py

+ 2
- 2
lark/load_grammar.py Parādīt failu

@@ -7,7 +7,7 @@ from io import open
import pkgutil
from ast import literal_eval

from .utils import bfs, Py36, logger, classify_bool
from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start
from .lexer import Token, TerminalDef, PatternStr, PatternRE

from .parse_tree_builder import ParseTreeBuilder
@@ -332,7 +332,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
try:
term_name = _TERMINAL_NAMES[value]
except KeyError:
if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
if is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set:
with suppress(UnicodeEncodeError):
value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names
term_name = value.upper()


+ 2
- 5
lark/reconstruct.py Parādīt failu

@@ -8,6 +8,7 @@ from .lexer import Token, PatternStr
from .grammar import Terminal, NonTerminal

from .tree_matcher import TreeMatcher, is_discarded_terminal
from .utils import is_id_continue

def is_iter_empty(i):
try:
@@ -56,10 +57,6 @@ class WriteTokensTransformer(Transformer_InPlace):
return to_write


def _isalnum(x):
# Categories defined here: https://www.python.org/dev/peps/pep-3131/
return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']

class Reconstructor(TreeMatcher):
"""
A Reconstructor that will, given a full parse Tree, generate source code.
@@ -97,7 +94,7 @@ class Reconstructor(TreeMatcher):
y = []
prev_item = ''
for item in x:
if prev_item and item and _isalnum(prev_item[-1]) and _isalnum(item[0]):
if prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]):
y.append(' ')
y.append(item)
prev_item = item


+ 25
- 0
lark/utils.py Parādīt failu

@@ -1,3 +1,4 @@
import unicodedata
import os
from functools import reduce
from collections import deque
@@ -15,6 +16,7 @@ Py36 = (sys.version_info[:2] >= (3, 6))

NO_VALUE = object()


def classify(seq, key=None, value=None):
d = {}
for item in seq:
@@ -169,6 +171,29 @@ def get_regexp_width(expr):
###}


_ID_START = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'
_ID_CONTINUE = _ID_START + ('Nd', 'Nl',)

def _test_unicode_category(s, categories):
if len(s) != 1:
return all(_test_unicode_category(char, categories) for char in s)
return s == '_' or unicodedata.category(s) in categories

def is_id_continue(s):
"""
Checks if all characters in `s` are alphanumeric characters (Unicode standard, so diacritics, indian vowels, non-latin
numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. See PEP 3131 for details.
"""
return _test_unicode_category(s, _ID_CONTINUE)

def is_id_start(s):
"""
Checks if all characters in `s` are alphabetic characters (Unicode standard, so diacritics, indian vowels, non-latin
numbers, etc. all pass). Synonymous with a Python `ID_START` identifier. See PEP 3131 for details.
"""
return _test_unicode_category(s, _ID_START)


def dedup_list(l):
"""Given a list (l) will removing duplicates from the list,
preserving the original order of the list. Assumes that


+ 1
- 1
tests/test_nearley/nearley

@@ -1 +1 @@
Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44
Subproject commit cf8925f729bde741a3076c5856c0c0862bc7f5de

+ 44
- 5
tests/test_reconstructor.py Parādīt failu

@@ -1,17 +1,22 @@
# coding=utf-8

import json
import sys
import unittest
from unittest import TestCase

from lark import Lark
from lark.reconstruct import Reconstructor


common = """
%import common (WS_INLINE, NUMBER, WORD)
%ignore WS_INLINE
"""


def _remove_ws(s):
return s.replace(' ', '').replace('\n','')
return s.replace(' ', '').replace('\n', '')


class TestReconstructor(TestCase):

@@ -22,7 +27,6 @@ class TestReconstructor(TestCase):
self.assertEqual(_remove_ws(code), _remove_ws(new))

def test_starred_rule(self):

g = """
start: item*
item: NL
@@ -38,7 +42,6 @@ class TestReconstructor(TestCase):
self.assert_reconstruct(g, code)

def test_starred_group(self):

g = """
start: (rule | NL)*
rule: WORD ":" NUMBER
@@ -52,7 +55,6 @@ class TestReconstructor(TestCase):
self.assert_reconstruct(g, code)

def test_alias(self):

g = """
start: line*
line: NL
@@ -140,6 +142,43 @@ class TestReconstructor(TestCase):
new_json = Reconstructor(json_parser).reconstruct(tree)
self.assertEqual(json.loads(new_json), json.loads(test_json))

@unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.")
def test_switch_grammar_unicode_terminal(self):
    """
    This test checks that a parse tree built with a grammar containing only ascii characters can be reconstructed
    with a grammar that has unicode rules (or vice versa). The original bug assigned ANON terminals to unicode
    keywords, which offsets the ANON terminal count in the unicode grammar and causes subsequent identical ANON
    tokens (e.g., `+=`) to mis-match between the two grammars.
    """

    # ASCII-keyword grammar: used to parse the input.
    # `\\s` (rather than `\s`) keeps the same runtime regex while avoiding
    # Python's invalid-escape-sequence DeprecationWarning.
    g1 = """
    start: (NL | stmt)*
    stmt: "keyword" var op var
    !op: ("+=" | "-=" | "*=" | "/=")
    var: WORD
    NL: /(\\r?\\n)+\\s*/
    """ + common

    # Identical grammar except the keyword is a unicode (Tamil) string:
    # used to reconstruct and re-parse.
    g2 = """
    start: (NL | stmt)*
    stmt: "குறிப்பு" var op var
    !op: ("+=" | "-=" | "*=" | "/=")
    var: WORD
    NL: /(\\r?\\n)+\\s*/
    """ + common

    code = """
    keyword x += y
    """

    l1 = Lark(g1, parser='lalr')
    l2 = Lark(g2, parser='lalr')
    r = Reconstructor(l2)

    tree = l1.parse(code)
    code2 = r.reconstruct(tree)
    # assertEqual (not a bare assert) so a mismatch reports both trees.
    self.assertEqual(l2.parse(code2), tree)


if __name__ == '__main__':
unittest.main()

Notiek ielāde…
Atcelt
Saglabāt