Browse Source

Merge pull request #786 from lark-parser/julienmalard-recons_unicode_terminals

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.2
Erez Shinan 4 years ago
committed by GitHub
parent
commit
a201d6ff53
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 74 additions and 15 deletions
  1. +3
    -5
      lark/load_grammar.py
  2. +2
    -5
      lark/reconstruct.py
  3. +25
    -0
      lark/utils.py
  4. +44
    -5
      tests/test_reconstructor.py

+ 3
- 5
lark/load_grammar.py View File

@@ -7,7 +7,7 @@ from io import open
import pkgutil import pkgutil
from ast import literal_eval from ast import literal_eval


from .utils import bfs, Py36, logger, classify_bool
from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start
from .lexer import Token, TerminalDef, PatternStr, PatternRE from .lexer import Token, TerminalDef, PatternStr, PatternRE


from .parse_tree_builder import ParseTreeBuilder from .parse_tree_builder import ParseTreeBuilder
@@ -332,10 +332,8 @@ class PrepareAnonTerminals(Transformer_InPlace):
        try:
            term_name = _TERMINAL_NAMES[value]
        except KeyError:
-            if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
-                with suppress(UnicodeEncodeError):
-                    value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names
-                    term_name = value.upper()
+            if is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set:
+                term_name = value.upper()


if term_name in self.term_set: if term_name in self.term_set:
term_name = None term_name = None


+ 2
- 5
lark/reconstruct.py View File

@@ -8,6 +8,7 @@ from .lexer import Token, PatternStr
from .grammar import Terminal, NonTerminal from .grammar import Terminal, NonTerminal


from .tree_matcher import TreeMatcher, is_discarded_terminal from .tree_matcher import TreeMatcher, is_discarded_terminal
from .utils import is_id_continue


def is_iter_empty(i): def is_iter_empty(i):
try: try:
@@ -56,10 +57,6 @@ class WriteTokensTransformer(Transformer_InPlace):
return to_write return to_write




def _isalnum(x):
# Categories defined here: https://www.python.org/dev/peps/pep-3131/
return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']

class Reconstructor(TreeMatcher): class Reconstructor(TreeMatcher):
""" """
A Reconstructor that will, given a full parse Tree, generate source code. A Reconstructor that will, given a full parse Tree, generate source code.
@@ -97,7 +94,7 @@ class Reconstructor(TreeMatcher):
y = [] y = []
prev_item = '' prev_item = ''
for item in x: for item in x:
if prev_item and item and _isalnum(prev_item[-1]) and _isalnum(item[0]):
if prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]):
y.append(' ') y.append(' ')
y.append(item) y.append(item)
prev_item = item prev_item = item


+ 25
- 0
lark/utils.py View File

@@ -1,3 +1,4 @@
import unicodedata
import os import os
from functools import reduce from functools import reduce
from collections import deque from collections import deque
@@ -15,6 +16,7 @@ Py36 = (sys.version_info[:2] >= (3, 6))


NO_VALUE = object() NO_VALUE = object()



def classify(seq, key=None, value=None): def classify(seq, key=None, value=None):
d = {} d = {}
for item in seq: for item in seq:
@@ -169,6 +171,29 @@ def get_regexp_width(expr):
###} ###}




_ID_START = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'
_ID_CONTINUE = _ID_START + ('Nd', 'Nl',)

def _test_unicode_category(s, categories):
if len(s) != 1:
return all(_test_unicode_category(char, categories) for char in s)
return s == '_' or unicodedata.category(s) in categories

def is_id_continue(s):
    """Check that every character of ``s`` may appear *inside* an identifier.

    Unicode-aware, so diacritics, Indic vowel signs, non-latin digits, etc. all
    qualify — roughly Python's ``ID_CONTINUE`` rule. See PEP 3131 for details.
    """
    return _test_unicode_category(s, _ID_CONTINUE)

def is_id_start(s):
    """
    Checks if all characters in `s` are alphabetic characters (Unicode standard, so diacritics, indian vowels,
    etc. all pass). Unlike `is_id_continue`, digit categories (Nd, Nl) do NOT pass. Roughly a Python `ID_START`
    identifier (this set additionally allows marks and connector punctuation). See PEP 3131 for details.
    """
    return _test_unicode_category(s, _ID_START)


def dedup_list(l): def dedup_list(l):
"""Given a list (l) will removing duplicates from the list, """Given a list (l) will removing duplicates from the list,
preserving the original order of the list. Assumes that preserving the original order of the list. Assumes that


+ 44
- 5
tests/test_reconstructor.py View File

@@ -1,17 +1,22 @@
# coding=utf-8

import json import json
import sys
import unittest import unittest
from unittest import TestCase from unittest import TestCase

from lark import Lark from lark import Lark
from lark.reconstruct import Reconstructor from lark.reconstruct import Reconstructor



common = """ common = """
%import common (WS_INLINE, NUMBER, WORD) %import common (WS_INLINE, NUMBER, WORD)
%ignore WS_INLINE %ignore WS_INLINE
""" """



def _remove_ws(s): def _remove_ws(s):
return s.replace(' ', '').replace('\n','')
return s.replace(' ', '').replace('\n', '')



class TestReconstructor(TestCase): class TestReconstructor(TestCase):


@@ -22,7 +27,6 @@ class TestReconstructor(TestCase):
self.assertEqual(_remove_ws(code), _remove_ws(new)) self.assertEqual(_remove_ws(code), _remove_ws(new))


def test_starred_rule(self): def test_starred_rule(self):

g = """ g = """
start: item* start: item*
item: NL item: NL
@@ -38,7 +42,6 @@ class TestReconstructor(TestCase):
self.assert_reconstruct(g, code) self.assert_reconstruct(g, code)


def test_starred_group(self): def test_starred_group(self):

g = """ g = """
start: (rule | NL)* start: (rule | NL)*
rule: WORD ":" NUMBER rule: WORD ":" NUMBER
@@ -52,7 +55,6 @@ class TestReconstructor(TestCase):
self.assert_reconstruct(g, code) self.assert_reconstruct(g, code)


def test_alias(self): def test_alias(self):

g = """ g = """
start: line* start: line*
line: NL line: NL
@@ -140,6 +142,43 @@ class TestReconstructor(TestCase):
new_json = Reconstructor(json_parser).reconstruct(tree) new_json = Reconstructor(json_parser).reconstruct(tree)
self.assertEqual(json.loads(new_json), json.loads(test_json)) self.assertEqual(json.loads(new_json), json.loads(test_json))


@unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.")
def test_switch_grammar_unicode_terminal(self):
    """
    This test checks that a parse tree built with a grammar containing only ascii characters can be reconstructed
    with a grammar that has unicode rules (or vice versa). The original bug assigned ANON terminals to unicode
    keywords, which offsets the ANON terminal count in the unicode grammar and causes subsequent identical ANON
    tokens (e.g., `+=`) to mis-match between the two grammars.
    """

    # NOTE: `\\s` (not `\s`) so the literal is a valid escape sequence in a
    # non-raw string; the text handed to lark is unchanged (`\s`).
    g1 = """
start: (NL | stmt)*
stmt: "keyword" var op var
!op: ("+=" | "-=" | "*=" | "/=")
var: WORD
NL: /(\\r?\\n)+\\s*/
""" + common

    # Identical grammar except the keyword is a non-ascii (Tamil) word.
    g2 = """
start: (NL | stmt)*
stmt: "குறிப்பு" var op var
!op: ("+=" | "-=" | "*=" | "/=")
var: WORD
NL: /(\\r?\\n)+\\s*/
""" + common

    code = """
keyword x += y
"""

    l1 = Lark(g1, parser='lalr')
    l2 = Lark(g2, parser='lalr')
    r = Reconstructor(l2)

    # Parse with the ascii grammar, reconstruct with the unicode one; the
    # round-trip must yield the same tree.
    tree = l1.parse(code)
    code2 = r.reconstruct(tree)
    assert l2.parse(code2) == tree



if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

Loading…
Cancel
Save