
Merge pull request #786 from lark-parser/julienmalard-recons_unicode_terminals

Erez Shinan committed 4 years ago (committed by GitHub)
commit a201d6ff53
4 changed files with 74 additions and 15 deletions

  1. lark/load_grammar.py (+3, -5)
  2. lark/reconstruct.py (+2, -5)
  3. lark/utils.py (+25, -0)
  4. tests/test_reconstructor.py (+44, -5)

lark/load_grammar.py (+3, -5)

@@ -7,7 +7,7 @@ from io import open
 import pkgutil
 from ast import literal_eval

-from .utils import bfs, Py36, logger, classify_bool
+from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start
 from .lexer import Token, TerminalDef, PatternStr, PatternRE

 from .parse_tree_builder import ParseTreeBuilder
@@ -332,10 +332,8 @@ class PrepareAnonTerminals(Transformer_InPlace):
             try:
                 term_name = _TERMINAL_NAMES[value]
             except KeyError:
-                if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
-                    with suppress(UnicodeEncodeError):
-                        value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names
-                        term_name = value.upper()
+                if is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set:
+                    term_name = value.upper()

             if term_name in self.term_set:
                 term_name = None
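For context on the hunk above: the old branch only promoted a keyword to a named terminal if it survived an ASCII round-trip, so non-ASCII keywords silently fell through and became anonymous (__ANON_n) terminals. A minimal standalone sketch of the two guards (simplified; the real code also consults _TERMINAL_NAMES and self.term_set, and the category tuples below mirror the ones added to lark.utils in this commit):

import unicodedata

def old_guard(value):
    # Pre-patch behaviour: .isalnum() already fails for words containing combining
    # vowel signs, and the ASCII probe rejects any remaining non-ASCII keyword.
    if value.isalnum() and value[0].isalpha():
        try:
            value.upper().encode('ascii')
            return value.upper()
        except UnicodeEncodeError:
            pass
    return None

_ID_START = ('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc')
_ID_CONTINUE = _ID_START + ('Nd', 'Nl')

def new_guard(value):
    # Post-patch behaviour: accept anything that looks like a Unicode identifier.
    ok = all(c == '_' or unicodedata.category(c) in _ID_CONTINUE for c in value)
    if ok and (value[0] == '_' or unicodedata.category(value[0]) in _ID_START):
        return value.upper()
    return None

print(old_guard('keyword'))    # 'KEYWORD'  -> named terminal either way
print(old_guard('குறிப்பு'))   # None       -> used to fall back to an anonymous terminal
print(new_guard('குறிப்பு'))   # 'குறிப்பு' -> now gets a named terminal (upper() is a no-op for Tamil)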


lark/reconstruct.py (+2, -5)

@@ -8,6 +8,7 @@ from .lexer import Token, PatternStr
 from .grammar import Terminal, NonTerminal

 from .tree_matcher import TreeMatcher, is_discarded_terminal
+from .utils import is_id_continue

 def is_iter_empty(i):
     try:
@@ -56,10 +57,6 @@ class WriteTokensTransformer(Transformer_InPlace):
         return to_write


-def _isalnum(x):
-    # Categories defined here: https://www.python.org/dev/peps/pep-3131/
-    return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']
-
 class Reconstructor(TreeMatcher):
     """
     A Reconstructor that will, given a full parse Tree, generate source code.
@@ -97,7 +94,7 @@ class Reconstructor(TreeMatcher):
         y = []
         prev_item = ''
         for item in x:
-            if prev_item and item and _isalnum(prev_item[-1]) and _isalnum(item[0]):
+            if prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]):
                 y.append(' ')
             y.append(item)
             prev_item = item
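This hunk only swaps the module-local _isalnum for the shared is_id_continue helper; the spacing rule itself is unchanged: when gluing reconstructed tokens back together, a space is inserted wherever two adjacent tokens would otherwise fuse into one identifier. A standalone sketch of that loop (is_id_continue is imported from lark.utils, assuming a lark version that includes this commit):

from lark.utils import is_id_continue

def join_tokens(tokens):
    # Insert a space only where the previous token ends and the next token starts
    # with an identifier character, e.g. between 'keyword' and 'x' but not
    # between 'x' and '+='.
    out, prev = [], ''
    for item in tokens:
        if prev and item and is_id_continue(prev[-1]) and is_id_continue(item[0]):
            out.append(' ')
        out.append(item)
        prev = item
    return ''.join(out)

print(join_tokens(['keyword', 'x', '+=', 'y']))    # -> 'keyword x+=y'
print(join_tokens(['குறிப்பு', 'x', '+=', 'y']))   # -> 'குறிப்பு x+=y'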


lark/utils.py (+25, -0)

@@ -1,3 +1,4 @@
+import unicodedata
 import os
 from functools import reduce
 from collections import deque
@@ -15,6 +16,7 @@ Py36 = (sys.version_info[:2] >= (3, 6))

 NO_VALUE = object()

+
 def classify(seq, key=None, value=None):
     d = {}
     for item in seq:
@@ -169,6 +171,29 @@ def get_regexp_width(expr):
 ###}


+_ID_START = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'
+_ID_CONTINUE = _ID_START + ('Nd', 'Nl',)
+
+def _test_unicode_category(s, categories):
+    if len(s) != 1:
+        return all(_test_unicode_category(char, categories) for char in s)
+    return s == '_' or unicodedata.category(s) in categories
+
+def is_id_continue(s):
+    """
+    Checks if all characters in `s` are alphanumeric characters (Unicode standard, so diacritics, indian vowels, non-latin
+    numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. See PEP 3131 for details.
+    """
+    return _test_unicode_category(s, _ID_CONTINUE)
+
+def is_id_start(s):
+    """
+    Checks if all characters in `s` are alphabetic characters (Unicode standard, so diacritics, indian vowels, non-latin
+    numbers, etc. all pass). Synonymous with a Python `ID_START` identifier. See PEP 3131 for details.
+    """
+    return _test_unicode_category(s, _ID_START)
+
+
 def dedup_list(l):
     """Given a list (l) will removing duplicates from the list,
     preserving the original order of the list. Assumes that
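For reference, a quick check of what the new helpers accept (a hedged sketch; the imports assume a lark version that ships this commit):

from lark.utils import is_id_start, is_id_continue

print(is_id_start('foo'))        # True  - plain letters (Ll)
print(is_id_start('குறிப்பு'))   # True  - Lo letters plus Mn/Mc vowel signs are in _ID_START
print(is_id_start('2x'))         # False - digits (Nd) are only in _ID_CONTINUE
print(is_id_continue('x2_'))     # True  - letters, digits, and the special-cased '_'
print(is_id_continue('+='))      # False - operator characters are in no accepted category

Note that both helpers test every character of the string, not just the first, which is why load_grammar.py above calls is_id_start(value[0]) on a single character.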


tests/test_reconstructor.py (+44, -5)

@@ -1,17 +1,22 @@
+# coding=utf-8
+
 import json
+import sys
 import unittest
 from unittest import TestCase

 from lark import Lark
 from lark.reconstruct import Reconstructor

+
 common = """
 %import common (WS_INLINE, NUMBER, WORD)
 %ignore WS_INLINE
 """

+
 def _remove_ws(s):
-    return s.replace(' ', '').replace('\n','')
+    return s.replace(' ', '').replace('\n', '')


 class TestReconstructor(TestCase):

@@ -22,7 +27,6 @@ class TestReconstructor(TestCase):
         self.assertEqual(_remove_ws(code), _remove_ws(new))

     def test_starred_rule(self):
-
         g = """
         start: item*
         item: NL
@@ -38,7 +42,6 @@ class TestReconstructor(TestCase):
         self.assert_reconstruct(g, code)

     def test_starred_group(self):
-
         g = """
         start: (rule | NL)*
         rule: WORD ":" NUMBER
@@ -52,7 +55,6 @@ class TestReconstructor(TestCase):
         self.assert_reconstruct(g, code)

     def test_alias(self):
-
         g = """
         start: line*
         line: NL
@@ -140,6 +142,43 @@ class TestReconstructor(TestCase):
         new_json = Reconstructor(json_parser).reconstruct(tree)
         self.assertEqual(json.loads(new_json), json.loads(test_json))

+    @unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.")
+    def test_switch_grammar_unicode_terminal(self):
+        """
+        This test checks that a parse tree built with a grammar containing only ascii characters can be reconstructed
+        with a grammar that has unicode rules (or vice versa). The original bug assigned ANON terminals to unicode
+        keywords, which offsets the ANON terminal count in the unicode grammar and causes subsequent identical ANON
+        tokens (e.g., `+=`) to mis-match between the two grammars.
+        """
+
+        g1 = """
+        start: (NL | stmt)*
+        stmt: "keyword" var op var
+        !op: ("+=" | "-=" | "*=" | "/=")
+        var: WORD
+        NL: /(\\r?\\n)+\s*/
+        """ + common
+
+        g2 = """
+        start: (NL | stmt)*
+        stmt: "குறிப்பு" var op var
+        !op: ("+=" | "-=" | "*=" | "/=")
+        var: WORD
+        NL: /(\\r?\\n)+\s*/
+        """ + common
+
+        code = """
+        keyword x += y
+        """
+
+        l1 = Lark(g1, parser='lalr')
+        l2 = Lark(g2, parser='lalr')
+        r = Reconstructor(l2)
+
+        tree = l1.parse(code)
+        code2 = r.reconstruct(tree)
+        assert l2.parse(code2) == tree
+

 if __name__ == '__main__':
     unittest.main()
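To make the test's docstring concrete, a hedged illustration (hypothetical grammars, not part of the test; it assumes Lark's .terminals attribute, which lists the compiled terminal definitions):

from lark import Lark

ASCII_GRAMMAR = r"""
start: "keyword" NAME "+=" NAME
NAME: /\w+/
%import common.WS
%ignore WS
"""
TAMIL_GRAMMAR = ASCII_GRAMMAR.replace('"keyword"', '"குறிப்பு"')

for grammar in (ASCII_GRAMMAR, TAMIL_GRAMMAR):
    parser = Lark(grammar, parser='lalr')
    print(sorted(t.name for t in parser.terminals))

# With this change both keywords become named terminals ("keyword" -> KEYWORD,
# "குறிப்பு" -> its uppercased text), so the "+=" literal maps to the same
# anonymous terminal name in both grammars. Before the change the Tamil keyword
# itself consumed an __ANON_ slot, shifting "+=" to a different name and breaking
# cross-grammar reconstruction.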
