Pārlūkot izejas kodu

Merge branch 'recons_unicode_terminals' of https://github.com/julienmalard/lark into julienmalard-recons_unicode_terminals

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.2
Erez Sh pirms 3 gadiem
vecāks
revīzija
423cde3da3
5 mainītis faili ar 74 papildinājumiem un 13 dzēšanām
  1. +2
    -2
      lark/load_grammar.py
  2. +2
    -5
      lark/reconstruct.py
  3. +25
    -0
      lark/utils.py
  4. +1
    -1
      tests/test_nearley/nearley
  5. +44
    -5
      tests/test_reconstructor.py

+ 2
- 2
lark/load_grammar.py Parādīt failu

@@ -7,7 +7,7 @@ from io import open
import pkgutil
from ast import literal_eval

from .utils import bfs, Py36, logger, classify_bool
from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start
from .lexer import Token, TerminalDef, PatternStr, PatternRE

from .parse_tree_builder import ParseTreeBuilder
@@ -332,7 +332,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
try:
term_name = _TERMINAL_NAMES[value]
except KeyError:
if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
if is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set:
with suppress(UnicodeEncodeError):
value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names
term_name = value.upper()


+ 2
- 5
lark/reconstruct.py Parādīt failu

@@ -8,6 +8,7 @@ from .lexer import Token, PatternStr
from .grammar import Terminal, NonTerminal

from .tree_matcher import TreeMatcher, is_discarded_terminal
from .utils import is_id_continue

def is_iter_empty(i):
try:
@@ -56,10 +57,6 @@ class WriteTokensTransformer(Transformer_InPlace):
return to_write


def _isalnum(x):
# Categories defined here: https://www.python.org/dev/peps/pep-3131/
return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']

class Reconstructor(TreeMatcher):
"""
A Reconstructor that will, given a full parse Tree, generate source code.
@@ -97,7 +94,7 @@ class Reconstructor(TreeMatcher):
y = []
prev_item = ''
for item in x:
if prev_item and item and _isalnum(prev_item[-1]) and _isalnum(item[0]):
if prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]):
y.append(' ')
y.append(item)
prev_item = item


+ 25
- 0
lark/utils.py Parādīt failu

@@ -1,3 +1,4 @@
import unicodedata
import os
from functools import reduce
from collections import deque
@@ -15,6 +16,7 @@ Py36 = (sys.version_info[:2] >= (3, 6))

NO_VALUE = object()


def classify(seq, key=None, value=None):
d = {}
for item in seq:
@@ -169,6 +171,29 @@ def get_regexp_width(expr):
###}


_ID_START = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'
_ID_CONTINUE = _ID_START + ('Nd', 'Nl',)

def _test_unicode_category(s, categories):
if len(s) != 1:
return all(_test_unicode_category(char, categories) for char in s)
return s == '_' or unicodedata.category(s) in categories

def is_id_continue(s):
"""
Checks if all characters in `s` are alphanumeric characters (Unicode standard, so diacritics, indian vowels, non-latin
numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. See PEP 3131 for details.
"""
return _test_unicode_category(s, _ID_CONTINUE)

def is_id_start(s):
"""
Checks if all characters in `s` are alphabetic characters (Unicode standard, so diacritics, indian vowels, non-latin
numbers, etc. all pass). Synonymous with a Python `ID_START` identifier. See PEP 3131 for details.
"""
return _test_unicode_category(s, _ID_START)


def dedup_list(l):
"""Given a list (l) will removing duplicates from the list,
preserving the original order of the list. Assumes that


+ 1
- 1
tests/test_nearley/nearley

@@ -1 +1 @@
Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44
Subproject commit cf8925f729bde741a3076c5856c0c0862bc7f5de

+ 44
- 5
tests/test_reconstructor.py Parādīt failu

@@ -1,17 +1,22 @@
# coding=utf-8

import json
import sys
import unittest
from unittest import TestCase

from lark import Lark
from lark.reconstruct import Reconstructor


common = """
%import common (WS_INLINE, NUMBER, WORD)
%ignore WS_INLINE
"""


def _remove_ws(s):
return s.replace(' ', '').replace('\n','')
return s.replace(' ', '').replace('\n', '')


class TestReconstructor(TestCase):

@@ -22,7 +27,6 @@ class TestReconstructor(TestCase):
self.assertEqual(_remove_ws(code), _remove_ws(new))

def test_starred_rule(self):

g = """
start: item*
item: NL
@@ -38,7 +42,6 @@ class TestReconstructor(TestCase):
self.assert_reconstruct(g, code)

def test_starred_group(self):

g = """
start: (rule | NL)*
rule: WORD ":" NUMBER
@@ -52,7 +55,6 @@ class TestReconstructor(TestCase):
self.assert_reconstruct(g, code)

def test_alias(self):

g = """
start: line*
line: NL
@@ -140,6 +142,43 @@ class TestReconstructor(TestCase):
new_json = Reconstructor(json_parser).reconstruct(tree)
self.assertEqual(json.loads(new_json), json.loads(test_json))

@unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.")
def test_switch_grammar_unicode_terminal(self):
    """
    This test checks that a parse tree built with a grammar containing only ascii characters can be reconstructed
    with a grammar that has unicode rules (or vice versa). The original bug assigned ANON terminals to unicode
    keywords, which offsets the ANON terminal count in the unicode grammar and causes subsequent identical ANON
    tokens (e.g., `+=`) to mis-match between the two grammars.
    """

    # ASCII-keyword grammar: used to parse the input.
    # `\\s` (rather than `\s`) keeps the same runtime regex while avoiding
    # Python's invalid-escape-sequence DeprecationWarning.
    g1 = """
    start: (NL | stmt)*
    stmt: "keyword" var op var
    !op: ("+=" | "-=" | "*=" | "/=")
    var: WORD
    NL: /(\\r?\\n)+\\s*/
    """ + common

    # Identical grammar except the keyword is a unicode (Tamil) string:
    # used to reconstruct and re-parse.
    g2 = """
    start: (NL | stmt)*
    stmt: "குறிப்பு" var op var
    !op: ("+=" | "-=" | "*=" | "/=")
    var: WORD
    NL: /(\\r?\\n)+\\s*/
    """ + common

    code = """
    keyword x += y
    """

    l1 = Lark(g1, parser='lalr')
    l2 = Lark(g2, parser='lalr')
    r = Reconstructor(l2)

    tree = l1.parse(code)
    code2 = r.reconstruct(tree)
    # assertEqual (not a bare assert) so a mismatch reports both trees.
    self.assertEqual(l2.parse(code2), tree)


if __name__ == '__main__':
unittest.main()

Notiek ielāde…
Atcelt
Saglabāt