
Added an automatic 'unless' mechanism for (some) colliding tokens

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan, 7 years ago
Parent commit: 8b182b37c0
3 changed files with 44 additions and 11 deletions
  1. lark/lark.py (+21, -3)
  2. lark/load_grammar.py (+22, -7)
  3. lark/tests/test_parser.py (+1, -1)
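
In short: when a literal string token is also matched in full by a broader regexp token, the loader now tags the regexp token with an ('unless', {value: name}) flag and drops the "embedded" string token; the lexer then retags exact matches back to the string token's type. A minimal sketch of the flag's shape, with hypothetical token names (not lark's actual loader output):

    # (name, pattern, flags) triples, as threaded from load_grammar.py into lark.py.
    # 'NAME' and 'HELLO' are made-up tokens for illustration.
    re_token  = ('NAME', r'\w+', [('unless', {'Hello': 'HELLO'})])
    str_token = ('HELLO', 'Hello', [])   # detected as embedded in NAME, removed from the token list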

lark/lark.py (+21, -3)

@@ -5,6 +5,7 @@ import os
 from .utils import STRING_TYPE, inline_args
 from .load_grammar import load_grammar
 from .tree import Tree, Transformer
+from .common import GrammarError
 
 from .lexer import Lexer
 from .parse_tree_builder import ParseTreeBuilder
@@ -89,14 +90,31 @@ class Lark:
         self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
         self.parser = self._build_parser()
 
+    def _create_unless_callback(self, strs):
+        def f(t):
+            if t in strs:
+                t.type = strs[t]
+            return t
+        return f
+
     def _build_lexer(self):
         ignore_tokens = []
         tokens = []
+        callbacks = {}
         for name, value, flags in self.tokens:
-            if 'ignore' in flags:
-                ignore_tokens.append(name)
+            for flag in flags:
+                if flag == 'ignore':
+                    ignore_tokens.append(name)
+                elif flag == 'newline':
+                    pass    # TODO
+                elif isinstance(flag, tuple) and flag[0] == 'unless':
+                    _, strs = flag
+                    callbacks[name] = self._create_unless_callback(strs)
+                else:
+                    raise GrammarError("No such flag: %s" % flag)
 
             tokens.append((name, value))
-        return Lexer(tokens, {}, ignore=ignore_tokens)
+        return Lexer(tokens, callbacks, ignore=ignore_tokens)
 
 
     def _build_parser(self):
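
To see what the generated callback does in isolation, here is a minimal sketch; Tok is a hypothetical stand-in for lark's Token (a str subclass carrying a mutable .type):

    # Stand-in token: a str that also carries a token type.
    class Tok(str):
        def __new__(cls, type_, value):
            t = str.__new__(cls, value)
            t.type = type_
            return t

    strs = {'Hello': 'HELLO'}   # value -> replacement type, taken from the 'unless' flag

    def f(t):                   # same logic as the inner f in _create_unless_callback
        if t in strs:           # the token compares equal to its string value
            t.type = strs[t]
        return t

    assert f(Tok('NAME', 'Hello')).type == 'HELLO'       # exact literal: retyped
    assert f(Tok('NAME', 'HelloWorld')).type == 'NAME'   # broader match: left alone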


lark/load_grammar.py (+22, -7)

@@ -291,20 +291,22 @@ class GrammarLoader:
         extract_anon = ExtractAnonTokens(tokens, token_set)
         tree = extract_anon.transform(tree) # Adds to tokens
 
-        token_ref = {}
-        re_tokens = []
-        str_tokens = []
+        tokens2 = []
         for name, token, flags in tokens:
             value = token.value[1:-1]
             if '\u' in value:
                 # XXX for now, you can't mix unicode escaping and unicode characters at the same token
                 value = unicode_escape(value)[0]
+            tokens2.append((name, token.type, value, flags))
 
-            if token.type == 'STRING':
-                value = re.escape(value)
+        token_ref = {}
+        re_tokens = []
+        str_tokens = []
+        for name, type_, value, flags in tokens2:
+            if type_ == 'STRING':
                 str_tokens.append((name, value, flags))
             else:
-                assert token.type == 'REGEXP'
+                assert type_ == 'REGEXP'
                 sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
                 if sp:
                     value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                     for x in sp)
@@ -313,6 +315,20 @@ class GrammarLoader:
                 re_tokens.append((name, value, flags))
                 token_ref[name] = value
 
+        embedded_strs = set()
+        for re_name, re_value, re_flags in re_tokens:
+            unless = {}
+            for str_name, str_value, _sf in str_tokens:
+                m = re.match(re_value, str_value)
+                if m and m.group(0) == str_value:
+                    embedded_strs.add(str_name)
+                    assert not _sf, "You just broke Lark! Please email me with your grammar"
+                    unless[str_value] = str_name
+            if unless:
+                re_flags.append(('unless', unless))
+
+        str_tokens = [(n, re.escape(v), f) for n, v, f in str_tokens if n not in embedded_strs]
+
         str_tokens.sort(key=lambda x:len(x[1]), reverse=True)
         re_tokens.sort(key=lambda x:len(x[1]), reverse=True)
         tokens = str_tokens + re_tokens # Order is important!
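
The detection step above hinges on two things: re.match anchors at the start of the literal, and the m.group(0) == str_value check ensures the regexp consumes the literal exactly. A standalone sketch of just that loop, with hypothetical tokens:

    import re

    re_tokens  = [('NAME', r'\w+', [])]                      # regexp tokens: (name, pattern, flags)
    str_tokens = [('HELLO', 'Hello', []), ('DOT', '.', [])]  # string tokens: (name, value, flags)

    for re_name, re_value, re_flags in re_tokens:
        unless = {}
        for str_name, str_value, _sf in str_tokens:
            m = re.match(re_value, str_value)
            if m and m.group(0) == str_value:   # the regexp swallows the whole literal
                unless[str_value] = str_name
        if unless:
            re_flags.append(('unless', unless))

    print(re_tokens)   # [('NAME', '\\w+', [('unless', {'Hello': 'HELLO'})])]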
@@ -339,7 +355,6 @@ class GrammarLoader:
 
         rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}
 
         # ====================
         #  Verify correctness
         # ====================


lark/tests/test_parser.py (+1, -1)

@@ -267,7 +267,7 @@ class TestLalr(unittest.TestCase):
             """, parser='lalr')
         x = g.parse('Hello World')
         self.assertSequenceEqual(x.children, ['World'])
-        x = g.parse('HelloWorld')
+        x = g.parse('Hello HelloWorld')
         self.assertSequenceEqual(x.children, ['HelloWorld'])
 
     def test_undefined_rule(self):
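
The updated test exercises both sides of the new mechanism: the leading 'Hello' is an exact match for the literal and gets retagged, while 'HelloWorld' is matched in full by the word regexp and, not being an exact 'Hello', keeps its type (token names assumed; the test's grammar is not shown in this hunk). A quick check of the regexp side:

    import re
    # \w+ consumes the whole word; only an exact 'Hello' would be retagged by 'unless'.
    assert re.match(r'\w+', 'HelloWorld').group(0) == 'HelloWorld'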

