
Refactored TokenDef to store Pattern(Str/RE)

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan, 7 years ago
commit 5236e4a32a
4 changed files with 94 additions and 84 deletions
  1. lark/common.py (+32, -0)
  2. lark/lexer.py (+13, -34)
  3. lark/load_grammar.py (+40, -50)
  4. tests/test_parser.py (+9, -0)

lark/common.py (+32, -0)

@@ -1,3 +1,4 @@
+import re
 
 class GrammarError(Exception):
     pass
@@ -43,3 +44,34 @@ class ParserConf:
         self.rules = rules
         self.callback = callback
         self.start = start
+
+
+class Pattern(object):
+    def __init__(self, value):
+        self.value = value
+
+    def __repr__(self):
+        return repr(self.value)
+
+class PatternStr(Pattern):
+    def to_regexp(self):
+        return re.escape(self.value)
+
+    priority = 0
+
+class PatternRE(Pattern):
+    def to_regexp(self):
+        return self.value
+
+    priority = 1
+
+class TokenDef(object):
+    def __init__(self, name, pattern):
+        assert isinstance(pattern, Pattern), pattern
+        self.name = name
+        self.pattern = pattern
+
+    def __repr__(self):
+        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
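
For orientation, a minimal sketch of how the refactored classes fit together (the token names here are made up; the import path assumes the classes live in lark/common.py as added above). A TokenDef no longer knows how to build a regexp itself; it delegates to its Pattern, which also carries the matching priority:

from lark.common import TokenDef, PatternStr, PatternRE

plus = TokenDef('PLUS', PatternStr('+'))       # literal string token
num = TokenDef('NUMBER', PatternRE('[0-9]+'))  # regexp token

assert plus.pattern.to_regexp() == '\\+'       # re.escape() applied to the literal
assert num.pattern.to_regexp() == '[0-9]+'     # regexps pass through unchanged
assert (plus.pattern.priority, num.pattern.priority) == (0, 1)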


lark/lexer.py (+13, -34)

@@ -3,33 +3,11 @@
 import re
 
 from .utils import Str, classify, STRING_TYPE
-from .common import is_terminal
+from .common import is_terminal, PatternStr, PatternRE, TokenDef
 
 class LexError(Exception):
     pass
 
-class TokenDef(object):
-    def __init__(self, name, value):
-        assert isinstance(value, STRING_TYPE), value
-        self.name = name
-        self.value = value
-
-    def __repr__(self):
-        return '%s(%r, %r)' % (type(self).__name__, self.name, self.value)
-
-class TokenDef__Str(TokenDef):
-    def to_regexp(self):
-        return re.escape(self.value)
-
-    priority = 0
-
-class TokenDef__Regexp(TokenDef):
-    def to_regexp(self):
-        return self.value
-
-    priority = 1
-
-
 class UnexpectedInput(LexError):
     def __init__(self, seq, lex_pos, line, column):
         context = seq[lex_pos:lex_pos+5]
@@ -75,17 +53,18 @@ def _create_unless_callback(strs):
     return unless_callback
 
 def _create_unless(tokens):
-    tokens_by_type = classify(tokens, type)
+    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
     callback = {}
-    for retok in tokens_by_type.get(TokenDef__Regexp, []):
+    for retok in tokens_by_type.get(PatternRE, []):
         unless = {}
-        for strtok in tokens_by_type.get(TokenDef__Str, []):
-            m = re.match(retok.value, strtok.value)
-            if m and m.group(0) == strtok.value:
+        for strtok in tokens_by_type.get(PatternStr, []):
+            s = strtok.pattern.value
+            m = re.match(retok.pattern.value, s)
+            if m and m.group(0) == s:
                 embedded_strs.add(strtok.name)
-                unless[strtok.value] = strtok.name
+                unless[s] = strtok.name
         if unless:
            callback[retok.name] = _create_unless_callback(unless)
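
The embedded-string check above is just re.match against the whole literal; a standalone sketch of that test with made-up patterns (lark's classify() and _create_unless_callback() helpers are not reproduced here):

import re

name_re = '[a-z]+'                       # PatternRE value of a hypothetical NAME token
literals = {'if': 'IF', 'foo1': 'FOO1'}  # PatternStr value -> token name

unless = {}
for s, tok_name in literals.items():
    m = re.match(name_re, s)
    if m and m.group(0) == s:   # the regexp matches the *entire* literal
        unless[s] = tok_name    # so a NAME match of 'if' must be re-typed to IF

assert unless == {'if': 'IF'}   # 'foo1' is not fully covered by [a-z]+, so it stays separate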

@@ -104,21 +83,21 @@ class Lexer(object):
         # Sanitization
         for t in tokens:
             try:
-                re.compile(t.to_regexp())
+                re.compile(t.pattern.to_regexp())
             except:
-                raise LexError("Cannot compile token: %s: %s" % t)
+                raise LexError("Cannot compile token: %s: %s" % (t.name, t.pattern))
 
         token_names = {t.name for t in tokens}
         assert all(t in token_names for t in ignore)
 
         # Init
-        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.to_regexp())]
+        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
         self.ignore_types = [t for t in ignore]
 
         tokens, self.callback = _create_unless(tokens)
         assert all(self.callback.values())
 
-        tokens.sort(key=lambda x:(x.priority, len(x.value)), reverse=True)
+        tokens.sort(key=lambda x:(x.pattern.priority, len(x.pattern.value)), reverse=True)
 
         self.tokens = tokens

@@ -132,7 +111,7 @@ class Lexer(object):
         mres = []
         while tokens:
             try:
-                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.to_regexp()) for t in tokens[:max_size]))
+                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()) for t in tokens[:max_size]))
             except AssertionError:  # Yes, this is what Python provides us.. :/
                 return self._build_mres(tokens, max_size//2)
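
The mre built here is just an alternation of named groups, so the winning token type can be read back from Match.lastgroup; a minimal illustration with made-up tokens (already sorted by priority and length, as the Lexer does above):

import re

tokens = [('NUMBER', '[0-9]+'), ('PLUS', re.escape('+'))]
mre = re.compile('|'.join('(?P<%s>%s)' % (name, regexp) for name, regexp in tokens))

m = mre.match('42+1')
assert m.lastgroup == 'NUMBER' and m.group(0) == '42'
assert mre.match('+abc').lastgroup == 'PLUS'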



lark/load_grammar.py (+40, -50)

@@ -3,12 +3,12 @@ from itertools import chain
 import re
 import codecs
 
-from .lexer import Lexer, Token, UnexpectedInput, TokenDef__Str, TokenDef__Regexp
+from .lexer import Lexer, Token, UnexpectedInput
 
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR
 from .parsers.lalr_parser import UnexpectedToken
-from .common import is_terminal, GrammarError, LexerConf, ParserConf
+from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
 
 from .tree import Tree as T, Transformer, InlineTransformer, Visitor

@@ -232,18 +232,19 @@ class SimplifyTree(InlineTransformer):
         return tokenmods + [value]
 
 class ExtractAnonTokens(InlineTransformer):
-    def __init__(self, tokens, token_set):
+    def __init__(self, tokens):
         self.tokens = tokens
-        self.token_set = token_set
-        self.token_reverse = {td.value: td.name for td in tokens}
+        self.token_set = {td.name for td in self.tokens}
+        self.str_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternStr)}
+        self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
         self.i = 0
 
     def tokenvalue(self, token):
-        value = token.value[1:-1]
         if token.type == 'STRING':
+            value = token.value[1:-1]
             try:
                 # If already defined, use the user-defined token name
-                token_name = self.token_reverse[value]
+                token_name = self.str_reverse[value]
             except KeyError:
                 # Try to assign an indicative anon-token name, otherwise use a numbered name
                 try:
@@ -257,40 +258,32 @@ class ExtractAnonTokens(InlineTransformer):
                 token_name = '__' + token_name
 
         elif token.type == 'REGEXP':
-            token_name = 'ANONRE_%d' % self.i
-            self.i += 1
+            value = token.value
+            if value in self.re_reverse: # Kind of a wierd placement
+                token_name = self.re_reverse[value]
+            else:
+                token_name = 'ANONRE_%d' % self.i
+                self.i += 1
         else:
             assert False, token
 
-        if value in self.token_reverse: # Kind of a wierd placement
-            token_name = self.token_reverse[value]
-
         if token_name not in self.token_set:
             self.token_set.add(token_name)
 
             if token.type == 'STRING':
-                self.tokens.append(TokenDef__Str(token_name, token[1:-1]))
+                pattern = PatternStr(value)
+                assert value not in self.str_reverse
+                self.str_reverse[value] = token_name
             else:
-                self.tokens.append(TokenDef__Regexp(token_name, token[1:-1]))
-            assert value not in self.token_reverse, value
-            self.token_reverse[value] = token_name
 
-        return Token('TOKEN', token_name, -1)
+                pattern = PatternRE(value)
+                assert value not in self.re_reverse
+                self.re_reverse[value] = token_name
 
-class TokenValue(object):
-    def __init__(self, value):
-        self.value = value
+            self.tokens.append(TokenDef(token_name, pattern))
 
-class TokenValue__Str(TokenValue):
-    def to_regexp(self):
-        return re.escape(self.value)
+        return Token('TOKEN', token_name, -1)
 
-class TokenValue__Regexp(TokenValue):
-    def to_regexp(self):
-        return self.value
-
-class TokenTreeToRegexp(Transformer):
+class TokenTreeToPattern(Transformer):
     def tokenvalue(self, tv):
         tv ,= tv
         value = tv.value[1:-1]
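
The net effect of the new str_reverse/re_reverse maps is that an anonymous literal or regexp whose pattern already belongs to a user-defined token reuses that token's name instead of receiving an ANONSTR_/ANONRE_ name (this is what the new test cases at the bottom exercise). A standalone sketch of the lookup, with illustrative token definitions:

from lark.common import TokenDef, PatternStr, PatternRE

tokens = [TokenDef('A', PatternRE('a')), TokenDef('COMMA', PatternStr(','))]
str_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternStr)}
re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}

assert re_reverse.get('a') == 'A'       # an anonymous /a/ in a rule reuses the name A
assert str_reverse.get(',') == 'COMMA'  # an anonymous "," reuses COMMA
assert 'b' not in re_reverse            # an unseen pattern falls through to a generated name
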
@@ -300,30 +293,30 @@ class TokenTreeToRegexp(Transformer):
         value = unicode_escape(value)[0]
 
         if tv.type == 'REGEXP':
-            return TokenValue__Regexp(value)
+            return PatternRE(value)
         elif tv.type == 'STRING':
-            return TokenValue__Str(value)
+            return PatternStr(value)
 
         assert False
 
     def expansion(self, items):
         if len(items) == 1:
             return items[0]
-        return TokenValue__Regexp(''.join(i.to_regexp() for i in items))
+        return PatternRE(''.join(i.to_regexp() for i in items))
     def expansions(self, exps):
         if len(exps) == 1:
             return exps[0]
-        return TokenValue__Regexp('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))
+        return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))
     def range(self, items):
         assert all(i.type=='STRING' for i in items)
         items = [i[1:-1] for i in items]
         start, end = items
         assert len(start) == len(end) == 1, (start, end)
-        return TokenValue__Regexp('[%s-%s]' % (start, end))
+        return PatternRE('[%s-%s]' % (start, end))
 
     def expr(self, args):
         inner, op = args
-        return TokenValue__Regexp('(?:%s)%s' % (inner.to_regexp(), op))
+        return PatternRE('(?:%s)%s' % (inner.to_regexp(), op))
 
 class Grammar:
     def __init__(self, rule_defs, token_defs, extra):
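
Each TokenTreeToPattern method above only concatenates or wraps its children's regexps; composed by hand for a token tree equivalent to "a".."c" with a '+' repetition, it looks like this (a sketch using the Pattern classes added in lark/common.py):

import re
from lark.common import PatternRE

rng = PatternRE('[%s-%s]' % ('a', 'c'))               # what range() produces
tok = PatternRE('(?:%s)%s' % (rng.to_regexp(), '+'))  # what expr() wraps it into

assert tok.to_regexp() == '(?:[a-c])+'
assert re.match(tok.to_regexp(), 'abca').group(0) == 'abca'
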
@@ -339,32 +332,28 @@ class Grammar:
         # =================
         # Compile Tokens
         # =================
-        token_to_regexp = TokenTreeToRegexp()
+        token_tree_to_pattern = TokenTreeToPattern()
 
         # Convert tokens to strings/regexps
         tokens = []
         for name, token_tree in tokendefs:
-            regexp = token_to_regexp.transform(token_tree)
-            if isinstance(regexp, TokenValue__Str):
-                tokendef = TokenDef__Str(name, regexp.value)
-            else:
-                tokendef = TokenDef__Regexp(name, regexp.to_regexp())
-            tokens.append(tokendef)
+            pattern = token_tree_to_pattern.transform(token_tree)
+            tokens.append(TokenDef(name, pattern) )
 
         # Resolve regexp assignments of the form /..${X}../
         # XXX This is deprecated, since you can express most regexps with EBNF
         # XXX Also, since this happens after import, it can be a source of bugs
-        token_dict = {td.name: td.to_regexp() for td in tokens}
+        token_dict = {td.name: td.pattern.to_regexp() for td in tokens}
         while True:
             changed = False
             for t in tokens:
-                if isinstance(t, TokenDef__Regexp):
-                    sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], t.value)
+                if isinstance(t.pattern, PatternRE):
+                    sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], t.pattern.value)
                     if sp:
                         value = ''.join(token_dict[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                         for x in sp)
-                        if value != t.value:
-                            t.value = value
+                        if value != t.pattern.value:
+                            t.pattern.value = value
                             changed = True
             if not changed:
                 break
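
The deprecated ${X} substitution above works because the capturing group makes re.split keep the ${...} delimiters in its output; a standalone sketch with a made-up token_dict (the real token-name regexp from TOKENS['TOKEN'] is simplified to [A-Z_]+ here):

import re

token_dict = {'DIGIT': '[0-9]'}   # hypothetical, already-resolved token regexps
value = r'${DIGIT}+\.${DIGIT}*'   # a PatternRE value written in the old ${X} style

sp = re.split(r'(\$\{[A-Z_]+\})', value)
resolved = ''.join(token_dict[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                   for x in sp)

assert resolved == r'[0-9]+\.[0-9]*'
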
@@ -372,7 +361,7 @@
         # =================
         # Compile Rules
         # =================
-        extract_anon = ExtractAnonTokens(tokens, set(token_dict))
+        extract_anon = ExtractAnonTokens(tokens)
         ebnf_to_bnf = EBNF_to_BNF()
         simplify_rule = SimplifyRule_Visitor()
         rule_tree_to_text = RuleTreeToText()
@@ -439,7 +428,7 @@ def resolve_token_references(token_defs):
 
 class GrammarLoader:
     def __init__(self):
-        tokens = [TokenDef__Regexp(name, value) for name, value in TOKENS.items()]
+        tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]
 
         d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
         rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
@@ -493,6 +482,7 @@
         for name, _ in token_defs:
             if name.startswith('__'):
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
+
         # Handle ignore tokens
         ignore_names = []
         for i, t in enumerate(ignore):


tests/test_parser.py (+9, -0)

@@ -317,15 +317,24 @@ def _make_parser_test(PARSER):
         That means that "a" is not filtered out, despite being an 'immediate string'.
         Whether or not this is the intuitive behavior, I'm not sure yet.
 
         Perhaps the right thing to do is report a collision (if such is relevant)
 
         -Erez
         """
         g = _Lark("""start: "a"
                      A: "a" """)
         x = g.parse('a')
+
+        self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous')
+        self.assertEqual(x.children[0].type, "A")
+
+        g = _Lark("""start: /a/
+                     A: /a/ """)
+        x = g.parse('a')
+        self.assertEqual(len(x.children), 1, '/a/ should not be considered anonymous')
+        self.assertEqual(x.children[0].type, "A")
 
     def test_maybe(self):
         g = _Lark("""start: ["a"] """)
         x = g.parse('a')

