
Refactored TokenDef to store Pattern(Str/RE)

Erez Shinan, 8 years ago
commit 5236e4a32a
4 changed files with 94 additions and 84 deletions:
  1. lark/common.py  (+32, -0)
  2. lark/lexer.py  (+13, -34)
  3. lark/load_grammar.py  (+40, -50)
  4. tests/test_parser.py  (+9, -0)

lark/common.py  (+32, -0)

@@ -1,3 +1,4 @@
+import re
 
 class GrammarError(Exception):
     pass
@@ -43,3 +44,34 @@ class ParserConf:
         self.rules = rules
         self.callback = callback
         self.start = start
+
+
+class Pattern(object):
+    def __init__(self, value):
+        self.value = value
+
+    def __repr__(self):
+        return repr(self.value)
+
+
+class PatternStr(Pattern):
+    def to_regexp(self):
+        return re.escape(self.value)
+
+    priority = 0
+
+class PatternRE(Pattern):
+    def to_regexp(self):
+        return self.value
+
+    priority = 1
+
+
+class TokenDef(object):
+    def __init__(self, name, pattern):
+        assert isinstance(pattern, Pattern), pattern
+        self.name = name
+        self.pattern = pattern
+
+    def __repr__(self):
+        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)


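For reference, a minimal usage sketch of the Pattern / PatternStr / PatternRE / TokenDef classes added above. The token names and values are made up for illustration, and it assumes the lark package from this commit is on the import path:

# Hypothetical tokens, just to show how a TokenDef now wraps either kind of pattern.
from lark.common import TokenDef, PatternStr, PatternRE

plus = TokenDef('PLUS', PatternStr('+'))
number = TokenDef('NUMBER', PatternRE(r'\d+'))

assert plus.pattern.to_regexp() == r'\+'     # literal strings get re.escape()d
assert number.pattern.to_regexp() == r'\d+'  # regexps pass through unchanged
assert (plus.pattern.priority, number.pattern.priority) == (0, 1)

The lexer no longer cares which subclass a token definition is; it only asks the pattern for its regexp and priority.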
lark/lexer.py  (+13, -34)

@@ -3,33 +3,11 @@
 import re
 
 from .utils import Str, classify, STRING_TYPE
-from .common import is_terminal
+from .common import is_terminal, PatternStr, PatternRE, TokenDef
 
 class LexError(Exception):
     pass
 
-class TokenDef(object):
-    def __init__(self, name, value):
-        assert isinstance(value, STRING_TYPE), value
-        self.name = name
-        self.value = value
-
-    def __repr__(self):
-        return '%s(%r, %r)' % (type(self).__name__, self.name, self.value)
-
-class TokenDef__Str(TokenDef):
-    def to_regexp(self):
-        return re.escape(self.value)
-
-    priority = 0
-
-class TokenDef__Regexp(TokenDef):
-    def to_regexp(self):
-        return self.value
-
-    priority = 1
-
-
 class UnexpectedInput(LexError):
     def __init__(self, seq, lex_pos, line, column):
         context = seq[lex_pos:lex_pos+5]
@@ -75,17 +53,18 @@ def _create_unless_callback(strs):
     return unless_callback
 
 def _create_unless(tokens):
-    tokens_by_type = classify(tokens, type)
+    tokens_by_type = classify(tokens, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
     callback = {}
-    for retok in tokens_by_type.get(TokenDef__Regexp, []):
+    for retok in tokens_by_type.get(PatternRE, []):
         unless = {}
-        for strtok in tokens_by_type.get(TokenDef__Str, []):
-            m = re.match(retok.value, strtok.value)
-            if m and m.group(0) == strtok.value:
+        for strtok in tokens_by_type.get(PatternStr, []):
+            s = strtok.pattern.value
+            m = re.match(retok.pattern.value, s)
+            if m and m.group(0) == s:
                 embedded_strs.add(strtok.name)
-                unless[strtok.value] = strtok.name
+                unless[s] = strtok.name
         if unless:
             callback[retok.name] = _create_unless_callback(unless)


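To make the change to _create_unless concrete, here is a self-contained sketch of the new grouping step. The classify helper below is a stand-in for lark.utils.classify (group items by a key function), and the token values are hypothetical:

import re
from collections import defaultdict
from lark.common import TokenDef, PatternStr, PatternRE

def classify(seq, key):
    # Stand-in for lark.utils.classify: group items by key(item).
    d = defaultdict(list)
    for item in seq:
        d[key(item)].append(item)
    return dict(d)

tokens = [TokenDef('NAME', PatternRE(r'\w+')), TokenDef('IF', PatternStr('if'))]

# Tokens are now classified by the type of their pattern, not by TokenDef subclass.
by_type = classify(tokens, lambda t: type(t.pattern))
strtok, retok = by_type[PatternStr][0], by_type[PatternRE][0]

# 'if' is fully matched by NAME's regexp, so NAME gets an "unless" callback
# that re-types such matches back to IF.
m = re.match(retok.pattern.value, strtok.pattern.value)
assert m and m.group(0) == strtok.pattern.value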
@@ -104,21 +83,21 @@ class Lexer(object):
         # Sanitization
         for t in tokens:
             try:
-                re.compile(t.to_regexp())
+                re.compile(t.pattern.to_regexp())
             except:
-                raise LexError("Cannot compile token: %s: %s" % t)
+                raise LexError("Cannot compile token: %s: %s" % (t.name, t.pattern))
 
         token_names = {t.name for t in tokens}
         assert all(t in token_names for t in ignore)
 
         # Init
-        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.to_regexp())]
+        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
         self.ignore_types = [t for t in ignore]
 
         tokens, self.callback = _create_unless(tokens)
         assert all(self.callback.values())
 
-        tokens.sort(key=lambda x:(x.priority, len(x.value)), reverse=True)
+        tokens.sort(key=lambda x:(x.pattern.priority, len(x.pattern.value)), reverse=True)
 
         self.tokens = tokens

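The new sort key reads the same as the old one, just through the pattern: sort descending by (priority, length), so PatternRE tokens (priority 1) are placed before PatternStr tokens (priority 0), and longer values before shorter ones within each group. A small sketch with hypothetical tokens:

from lark.common import TokenDef, PatternStr, PatternRE

tokens = [TokenDef('PLUS', PatternStr('+')),
          TokenDef('NUMBER', PatternRE(r'\d+')),
          TokenDef('ARROW', PatternStr('->'))]

# Same ordering rule as before the refactor, now reached via t.pattern.
tokens.sort(key=lambda t: (t.pattern.priority, len(t.pattern.value)), reverse=True)
assert [t.name for t in tokens] == ['NUMBER', 'ARROW', 'PLUS']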

@@ -132,7 +111,7 @@ class Lexer(object):
         mres = []
         while tokens:
             try:
-                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.to_regexp()) for t in tokens[:max_size]))
+                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()) for t in tokens[:max_size]))
             except AssertionError:  # Yes, this is what Python provides us.. :/
                 return self._build_mres(tokens, max_size//2)




lark/load_grammar.py  (+40, -50)

@@ -3,12 +3,12 @@ from itertools import chain
 import re
 import codecs
 
-from .lexer import Lexer, Token, UnexpectedInput, TokenDef__Str, TokenDef__Regexp
+from .lexer import Lexer, Token, UnexpectedInput
 
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR
 from .parsers.lalr_parser import UnexpectedToken
-from .common import is_terminal, GrammarError, LexerConf, ParserConf
+from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
 
 from .tree import Tree as T, Transformer, InlineTransformer, Visitor


@@ -232,18 +232,19 @@ class SimplifyTree(InlineTransformer):
         return tokenmods + [value]
 
 class ExtractAnonTokens(InlineTransformer):
-    def __init__(self, tokens, token_set):
+    def __init__(self, tokens):
         self.tokens = tokens
-        self.token_set = token_set
-        self.token_reverse = {td.value: td.name for td in tokens}
+        self.token_set = {td.name for td in self.tokens}
+        self.str_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternStr)}
+        self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
         self.i = 0
 
     def tokenvalue(self, token):
-        value = token.value[1:-1]
         if token.type == 'STRING':
+            value = token.value[1:-1]
             try:
                 # If already defined, use the user-defined token name
-                token_name = self.token_reverse[value]
+                token_name = self.str_reverse[value]
             except KeyError:
                 # Try to assign an indicative anon-token name, otherwise use a numbered name
                 try:
@@ -257,40 +258,32 @@ class ExtractAnonTokens(InlineTransformer):
                 token_name = '__' + token_name
 
         elif token.type == 'REGEXP':
-            token_name = 'ANONRE_%d' % self.i
-            self.i += 1
+            value = token.value
+            if value in self.re_reverse: # Kind of a wierd placement
+                token_name = self.re_reverse[value]
+            else:
+                token_name = 'ANONRE_%d' % self.i
+                self.i += 1
         else:
             assert False, token
 
-        if value in self.token_reverse: # Kind of a wierd placement
-            token_name = self.token_reverse[value]
-
         if token_name not in self.token_set:
             self.token_set.add(token_name)
             if token.type == 'STRING':
-                self.tokens.append(TokenDef__Str(token_name, token[1:-1]))
+                pattern = PatternStr(value)
+                assert value not in self.str_reverse
+                self.str_reverse[value] = token_name
             else:
-                self.tokens.append(TokenDef__Regexp(token_name, token[1:-1]))
-            assert value not in self.token_reverse, value
-            self.token_reverse[value] = token_name
-
-        return Token('TOKEN', token_name, -1)
-
-
-class TokenValue(object):
-    def __init__(self, value):
-        self.value = value
-
-class TokenValue__Str(TokenValue):
-    def to_regexp(self):
-        return re.escape(self.value)
-
-class TokenValue__Regexp(TokenValue):
-    def to_regexp(self):
-        return self.value
-
+                pattern = PatternRE(value)
+                assert value not in self.re_reverse
+                self.re_reverse[value] = token_name
+
+            self.tokens.append(TokenDef(token_name, pattern))
+
+        return Token('TOKEN', token_name, -1)
 
-class TokenTreeToRegexp(Transformer):
+class TokenTreeToPattern(Transformer):
     def tokenvalue(self, tv):
         tv ,= tv
         value = tv.value[1:-1]
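The point of splitting token_reverse into str_reverse and re_reverse is that an anonymous pattern in a rule can now be matched back to a user-defined token of the same kind, for regexps as well as strings (which is what the new test further below exercises). A minimal sketch of the lookup, with a hypothetical grammar, assuming the lark package from this commit is importable:

from lark.common import TokenDef, PatternStr, PatternRE

tokens = [TokenDef('A', PatternRE('a')), TokenDef('COMMA', PatternStr(','))]
str_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternStr)}
re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}

assert re_reverse['a'] == 'A'        # an anonymous /a/ reuses the user-defined name A
assert str_reverse[','] == 'COMMA'   # an anonymous "," reuses the user-defined name COMMA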
@@ -300,30 +293,30 @@ class TokenTreeToRegexp(Transformer):
             value = unicode_escape(value)[0]
 
         if tv.type == 'REGEXP':
-            return TokenValue__Regexp(value)
+            return PatternRE(value)
         elif tv.type == 'STRING':
-            return TokenValue__Str(value)
+            return PatternStr(value)
 
         assert False
 
     def expansion(self, items):
         if len(items) == 1:
             return items[0]
-        return TokenValue__Regexp(''.join(i.to_regexp() for i in items))
+        return PatternRE(''.join(i.to_regexp() for i in items))
     def expansions(self, exps):
         if len(exps) == 1:
             return exps[0]
-        return TokenValue__Regexp('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))
+        return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))
     def range(self, items):
         assert all(i.type=='STRING' for i in items)
         items = [i[1:-1] for i in items]
         start, end = items
         assert len(start) == len(end) == 1, (start, end)
-        return TokenValue__Regexp('[%s-%s]' % (start, end))
+        return PatternRE('[%s-%s]' % (start, end))
 
     def expr(self, args):
         inner, op = args
-        return TokenValue__Regexp('(?:%s)%s' % (inner.to_regexp(), op))
+        return PatternRE('(?:%s)%s' % (inner.to_regexp(), op))
 
 class Grammar:
     def __init__(self, rule_defs, token_defs, extra):
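TokenTreeToPattern combines sub-patterns purely through to_regexp(), so literal strings stay escaped while regexp fragments are used verbatim. A short sketch with hypothetical inputs, mirroring the expansion/expansions cases in the hunk above:

from lark.common import PatternStr, PatternRE

items = [PatternStr('+'), PatternRE(r'\d+')]

seq = PatternRE(''.join(i.to_regexp() for i in items))               # like expansion()
alt = PatternRE('(?:%s)' % '|'.join(i.to_regexp() for i in items))   # like expansions()

assert seq.to_regexp() == r'\+\d+'
assert alt.to_regexp() == r'(?:\+|\d+)'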
@@ -339,32 +332,28 @@ class Grammar:
         # =================
         # Compile Tokens
         # =================
-        token_to_regexp = TokenTreeToRegexp()
+        token_tree_to_pattern = TokenTreeToPattern()
 
         # Convert tokens to strings/regexps
         tokens = []
         for name, token_tree in tokendefs:
-            regexp = token_to_regexp.transform(token_tree)
-            if isinstance(regexp, TokenValue__Str):
-                tokendef = TokenDef__Str(name, regexp.value)
-            else:
-                tokendef = TokenDef__Regexp(name, regexp.to_regexp())
-            tokens.append(tokendef)
+            pattern = token_tree_to_pattern.transform(token_tree)
+            tokens.append(TokenDef(name, pattern))
 
         # Resolve regexp assignments of the form /..${X}../
         # XXX This is deprecated, since you can express most regexps with EBNF
         # XXX Also, since this happens after import, it can be a source of bugs
-        token_dict = {td.name: td.to_regexp() for td in tokens}
+        token_dict = {td.name: td.pattern.to_regexp() for td in tokens}
         while True:
             changed = False
             for t in tokens:
-                if isinstance(t, TokenDef__Regexp):
-                    sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], t.value)
+                if isinstance(t.pattern, PatternRE):
+                    sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], t.pattern.value)
                     if sp:
                         value = ''.join(token_dict[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                         for x in sp)
-                        if value != t.value:
-                            t.value = value
+                        if value != t.pattern.value:
+                            t.pattern.value = value
                             changed = True
             if not changed:
                 break
@@ -372,7 +361,7 @@
         # =================
         # Compile Rules
         # =================
-        extract_anon = ExtractAnonTokens(tokens, set(token_dict))
+        extract_anon = ExtractAnonTokens(tokens)
         ebnf_to_bnf = EBNF_to_BNF()
         simplify_rule = SimplifyRule_Visitor()
         rule_tree_to_text = RuleTreeToText()
@@ -439,7 +428,7 @@ def resolve_token_references(token_defs):
 
 class GrammarLoader:
     def __init__(self):
-        tokens = [TokenDef__Regexp(name, value) for name, value in TOKENS.items()]
+        tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]
 
         d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
         rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
@@ -493,6 +482,7 @@ class GrammarLoader:
         for name, _ in token_defs:
             if name.startswith('__'):
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
+
         # Handle ignore tokens
         ignore_names = []
         for i, t in enumerate(ignore):


tests/test_parser.py  (+9, -0)

@@ -317,15 +317,24 @@ def _make_parser_test(PARSER):
            That means that "a" is not filtered out, despite being an 'immediate string'.
            Whether or not this is the intuitive behavior, I'm not sure yet.
 
+            Perhaps the right thing to do is report a collision (if such is relevant)
+
            -Erez
            """
             g = _Lark("""start: "a"
                          A: "a" """)
             x = g.parse('a')
             self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous')
             self.assertEqual(x.children[0].type, "A")
 
+            g = _Lark("""start: /a/
+                         A: /a/ """)
+            x = g.parse('a')
+            self.assertEqual(len(x.children), 1, '/a/ should not be considered anonymous')
+            self.assertEqual(x.children[0].type, "A")
+
         def test_maybe(self):
             g = _Lark("""start: ["a"] """)
             x = g.parse('a')

