
A little bit of cleanup

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan 7 years ago
parent
commit
680449fb6b
4 changed files with 31 additions and 83 deletions
  1. +6  -0   lark/common.py
  2. +2  -7   lark/lexer.py
  3. +20 -26  lark/load_grammar.py
  4. +3  -50  lark/parser_frontends.py

+6 -0   lark/common.py

@@ -54,6 +54,12 @@ class Pattern(object):
     def __repr__(self):
         return repr(self.value)

+    # Pattern Hashing assumes all subclasses have a different priority!
+    def __hash__(self):
+        return hash((self.priority, self.value))
+    def __eq__(self, other):
+        return self.priority == other.priority and self.value == other.value
+
 class PatternStr(Pattern):
     def to_regexp(self):
         return re.escape(self.value)
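Review note: the new __hash__/__eq__ pair makes Pattern instances usable as dictionary keys, which the load_grammar.py change below relies on (token_reverse is keyed by the pattern object instead of by its value string). A minimal standalone sketch of the idea, not lark's actual module; the priority values on the subclasses are hypothetical, only the (priority, value) hashing scheme comes from the diff:

# Sketch: hashable patterns allow a single pattern -> token-name mapping.
class Pattern(object):
    def __init__(self, value):
        self.value = value
    # Pattern hashing assumes all subclasses have a different priority!
    def __hash__(self):
        return hash((self.priority, self.value))
    def __eq__(self, other):
        return self.priority == other.priority and self.value == other.value

class PatternStr(Pattern):
    priority = 0    # hypothetical priority, for illustration only

class PatternRE(Pattern):
    priority = 1    # hypothetical priority, for illustration only

token_reverse = {PatternStr('+'): 'PLUS', PatternRE('[0-9]+'): 'NUMBER'}
assert token_reverse[PatternStr('+')] == 'PLUS'     # equal patterns map to the same name
assert PatternStr('[0-9]+') != PatternRE('[0-9]+')  # same value, different priority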


+2 -7   lark/lexer.py

@@ -32,10 +32,7 @@ class Token(Str):

     @classmethod
     def new_borrow_pos(cls, type_, value, borrow_t):
-        inst = cls(type_, value, borrow_t.pos_in_stream)
-        inst.line = borrow_t.line
-        inst.column = borrow_t.column
-        return inst
+        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

     def __repr__(self):
         return 'Token(%s, %r)' % (self.type, self.value)

@@ -176,9 +173,7 @@ class ContextualLexer:
             try:
                 lexer = lexer_by_tokens[key]
             except KeyError:
-                accepts = set(accepts) # For python3
-                accepts |= set(ignore)
-                accepts |= set(always_accept)
+                accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
                 lexer = Lexer(state_tokens, ignore=ignore)
                 lexer_by_tokens[key] = lexer
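Review note: the one-line new_borrow_pos only works because Token's constructor already accepts line and column as keyword arguments. A simplified stand-in sketch (the real Token subclasses lark's Str; any names beyond those in the diff are illustrative):

# Simplified stand-in for lark's Token, enough to show the refactor.
class Token(str):
    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        inst = super(Token, cls).__new__(cls, value)
        inst.type = type_
        inst.value = value
        inst.pos_in_stream = pos_in_stream
        inst.line = line
        inst.column = column
        return inst

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        # Copy the position info of an existing token in a single constructor call.
        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

plus = Token('PLUS', '+', 10, line=2, column=5)
anon = Token.new_borrow_pos('__ANON_0', '+', plus)
assert (anon.line, anon.column) == (plus.line, plus.column)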


+20 -26  lark/load_grammar.py

@@ -234,8 +234,7 @@ class ExtractAnonTokens(InlineTransformer):
     def __init__(self, tokens):
         self.tokens = tokens
         self.token_set = {td.name for td in self.tokens}
-        self.str_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternStr)}
-        self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
+        self.token_reverse = {td.pattern: td.name for td in tokens}
         self.i = 0


@@ -244,7 +243,7 @@ class ExtractAnonTokens(InlineTransformer):
         if isinstance(p, PatternStr):
             try:
                 # If already defined, use the user-defined token name
-                token_name = self.str_reverse[value]
+                token_name = self.token_reverse[p]
             except KeyError:
                 # Try to assign an indicative anon-token name, otherwise use a numbered name
                 try:
@@ -264,8 +263,8 @@ class ExtractAnonTokens(InlineTransformer):
                 token_name = '__' + token_name

         elif isinstance(p, PatternRE):
-            if value in self.re_reverse: # Kind of a wierd placement
-                token_name = self.re_reverse[value]
+            if p in self.token_reverse: # Kind of a wierd placement
+                token_name = self.token_reverse[p]
             else:
                 token_name = 'ANONRE_%d' % self.i
                 self.i += 1
@@ -273,15 +272,9 @@ class ExtractAnonTokens(InlineTransformer):
             assert False, p

         if token_name not in self.token_set:
+            assert p not in self.token_reverse
             self.token_set.add(token_name)
-
-            if isinstance(p, PatternStr):
-                assert value not in self.str_reverse
-                self.str_reverse[value] = token_name
-            else:
-                assert value not in self.re_reverse
-                self.re_reverse[value] = token_name
-
+            self.token_reverse[p] = token_name
             self.tokens.append(TokenDef(token_name, p))

         return Token('TOKEN', token_name, -1)
@@ -323,6 +316,7 @@ class TokenTreeToPattern(Transformer):
         if len(items) == 1:
             return items[0]
         return PatternRE(''.join(i.to_regexp() for i in items))
+
     def expansions(self, exps):
         if len(exps) == 1:
             return exps[0]
@@ -361,7 +355,7 @@ class Grammar:
         for r, tree, _o in rule_defs:
             for exp in tree.find_data('expansion'):
                 exp.children = list(interleave(exp.children, expr))
-                if r == start: # TODO use GrammarRule or similar (RuleOptions?)
+                if r == start:
                     exp.children = [expr] + exp.children

         x = [T('expansion', [Token('RULE', x)]) for x in ignore_names]
@@ -369,9 +363,7 @@ class Grammar:
         rule_defs.append(('__ignore', _ignore_tree, None))
         # End of "ignore" section

-        for name, tree in self.token_defs:
-            rule_defs.append((name, tree, RuleOptions(keep_all_tokens=True)))
-
+        rule_defs += [(name, tree, RuleOptions(keep_all_tokens=True)) for name, tree in self.token_defs]
         token_defs = []

         tokens_to_convert = {name: '__token_'+name for name, tree, _ in rule_defs if is_terminal(name)}
@@ -382,12 +374,13 @@ class Grammar:
                 options = RuleOptions.new_from(options, filter_out=True)
             else:
                 options = RuleOptions.new_from(options, create_token=name)
-                name = tokens_to_convert[name]

+            name = tokens_to_convert.get(name, name)
             for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
                 for i, sym in enumerate(exp.children):
                     if sym in tokens_to_convert:
                         exp.children[i] = Token(sym.type, tokens_to_convert[sym])
+
             new_rule_defs.append((name, tree, options))

         rule_defs = new_rule_defs
@@ -470,13 +463,14 @@ class RuleOptions:
                    expand1=options and options.expand1,
                    **kw)

-def _extract_options_for_rule(name, expansions):
-    keep_all_tokens = name.startswith('!')
-    name = name.lstrip('!')
-    expand1 = name.startswith('?')
-    name = name.lstrip('?')
+    @classmethod
+    def from_rule(cls, name, expansions):
+        keep_all_tokens = name.startswith('!')
+        name = name.lstrip('!')
+        expand1 = name.startswith('?')
+        name = name.lstrip('?')

-    return name, expansions, RuleOptions(keep_all_tokens, expand1)
+        return name, expansions, cls(keep_all_tokens, expand1)



@@ -515,7 +509,7 @@ class GrammarLoader:
     def __init__(self):
         tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]

-        rules = [_extract_options_for_rule(name, x) for name, x in RULES.items()]
+        rules = [RuleOptions.from_rule(name, x) for name, x in RULES.items()]
         d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
         rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'], None)
@@ -595,7 +589,7 @@ class GrammarLoader:
                 raise GrammarError("Token '%s' defined more than once" % name)
             token_names.add(name)

-        rules = [_extract_options_for_rule(name, x) for name, x in rule_defs]
+        rules = [RuleOptions.from_rule(name, x) for name, x in rule_defs]

         rule_names = set()
         for name, _x, _o in rules:


+3 -50  lark/parser_frontends.py

@@ -38,10 +38,8 @@ class LALR_ContextualLexer:
         self.parser = lalr_parser.Parser(parser_conf)

         d = {idx:t.keys() for idx, t in self.parser.analysis.states_idx.items()}
-        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore,
-                                     always_accept=lexer_conf.postlex.always_accept
-                                                    if lexer_conf.postlex else ())
-
+        always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
+        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept)

     def parse(self, text):
         tokens = self.lexer.lex(text)
@@ -76,8 +74,7 @@ class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf):
         WithLexer.__init__(self, lexer_conf)

-        rules = [(n, self._prepare_expansion(x), a)
-                 for n,x,a in parser_conf.rules]
+        rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]

         self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))

@@ -102,50 +99,6 @@ def tokenize_text(text):
             new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos))
     return new_text

-class Nearley_NoLex:
-    def __init__(self, lexer_conf, parser_conf):
-        self.tokens_to_convert = {name: '__token_'+name for name, tree, _ in parser_conf.rules if is_terminal(name)}
-        rules = []
-        for name, exp, alias in parser_conf.rules:
-            name = self.tokens_to_convert.get(name, name)
-            exp = [self.tokens_to_convert.get(x, x) for x in exp]
-            rules.append((name, exp, alias))
-
-        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
-
-        rules = [{'name':n,
-                  'symbols': list(self._prepare_expansion(x)),
-                  'postprocess': getattr(parser_conf.callback, a)}
-                 for n,x,a in rules]
-
-        self.parser = nearley.Parser(rules, parser_conf.start)
-
-    def _prepare_expansion(self, expansion):
-        for sym in expansion:
-            if is_terminal(sym):
-                regexp = self.token_by_name[sym].pattern.to_regexp()
-                width = sre_parse.parse(regexp).getwidth()
-                if width != (1,1):
-                    raise GrammarError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width))
-                yield sym, re.compile(regexp)
-            else:
-                yield sym
-
-    def parse(self, text):
-        new_text = tokenize_text(text)
-        res = self.parser.parse(new_text)
-        assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
-        res = res[0]
-
-        class RestoreTokens(Transformer):
-            pass
-
-        for t in self.tokens_to_convert:
-            setattr(RestoreTokens, t, ''.join)
-
-        res = RestoreTokens().transform(res)
-        return res
-

 class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
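Review note: the deleted Nearley_NoLex used the same scannerless trick as the surviving Earley_NoLex: tokenize_text turns every input character into a CHAR token, so every terminal has to match exactly one character, which is what the sre_parse width check enforced. A minimal sketch of that check (GrammarError swapped for ValueError so the example is standalone):

import re
import sre_parse

def prepare_width_one(regexp):
    # Scannerless ("NoLex") parsing feeds the grammar one token per input character,
    # so a terminal's regexp must match exactly one character.
    width = sre_parse.parse(regexp).getwidth()
    if width != (1, 1):
        raise ValueError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width))
    return re.compile(regexp)

prepare_width_one('[a-z]')        # ok: always exactly one character
# prepare_width_one('[a-z]+')     # raises: width is (1, MAXREPEAT)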

