From 680449fb6bc2e850f703f794b0dd6b34a60682d9 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Fri, 3 Mar 2017 12:50:05 +0200
Subject: [PATCH] A little bit of cleanup

---
 lark/common.py           |  6 +++++
 lark/lexer.py            |  9 ++-----
 lark/load_grammar.py     | 46 +++++++++++++++-------------------
 lark/parser_frontends.py | 53 +++-------------------------------------
 4 files changed, 31 insertions(+), 83 deletions(-)

diff --git a/lark/common.py b/lark/common.py
index fee6d75..f0e1fb6 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -54,6 +54,12 @@ class Pattern(object):
     def __repr__(self):
         return repr(self.value)
 
+    # Pattern Hashing assumes all subclasses have a different priority!
+    def __hash__(self):
+        return hash((self.priority, self.value))
+    def __eq__(self, other):
+        return self.priority == other.priority and self.value == other.value
+
 class PatternStr(Pattern):
     def to_regexp(self):
         return re.escape(self.value)
diff --git a/lark/lexer.py b/lark/lexer.py
index 5991bf4..e17895b 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -32,10 +32,7 @@ class Token(Str):
 
     @classmethod
     def new_borrow_pos(cls, type_, value, borrow_t):
-        inst = cls(type_, value, borrow_t.pos_in_stream)
-        inst.line = borrow_t.line
-        inst.column = borrow_t.column
-        return inst
+        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)
 
     def __repr__(self):
         return 'Token(%s, %r)' % (self.type, self.value)
@@ -176,9 +173,7 @@ class ContextualLexer:
             try:
                 lexer = lexer_by_tokens[key]
             except KeyError:
-                accepts = set(accepts) # For python3
-                accepts |= set(ignore)
-                accepts |= set(always_accept)
+                accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
                 lexer = Lexer(state_tokens, ignore=ignore)
                 lexer_by_tokens[key] = lexer
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 8027857..95f0679 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -234,8 +234,7 @@ class ExtractAnonTokens(InlineTransformer):
     def __init__(self, tokens):
         self.tokens = tokens
         self.token_set = {td.name for td in self.tokens}
-        self.str_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternStr)}
-        self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
+        self.token_reverse = {td.pattern: td.name for td in tokens}
         self.i = 0
 
 
@@ -244,7 +243,7 @@ class ExtractAnonTokens(InlineTransformer):
         if isinstance(p, PatternStr):
             try:
                 # If already defined, use the user-defined token name
-                token_name = self.str_reverse[value]
+                token_name = self.token_reverse[p]
             except KeyError:
                 # Try to assign an indicative anon-token name, otherwise use a numbered name
                 try:
@@ -264,8 +263,8 @@ class ExtractAnonTokens(InlineTransformer):
             token_name = '__' + token_name
 
         elif isinstance(p, PatternRE):
-            if value in self.re_reverse: # Kind of a wierd placement
-                token_name = self.re_reverse[value]
+            if p in self.token_reverse: # Kind of a wierd placement
+                token_name = self.token_reverse[p]
             else:
                 token_name = 'ANONRE_%d' % self.i
                 self.i += 1
@@ -273,15 +272,9 @@ class ExtractAnonTokens(InlineTransformer):
             assert False, p
 
         if token_name not in self.token_set:
+            assert p not in self.token_reverse
             self.token_set.add(token_name)
-
-            if isinstance(p, PatternStr):
-                assert value not in self.str_reverse
-                self.str_reverse[value] = token_name
-            else:
-                assert value not in self.re_reverse
-                self.re_reverse[value] = token_name
-
+            self.token_reverse[p] = token_name
             self.tokens.append(TokenDef(token_name, p))
 
         return Token('TOKEN', token_name, -1)
@@ -323,6 +316,7 @@ class TokenTreeToPattern(Transformer):
         if len(items) == 1:
             return items[0]
         return PatternRE(''.join(i.to_regexp() for i in items))
+
     def expansions(self, exps):
         if len(exps) == 1:
             return exps[0]
@@ -361,7 +355,7 @@ class Grammar:
             for r, tree, _o in rule_defs:
                 for exp in tree.find_data('expansion'):
                     exp.children = list(interleave(exp.children, expr))
-                    if r == start: # TODO use GrammarRule or similar (RuleOptions?)
+                    if r == start:
                         exp.children = [expr] + exp.children
 
             x = [T('expansion', [Token('RULE', x)]) for x in ignore_names]
@@ -369,9 +363,7 @@ class Grammar:
             rule_defs.append(('__ignore', _ignore_tree, None))
             # End of "ignore" section
 
-            for name, tree in self.token_defs:
-                rule_defs.append((name, tree, RuleOptions(keep_all_tokens=True)))
-
+            rule_defs += [(name, tree, RuleOptions(keep_all_tokens=True)) for name, tree in self.token_defs]
             token_defs = []
 
             tokens_to_convert = {name: '__token_'+name for name, tree, _ in rule_defs if is_terminal(name)}
@@ -382,12 +374,13 @@ class Grammar:
                     options = RuleOptions.new_from(options, filter_out=True)
                 else:
                     options = RuleOptions.new_from(options, create_token=name)
+                name = tokens_to_convert[name]
 
-            name = tokens_to_convert.get(name, name)
             for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
                 for i, sym in enumerate(exp.children):
                     if sym in tokens_to_convert:
                         exp.children[i] = Token(sym.type, tokens_to_convert[sym])
+
             new_rule_defs.append((name, tree, options))
 
         rule_defs = new_rule_defs
@@ -470,13 +463,14 @@ class RuleOptions:
                    expand1=options and options.expand1,
                    **kw)
 
 
-def _extract_options_for_rule(name, expansions):
-    keep_all_tokens = name.startswith('!')
-    name = name.lstrip('!')
-    expand1 = name.startswith('?')
-    name = name.lstrip('?')
+    @classmethod
+    def from_rule(cls, name, expansions):
+        keep_all_tokens = name.startswith('!')
+        name = name.lstrip('!')
+        expand1 = name.startswith('?')
+        name = name.lstrip('?')
 
-    return name, expansions, RuleOptions(keep_all_tokens, expand1)
+        return name, expansions, cls(keep_all_tokens, expand1)
 
 
@@ -515,7 +509,7 @@ class GrammarLoader:
 
     def __init__(self):
         tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]
-        rules = [_extract_options_for_rule(name, x) for name, x in RULES.items()]
+        rules = [RuleOptions.from_rule(name, x) for name, x in RULES.items()]
         d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
         rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'], None)
@@ -595,7 +589,7 @@ class GrammarLoader:
                 raise GrammarError("Token '%s' defined more than once" % name)
             token_names.add(name)
 
-        rules = [_extract_options_for_rule(name, x) for name, x in rule_defs]
+        rules = [RuleOptions.from_rule(name, x) for name, x in rule_defs]
 
         rule_names = set()
         for name, _x, _o in rules:
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 2b5d4d2..1dfe14b 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -38,10 +38,8 @@ class LALR_ContextualLexer:
         self.parser = lalr_parser.Parser(parser_conf)
 
         d = {idx:t.keys() for idx, t in self.parser.analysis.states_idx.items()}
-        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore,
-                                     always_accept=lexer_conf.postlex.always_accept
-                                                   if lexer_conf.postlex else ())
-
+        always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
+        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept)
 
     def parse(self, text):
         tokens = self.lexer.lex(text)
@@ -76,8 +74,7 @@ class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf):
         WithLexer.__init__(self, lexer_conf)
 
-        rules = [(n, self._prepare_expansion(x), a)
-                 for n,x,a in parser_conf.rules]
+        rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]
 
         self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
 
@@ -102,50 +99,6 @@ def tokenize_text(text):
             new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos))
     return new_text
 
-class Nearley_NoLex:
-    def __init__(self, lexer_conf, parser_conf):
-        self.tokens_to_convert = {name: '__token_'+name for name, tree, _ in parser_conf.rules if is_terminal(name)}
-        rules = []
-        for name, exp, alias in parser_conf.rules:
-            name = self.tokens_to_convert.get(name, name)
-            exp = [self.tokens_to_convert.get(x, x) for x in exp]
-            rules.append((name, exp, alias))
-
-        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
-
-        rules = [{'name':n,
-                  'symbols': list(self._prepare_expansion(x)),
-                  'postprocess': getattr(parser_conf.callback, a)}
-                 for n,x,a in rules]
-
-        self.parser = nearley.Parser(rules, parser_conf.start)
-
-    def _prepare_expansion(self, expansion):
-        for sym in expansion:
-            if is_terminal(sym):
-                regexp = self.token_by_name[sym].pattern.to_regexp()
-                width = sre_parse.parse(regexp).getwidth()
-                if width != (1,1):
-                    raise GrammarError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width))
-                yield sym, re.compile(regexp)
-            else:
-                yield sym
-
-    def parse(self, text):
-        new_text = tokenize_text(text)
-        res = self.parser.parse(new_text)
-        assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
-        res = res[0]
-
-        class RestoreTokens(Transformer):
-            pass
-
-        for t in self.tokens_to_convert:
-            setattr(RestoreTokens, t, ''.join)
-
-        res = RestoreTokens().transform(res)
-        return res
-
 class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
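
Reviewer note on the lark/common.py hunk (an illustrative sketch, not part of the patch): defining __hash__ and __eq__ on Pattern in terms of (priority, value) is what lets load_grammar.py collapse the separate str_reverse / re_reverse dicts into a single token_reverse dict keyed by Pattern objects. The class skeletons and the priority values below are assumptions made only to keep the example self-contained; the __hash__ / __eq__ bodies are the ones from the patch.

import re

class Pattern(object):
    def __init__(self, value):
        self.value = value
    def __repr__(self):
        return repr(self.value)
    # From the patch: hashing assumes every subclass has a different priority
    def __hash__(self):
        return hash((self.priority, self.value))
    def __eq__(self, other):
        return self.priority == other.priority and self.value == other.value

class PatternStr(Pattern):
    priority = 0   # assumed value, for illustration only
    def to_regexp(self):
        return re.escape(self.value)

class PatternRE(Pattern):
    priority = 1   # assumed value, for illustration only
    def to_regexp(self):
        return self.value

# A literal 'a' and a regexp 'a' stay distinct as dict keys, which is exactly
# what the unified token_reverse mapping in ExtractAnonTokens relies on.
token_reverse = {PatternStr('a'): 'A_STR', PatternRE('a'): 'A_RE'}
assert token_reverse[PatternStr('a')] == 'A_STR'
assert token_reverse[PatternRE('a')] == 'A_RE'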
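
Similarly, _extract_options_for_rule moves onto RuleOptions as the classmethod from_rule without changing the prefix handling: '!' keeps all tokens, '?' sets expand1. A minimal sketch of that behaviour, assuming a simplified __init__ (the real RuleOptions also carries options such as create_token and filter_out):

class RuleOptions:
    # Simplified stand-in for illustration; the real class carries more options.
    def __init__(self, keep_all_tokens=False, expand1=False):
        self.keep_all_tokens = keep_all_tokens
        self.expand1 = expand1

    @classmethod
    def from_rule(cls, name, expansions):
        keep_all_tokens = name.startswith('!')   # '!rule' keeps all tokens
        name = name.lstrip('!')
        expand1 = name.startswith('?')           # '?rule' may be inlined
        name = name.lstrip('?')
        return name, expansions, cls(keep_all_tokens, expand1)

name, exps, opts = RuleOptions.from_rule('?expr', ['atom', 'expr OP atom'])
assert name == 'expr' and opts.expand1 and not opts.keep_all_tokens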