@@ -40,6 +40,9 @@ class Token(Str):
     def __repr__(self):
         return 'Token(%s, %r)' % (self.type, self.value)
 
+    def __deepcopy__(self, memo):
+        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)
+
 class Regex:
     def __init__(self, pattern, flags=()):
         self.pattern = pattern
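Note: `__deepcopy__` gives `copy.deepcopy()` an explicit recipe for duplicating a `Token`, which `Grammar.compile` relies on below when it deep-copies `self.rule_defs`. A standalone sketch of the protocol, using a hypothetical `Symbol` class rather than lark's actual `Token`:

    from copy import deepcopy

    class Symbol(str):
        # a str subclass that carries extra metadata, like Token does
        def __new__(cls, value, line=None):
            inst = super(Symbol, cls).__new__(cls, value)
            inst.line = line
            return inst

        def __deepcopy__(self, memo):
            # deepcopy() prefers this hook over its generic machinery
            return Symbol(str(self), self.line)

    s = deepcopy(Symbol('NAME', line=7))
    assert s == 'NAME' and s.line == 7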
@@ -2,6 +2,7 @@ import os.path
 from itertools import chain
 import re
 from ast import literal_eval
+from copy import deepcopy
 
 from .lexer import Token, UnexpectedInput
@@ -348,13 +349,15 @@ class Grammar:
     def compile(self, lexer=False, start=None):
         if not lexer:
+            rule_defs = deepcopy(self.rule_defs)
+
             # XXX VERY HACKY!! There must be a better way..
             ignore_tokens = [('_'+name, t) for name, t in self.token_defs if name in self.extra['ignore']]
             if ignore_tokens:
                 self.token_defs = [('_'+name if name in self.extra['ignore'] else name,t) for name,t in self.token_defs]
                 ignore_names = [t[0] for t in ignore_tokens]
                 expr = Token('RULE', '__ignore')
-                for r, tree, _o in self.rule_defs:
+                for r, tree, _o in rule_defs:
                     for exp in tree.find_data('expansion'):
                         exp.children = list(interleave(exp.children, expr))
                         if r == start:   # TODO use GrammarRule or similar (RuleOptions?)
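The loop above weaves an optional `__ignore` rule between the symbols of every expansion, so ignored terminals (whitespace, comments) can appear anywhere in scanless mode. A simplified stand-in for `interleave`, illustrative only (lark's real placement rules are more selective):

    def interleave(symbols, sep):
        for sym in symbols:
            yield sym
            yield sep   # placement here is illustrative only

    print(list(interleave(['NAME', '_EQUAL', 'expr'], '__ignore')))
    # ['NAME', '__ignore', '_EQUAL', '__ignore', 'expr', '__ignore']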
@@ -362,14 +365,34 @@ class Grammar:
                 x = [T('expansion', [Token('RULE', x)]) for x in ignore_names]
                 _ignore_tree = T('expr', [T('expansions', x), Token('OP', '?')])
-                self.rule_defs.append(('__ignore', _ignore_tree, None))
+                rule_defs.append(('__ignore', _ignore_tree, None))
+            # End of "ignore" section
 
             for name, tree in self.token_defs:
-                self.rule_defs.append((name, tree, RuleOptions(keep_all_tokens=True)))
+                rule_defs.append((name, tree, RuleOptions(keep_all_tokens=True)))
 
             token_defs = []
+
+            tokens_to_convert = {name: '__token_'+name for name, tree, _ in rule_defs if is_terminal(name)}
+            new_rule_defs = []
+            for name, tree, options in rule_defs:
+                if name in tokens_to_convert:
+                    if name.startswith('_'):
+                        options = RuleOptions.new_from(options, filter_out=True)
+                    else:
+                        options = RuleOptions.new_from(options, join_children=True)
+
+                name = tokens_to_convert.get(name, name)
+
+                for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
+                    for i, sym in enumerate(exp.children):
+                        if sym in tokens_to_convert:
+                            exp.children[i] = Token(sym.type, tokens_to_convert[sym])
+
+                new_rule_defs.append((name, tree, options))
+
+            rule_defs = new_rule_defs
         else:
             token_defs = list(self.token_defs)
+            rule_defs = self.rule_defs
 
         # =================
         # Compile Tokens
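The new block above is the core of the change: in scanless mode every terminal definition becomes an ordinary rule named `__token_<NAME>`, and every reference to it inside other rules is renamed to match. A standalone sketch of the renaming step, where `is_terminal` is a stand-in that treats uppercase names as terminals (lark's own check may differ):

    def is_terminal(name):
        return name.isupper()

    rule_defs = [('start', None, None), ('NUMBER', None, None)]
    tokens_to_convert = {name: '__token_' + name
                         for name, _tree, _opts in rule_defs if is_terminal(name)}
    print(tokens_to_convert)   # {'NUMBER': '__token_NUMBER'}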
@@ -410,7 +433,7 @@ class Grammar:
         rule_tree_to_text = RuleTreeToText()
 
         rules = {}
-        for name, rule_tree, options in self.rule_defs:
+        for name, rule_tree, options in rule_defs:
             assert name not in rules, name
             rule_tree = PrepareLiterals().transform(rule_tree)
             if not lexer:
@@ -431,9 +454,20 @@ class Grammar:
 class RuleOptions:
-    def __init__(self, keep_all_tokens=False, expand1=False):
+    def __init__(self, keep_all_tokens=False, expand1=False, join_children=False, filter_out=False):
         self.keep_all_tokens = keep_all_tokens
         self.expand1 = expand1
+        self.join_children = join_children  # used for scanless postprocessing
+        self.filter_out = filter_out        # remove this rule from the tree
+                                            # used for "token"-rules in scanless
+
+    @classmethod
+    def new_from(cls, options, **kw):
+        return cls(
+            keep_all_tokens=options and options.keep_all_tokens,
+            expand1=options and options.expand1,
+            **kw)
+
 def _extract_options_for_rule(name, expansions):
     keep_all_tokens = name.startswith('!')
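`new_from` copies an existing options object while overriding selected flags, and it has to tolerate `options` being `None` (rules defined without options). The `options and options.keep_all_tokens` idiom short-circuits to a falsy value in that case instead of raising `AttributeError`. A tiny standalone demonstration with a hypothetical `Opts` class:

    class Opts(object):
        def __init__(self, keep_all_tokens=False):
            self.keep_all_tokens = keep_all_tokens

    for options in (None, Opts(keep_all_tokens=True)):
        print(bool(options and options.keep_all_tokens))   # False, then True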
@@ -12,11 +12,19 @@ def create_expand1_tree_builder_function(tree_builder):
             return tree_builder(children)
     return expand1
 
-def create_rule_handler(expansion, usermethod, keep_all_tokens):
+def create_join_children(tree_builder):
+    def join_children(children):
+        children = [''.join(children)]
+        return tree_builder(children)
+    return join_children
+
+def create_rule_handler(expansion, usermethod, keep_all_tokens, filter_out):
     # if not keep_all_tokens:
     to_include = [(i, not is_terminal(sym) and sym.startswith('_'))
                   for i, sym in enumerate(expansion)
-                  if keep_all_tokens or not is_terminal(sym) or not sym.startswith('_')]
+                  if keep_all_tokens
+                     or not ((is_terminal(sym) and sym.startswith('_')) or sym in filter_out)
+                  ]
 
     if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
         def _build_ast(match):
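`create_join_children` is a small wrapper: it concatenates the matched children into a single string before calling the original builder, which is how a converted `__token_*` rule ends up yielding one token-like string instead of a tree of characters. A usage sketch mirroring the function added above:

    def create_join_children(tree_builder):
        def join_children(children):
            children = [''.join(children)]
            return tree_builder(children)
        return join_children

    build = create_join_children(lambda children: children)
    print(build(['1', '2', '3']))   # ['123']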
@@ -49,9 +57,17 @@ class ParseTreeBuilder:
     def create_tree_builder(self, rules, transformer):
         callback = Callback()
         new_rules = []
+
+        filter_out = set()
+        for origin, (expansions, options) in rules.items():
+            if options and options.filter_out:
+                assert origin.startswith('_')   # Just to make sure
+                filter_out.add(origin)
+
         for origin, (expansions, options) in rules.items():
             keep_all_tokens = options.keep_all_tokens if options else False
             expand1 = options.expand1 if options else False
+            join_children = options.join_children if options else False
 
             _origin = origin
@@ -59,8 +75,6 @@ class ParseTreeBuilder:
                 if alias and origin.startswith('_'):
                     raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
 
-                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))
-
                 try:
                     f = transformer._get_func(alias or _origin)
                 except AttributeError:
@@ -71,12 +85,17 @@ class ParseTreeBuilder:
                 if expand1:
                     f = create_expand1_tree_builder_function(f)
 
-                alias_handler = create_rule_handler(expansion, f, keep_all_tokens)
+                if join_children:
+                    f = create_join_children(f)
+
+                alias_handler = create_rule_handler(expansion, f, keep_all_tokens, filter_out)
 
-                if hasattr(callback, _alias):
+                callback_name = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))
+                if hasattr(callback, callback_name):
                     raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin))
-                setattr(callback, _alias, alias_handler)
+                setattr(callback, callback_name, alias_handler)
 
-                new_rules.append(( _origin, expansion, _alias ))
+                new_rules.append(( _origin, expansion, callback_name ))
 
         return new_rules, callback
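The callback name is now computed right where it is used instead of earlier in the loop. For reference, a standalone sketch of the naming scheme, one attribute per rule/expansion pair on a plain namespace object (the sample rule and expansion are made up):

    class Callback(object):
        pass

    callback = Callback()
    origin, expansion = 'sum', ['sum', '_PLUS', 'product']
    callback_name = 'autoalias_%s_%s' % (origin, '_'.join(expansion))
    setattr(callback, callback_name, lambda match: (origin, match))
    print(callback_name)   # autoalias_sum_sum__PLUS_product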
@@ -149,16 +149,9 @@ class Nearley_NoLex:
 
 class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
-        self.tokens_to_convert = {name: '__token_'+name for name, tree, _ in parser_conf.rules if is_terminal(name)}
-        rules = []
-        for name, exp, alias in parser_conf.rules:
-            name = self.tokens_to_convert.get(name, name)
-            exp = [self.tokens_to_convert.get(x, x) for x in exp]
-            rules.append((name, exp, alias))
-
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
 
-        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in rules]
+        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]
 
         self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
@@ -177,16 +170,7 @@ class Earley_NoLex:
         new_text = tokenize_text(text)
         res = self.parser.parse(new_text)
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
-        res = res[0]
-
-        class RestoreTokens(Transformer):
-            pass
-
-        for t in self.tokens_to_convert:
-            setattr(RestoreTokens, t, ''.join)
-
-        res = RestoreTokens().transform(res)
-        return res
+        return res[0]
 
 
 def get_frontend(parser, lexer):
@@ -40,7 +40,7 @@ class Item(object):
     def __repr__(self):
         before = map(str, self.rule.expansion[:self.ptr])
         after = map(str, self.rule.expansion[self.ptr:])
-        return '<(%d) %s : %s * %s>' % (self.start, self.rule.origin, ' '.join(before), ' '.join(after))
+        return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
 
 
 class NewsList(list):
@@ -76,9 +76,20 @@ class Column:
         for item in items:
             if item.is_complete:
-                # if item in added:    # XXX This causes a bug with empty rules
-                #     continue         # And might be unnecessary
-                # added.add(item)
+                # (We must allow repetition of empty rules)
+                if item.rule.expansion:
+
+                    # This is an important test to avoid infinite loops.
+                    # For example, for the rule:
+                    #     a: a | "b"
+                    # If we can detect these cases statically, we can remove
+                    # this test and gain a small optimization.
+                    #
+                    if item in added:
+                        continue
+                    added.add(item)
+
                 self.to_reduce.append(item)
             else:
                 if is_terminal(item.expect):
@@ -23,12 +23,17 @@ class Reconstructor:
         tokens = {t.name:t for t in parser.lexer_conf.tokens}
         token_res = {t.name:re.compile(t.pattern.to_regexp()) for t in parser.lexer_conf.tokens}
 
-        class MatchData:
+        class MatchData(object):
             def __init__(self, data):
                 self.data = data
 
+            def __repr__(self):
+                return '%s(%r)' % (type(self).__name__, self.data)
+
         class MatchTerminal(MatchData):
             def __call__(self, other):
+                if isinstance(other, Tree):
+                    return False
                 return token_res[self.data].match(other) is not None
 
         class MatchTree(MatchData):
@@ -66,8 +71,11 @@ class Reconstructor:
 
             return to_write
 
+        # Recreate the rules to assume a standard lexer
+        _tokens, rules, _grammar_extra = parser.grammar.compile(lexer='standard', start='whatever')
+
         d = defaultdict(list)
-        for name, (expansions, _o) in parser.rules.items():
+        for name, (expansions, _o) in rules.items():
             for expansion, alias in expansions:
                 if alias:
                     d[alias].append(expansion)
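The reconstructor now recompiles the grammar for a standard lexer instead of reusing the scanless rules, then indexes expansions by their callback alias. A standalone sketch of that grouping step (the sample data is made up):

    from collections import defaultdict

    rules = {'sum': ([(['sum', '_PLUS', 'product'], 'autoalias_sum_sum__PLUS_product')], None)}
    d = defaultdict(list)
    for name, (expansions, _options) in rules.items():
        for expansion, alias in expansions:
            if alias:
                d[alias].append(expansion)
    print(dict(d))   # {'autoalias_sum_sum__PLUS_product': [['sum', '_PLUS', 'product']]}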