- "Parses and creates Grammar objects"
-
- import os.path
- from itertools import chain
- import re
- from ast import literal_eval
- from copy import deepcopy
-
- from .lexer import Token, UnexpectedInput
-
- from .parse_tree_builder import ParseTreeBuilder
- from .parser_frontends import LALR
- from .parsers.lalr_parser import UnexpectedToken
- from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
-
- from .tree import Tree as T, Transformer, InlineTransformer, Visitor
-
- __path__ = os.path.dirname(__file__)
- IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
-
- _TOKEN_NAMES = {
- '.' : 'DOT',
- ',' : 'COMMA',
- ':' : 'COLON',
- ';' : 'SEMICOLON',
- '+' : 'PLUS',
- '-' : 'MINUS',
- '*' : 'STAR',
- '/' : 'SLASH',
- '\\' : 'BACKSLASH',
- '|' : 'VBAR',
- '?' : 'QMARK',
- '!' : 'BANG',
- '@' : 'AT',
- '#' : 'HASH',
- '$' : 'DOLLAR',
- '%' : 'PERCENT',
- '^' : 'CIRCUMFLEX',
- '&' : 'AMPERSAND',
- '_' : 'UNDERSCORE',
- '<' : 'LESSTHAN',
- '>' : 'MORETHAN',
- '=' : 'EQUAL',
- '"' : 'DBLQUOTE',
- '\'' : 'QUOTE',
- '`' : 'BACKQUOTE',
- '~' : 'TILDE',
- '(' : 'LPAR',
- ')' : 'RPAR',
- '{' : 'LBRACE',
- '}' : 'RBRACE',
- '[' : 'LSQB',
- ']' : 'RSQB',
- '\n' : 'NEWLINE',
- '\r\n' : 'CRLF',
- '\t' : 'TAB',
- ' ' : 'SPACE',
- }
-
- # Grammar Parser
- TOKENS = {
- '_LPAR': r'\(',
- '_RPAR': r'\)',
- '_LBRA': r'\[',
- '_RBRA': r'\]',
- 'OP': '[+*][?]?|[?](?![a-z])',
- '_COLON': ':',
- '_OR': r'\|',
- '_DOT': r'\.',
- 'RULE': '!?[_?]?[a-z][_a-z0-9]*',
- 'TOKEN': '_?[A-Z][_A-Z0-9]*',
- 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
- 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/i?',
- '_NL': r'(\r?\n)+\s*',
- 'WS': r'[ \t]+',
- 'COMMENT': r'//[^\n]*',
- '_TO': '->',
- '_IGNORE': r'%ignore',
- '_IMPORT': r'%import',
- 'NUMBER': r'\d+',
- }
-
- RULES = {
- 'start': ['_list'],
- '_list': ['_item', '_list _item'],
- '_item': ['rule', 'token', 'statement', '_NL'],
-
- 'rule': ['RULE _COLON expansions _NL',
- 'RULE _DOT NUMBER _COLON expansions _NL'],
- 'expansions': ['alias',
- 'expansions _OR alias',
- 'expansions _NL _OR alias'],
-
- '?alias': ['expansion _TO RULE', 'expansion'],
- 'expansion': ['_expansion'],
-
- '_expansion': ['', '_expansion expr'],
-
- '?expr': ['atom',
- 'atom OP'],
-
- '?atom': ['_LPAR expansions _RPAR',
- 'maybe',
- 'name',
- 'literal',
- 'range'],
-
- '?name': ['RULE', 'TOKEN'],
-
- 'maybe': ['_LBRA expansions _RBRA'],
- 'range': ['STRING _DOT _DOT STRING'],
-
- 'token': ['TOKEN _COLON expansions _NL',
- 'TOKEN _DOT NUMBER _COLON expansions _NL'],
- 'statement': ['ignore', 'import'],
- 'ignore': ['_IGNORE expansions _NL'],
- 'import': ['_IMPORT import_args _NL',
- '_IMPORT import_args _TO TOKEN'],
- 'import_args': ['_import_args'],
- '_import_args': ['name', '_import_args _DOT name'],
-
- 'literal': ['REGEXP', 'STRING'],
- }
-
-
- class EBNF_to_BNF(InlineTransformer):
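- "Converts the EBNF operators ?, + and * into plain BNF, creating numbered helper rules for repetition"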
- def __init__(self):
- self.new_rules = {}
- self.rules_by_expr = {}
- self.prefix = 'anon'
- self.i = 0
- self.rule_options = None
-
- def _add_recurse_rule(self, type_, expr):
- if expr in self.rules_by_expr:
- return self.rules_by_expr[expr]
-
- new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
- self.i += 1
- t = Token('RULE', new_name, -1)
- self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options
- self.rules_by_expr[expr] = t
- return t
-
- def expr(self, rule, op):
- if op.value == '?':
- return T('expansions', [rule, T('expansion', [])])
- elif op.value == '+':
- # a : b c+ d
- # -->
- # a : b _c d
- # _c : _c c | c;
- return self._add_recurse_rule('plus', rule)
- elif op.value == '*':
- # a : b c* d
- # -->
- # a : b _c? d
- # _c : _c c | c;
- new_name = self._add_recurse_rule('star', rule)
- return T('expansions', [new_name, T('expansion', [])])
- assert False, op
-
-
- class SimplifyRule_Visitor(Visitor):
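- "Flattens nested trees and distributes in-rule alternation, e.g. a : b (c|d) e  -->  a : b c e | b d e"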
-
- @staticmethod
- def _flatten(tree):
- while True:
- to_expand = [i for i, child in enumerate(tree.children)
- if isinstance(child, T) and child.data == tree.data]
- if not to_expand:
- break
- tree.expand_kids_by_index(*to_expand)
-
-
- def expansion(self, tree):
- # rules_list unpacking
- # a : b (c|d) e
- # -->
- # a : b c e | b d e
- #
- # In AST terms:
- # expansion(b, expansions(c, d), e)
- # -->
- # expansions( expansion(b, c, e), expansion(b, d, e) )
-
- while True:
- self._flatten(tree)
-
- for i, child in enumerate(tree.children):
- if isinstance(child, T) and child.data == 'expansions':
- tree.data = 'expansions'
- tree.children = [self.visit(T('expansion', [option if i==j else other
- for j, other in enumerate(tree.children)]))
- for option in child.children]
- break
- else:
- break
-
- def alias(self, tree):
- rule, alias_name = tree.children
- if rule.data == 'expansions':
- aliases = []
- for child in tree.children[0].children:
- aliases.append(T('alias', [child, alias_name]))
- tree.data = 'expansions'
- tree.children = aliases
-
- expansions = _flatten
-
- class RuleTreeToText(Transformer):
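- "Converts each simplified rule tree into a list of (symbol names, alias) tuples"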
- def expansions(self, x):
- return x
- def expansion(self, symbols):
- return [sym.value for sym in symbols], None
- def alias(self, x):
- (expansion, _alias), alias = x
- assert _alias is None, (alias, expansion, '-', _alias)
- return expansion, alias.value
-
-
- class CanonizeTree(InlineTransformer):
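- "Normalizes the parsed grammar tree, e.g. rewrites the maybe form [x] as x?"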
- def maybe(self, expr):
- return T('expr', [expr, Token('OP', '?', -1)])
-
- def tokenmods(self, *args):
- if len(args) == 1:
- return list(args)
- tokenmods, value = args
- return tokenmods + [value]
-
- class ExtractAnonTokens(InlineTransformer):
- "Create a unique list of anonymous tokens. Attempt to give meaningful names to them when we add them"
-
- def __init__(self, tokens):
- self.tokens = tokens
- self.token_set = {td.name for td in self.tokens}
- self.token_reverse = {td.pattern: td for td in tokens}
- self.i = 0
-
-
- def pattern(self, p):
- value = p.value
- if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags:
- raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)
-
- if isinstance(p, PatternStr):
- try:
- # If already defined, use the user-defined token name
- token_name = self.token_reverse[p].name
- except KeyError:
- # Try to assign an indicative anon-token name, otherwise use a numbered name
- try:
- token_name = _TOKEN_NAMES[value]
- except KeyError:
- if value.isalnum() and value[0].isalpha() and ('__'+value.upper()) not in self.token_set:
- token_name = '%s%d' % (value.upper(), self.i)
- try:
- # Make sure we don't have unicode in our token names
- token_name.encode('ascii')
- except UnicodeEncodeError:
- token_name = 'ANONSTR_%d' % self.i
- else:
- token_name = 'ANONSTR_%d' % self.i
- self.i += 1
-
- token_name = '__' + token_name
-
- elif isinstance(p, PatternRE):
- if p in self.token_reverse: # Kind of a weird placement
- token_name = self.token_reverse[p].name
- else:
- token_name = 'ANONRE_%d' % self.i
- self.i += 1
- else:
- assert False, p
-
- if token_name not in self.token_set:
- assert p not in self.token_reverse
- self.token_set.add(token_name)
- tokendef = TokenDef(token_name, p)
- self.token_reverse[p] = tokendef
- self.tokens.append(tokendef)
-
- return Token('TOKEN', token_name, -1)
-
-
- def _literal_to_pattern(literal):
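- "Converts a STRING or REGEXP token into a PatternStr or PatternRE, keeping the optional 'i' (ignorecase) flag"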
- v = literal.value
- if v[-1] in 'i':
- flags = v[-1]
- v = v[:-1]
- else:
- flags = None
-
- assert v[0] == v[-1] and v[0] in '"/'
- x = v[1:-1].replace("'", r"\'")
- s = literal_eval("u'''%s'''" % x)
- return { 'STRING': PatternStr,
- 'REGEXP': PatternRE }[literal.type](s, flags)
-
-
- class PrepareLiterals(InlineTransformer):
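- "Replaces literal and range nodes with pattern nodes wrapping PatternStr/PatternRE instances"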
- def literal(self, literal):
- return T('pattern', [_literal_to_pattern(literal)])
-
- def range(self, start, end):
- assert start.type == end.type == 'STRING'
- start = start.value[1:-1]
- end = end.value[1:-1]
- assert len(start) == len(end) == 1
- regexp = '[%s-%s]' % (start, end)
- return T('pattern', [PatternRE(regexp)])
-
- class SplitLiterals(InlineTransformer):
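- "Splits multi-character string patterns into one pattern per character, used when compiling without a lexer"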
- def pattern(self, p):
- if isinstance(p, PatternStr) and len(p.value)>1:
- return T('expansion', [T('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value])
- return T('pattern', [p])
-
- class TokenTreeToPattern(Transformer):
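- "Collapses a token's expression tree into a single Pattern, joining sequences and alternatives into one regexp"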
- def pattern(self, ps):
- p ,= ps
- return p
-
- def expansion(self, items):
- if len(items) == 1:
- return items[0]
- if len({i.flags for i in items}) > 1:
- raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
- return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags)
-
- def expansions(self, exps):
- if len(exps) == 1:
- return exps[0]
- assert all(i.flags is None for i in exps)
- return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))
-
- def expr(self, args):
- inner, op = args
- return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)
-
-
- def _interleave(l, item):
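- "Yields the items of l, inserting item after every terminal or literal (used to weave the __ignore rule into expansions)"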
- for e in l:
- yield e
- if isinstance(e, T):
- if e.data in ('literal', 'range'):
- yield item
- elif is_terminal(e):
- yield item
-
- def _choice_of_rules(rules):
- return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])
-
- def dict_update_safe(d1, d2):
- for k, v in d2.items():
- assert k not in d1
- d1[k] = v
-
-
- class Grammar:
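- "Container for the rule, token and ignore definitions of a grammar; compile() prepares them for the parser"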
- def __init__(self, rule_defs, token_defs, ignore):
- self.token_defs = token_defs
- self.rule_defs = rule_defs
- self.ignore = ignore
-
- def _prepare_scanless_grammar(self, start):
- # XXX Pretty hacky! There should be a better way to write this method..
-
- rule_defs = deepcopy(self.rule_defs)
- term_defs = self.token_defs
-
- # Implement the "%ignore" feature without a lexer..
- terms_to_ignore = {name:'__'+name for name in self.ignore}
- if terms_to_ignore:
- assert set(terms_to_ignore) <= {name for name, _t in term_defs}
- term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
- expr = Token('RULE', '__ignore')
- for r, tree, _o in rule_defs:
- for exp in tree.find_data('expansion'):
- exp.children = list(_interleave(exp.children, expr))
- if r == start:
- exp.children = [expr] + exp.children
- for exp in tree.find_data('expr'):
- exp.children[0] = T('expansion', list(_interleave(exp.children[:1], expr)))
-
- _ignore_tree = T('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')])
- rule_defs.append(('__ignore', _ignore_tree, None))
-
- # Convert all tokens to rules
- new_terminal_names = {name: '__token_'+name for name, _t in term_defs}
-
- for name, tree, options in rule_defs:
- for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
- for i, sym in enumerate(exp.children):
- if sym in new_terminal_names:
- exp.children[i] = Token(sym.type, new_terminal_names[sym])
-
- for name, (tree, priority) in term_defs: # TODO transfer priority to rule?
- if name.startswith('_'):
- options = RuleOptions(filter_out=True, priority=priority)
- else:
- options = RuleOptions(keep_all_tokens=True, create_token=name, priority=priority)
-
- name = new_terminal_names[name]
- inner_name = name + '_inner'
- rule_defs.append((name, _choice_of_rules([inner_name]), None))
- rule_defs.append((inner_name, tree, options))
-
- return [], rule_defs
-
-
- def compile(self, lexer=False, start=None):
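- "Compiles the grammar into (tokens, rules, ignore). Without a lexer, token definitions are first converted into rules (scanless mode)."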
- if not lexer:
- token_defs, rule_defs = self._prepare_scanless_grammar(start)
- else:
- token_defs = list(self.token_defs)
- rule_defs = self.rule_defs
-
- # =================
- # Compile Tokens
- # =================
-
- # Convert token-trees to strings/regexps
- transformer = PrepareLiterals() * TokenTreeToPattern()
- tokens = [TokenDef(name, transformer.transform(token_tree), priority)
- for name, (token_tree, priority) in token_defs]
-
- # =================
- # Compile Rules
- # =================
- ebnf_to_bnf = EBNF_to_BNF()
- simplify_rule = SimplifyRule_Visitor()
-
- transformer = PrepareLiterals()
- if not lexer:
- transformer *= SplitLiterals()
- transformer *= ExtractAnonTokens(tokens) # Adds to tokens
-
- rules = {}
- for name, rule_tree, options in rule_defs:
- assert name not in rules, name
- ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
- tree = transformer.transform(rule_tree)
- rules[name] = ebnf_to_bnf.transform(tree), options
-
- dict_update_safe(rules, ebnf_to_bnf.new_rules)
-
- for tree, _o in rules.values():
- simplify_rule.visit(tree)
-
- rule_tree_to_text = RuleTreeToText()
- rules = {origin: (rule_tree_to_text.transform(tree), options) for origin, (tree, options) in rules.items()}
-
- return tokens, rules, self.ignore
-
-
-
- class RuleOptions:
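- "Per-rule options, partly parsed from the rule name: '!' keeps all tokens, '?' allows expanding the rule when it has a single child"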
- def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None):
- self.keep_all_tokens = keep_all_tokens
- self.expand1 = expand1
- self.create_token = create_token # used for scanless postprocessing
- self.priority = priority
-
- self.filter_out = filter_out # remove this rule from the tree
- # used for "token"-rules in scanless
- @classmethod
- def from_rule(cls, name, *x):
- if len(x) > 1:
- priority, expansions = x
- priority = int(priority)
- else:
- expansions ,= x
- priority = None
-
- keep_all_tokens = name.startswith('!')
- name = name.lstrip('!')
- expand1 = name.startswith('?')
- name = name.lstrip('?')
-
- return name, expansions, cls(keep_all_tokens, expand1, priority=priority)
-
-
-
- _imported_grammars = {}
- def import_grammar(grammar_path):
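- "Loads and caches a grammar referenced by %import, searching the directories in IMPORT_PATHS"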
- if grammar_path not in _imported_grammars:
- for import_path in IMPORT_PATHS:
- with open(os.path.join(import_path, grammar_path)) as f:
- text = f.read()
- grammar = load_grammar(text, grammar_path)
- _imported_grammars[grammar_path] = grammar
-
- return _imported_grammars[grammar_path]
-
-
- def resolve_token_references(token_defs):
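- "Repeatedly inlines references to other tokens inside token definitions, until nothing changes"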
- # TODO Cycles detection
- # TODO Solve with transitive closure (maybe)
-
- token_dict = {k:t for k, (t,_p) in token_defs}
- assert len(token_dict) == len(token_defs), "Same name defined twice?"
-
- while True:
- changed = False
- for name, (token_tree, _p) in token_defs:
- for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')):
- for i, item in enumerate(exp.children):
- if isinstance(item, Token):
- if item.type == 'RULE':
- raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name))
- if item.type == 'TOKEN':
- exp.children[i] = token_dict[item]
- changed = True
- if not changed:
- break
-
-
- class GrammarLoader:
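- "Parses grammar text with the meta-grammar defined by TOKENS and RULES above, and builds a Grammar object from it"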
- def __init__(self):
- tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]
-
- rules = [RuleOptions.from_rule(name, x) for name, x in RULES.items()]
- d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
- rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
- lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
- parser_conf = ParserConf(rules, callback, 'start')
- self.parser = LALR(lexer_conf, parser_conf)
-
- self.canonize_tree = CanonizeTree()
-
- def load_grammar(self, grammar_text, name='<?>'):
- "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
-
- try:
- tree = self.canonize_tree.transform( self.parser.parse(grammar_text+'\n') )
- except UnexpectedInput as e:
- raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name))
- except UnexpectedToken as e:
- if e.expected == ['_COLON']:
- raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
- elif e.expected == ['RULE']:
- raise GrammarError("Missing alias at line %s column %s" % (e.line, e.column))
- elif 'STRING' in e.expected:
- raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
- elif e.expected == ['_OR']:
- raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
- raise
-
- # Extract grammar items
-
- token_defs = [c.children for c in tree.children if c.data=='token']
- rule_defs = [c.children for c in tree.children if c.data=='rule']
- statements = [c.children for c in tree.children if c.data=='statement']
- assert len(token_defs) + len(rule_defs) + len(statements) == len(tree.children)
-
- token_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in token_defs]
-
- token_defs = [(name.value, (t, int(p))) for name, p, t in token_defs]
-
- # Execute statements
- ignore = []
- for (stmt,) in statements:
- if stmt.data == 'ignore':
- t ,= stmt.children
- ignore.append(t)
- elif stmt.data == 'import':
- dotted_path = stmt.children[0].children
- name = stmt.children[1] if len(stmt.children)>1 else dotted_path[-1]
- grammar_path = os.path.join(*dotted_path[:-1]) + '.g'
- g = import_grammar(grammar_path)
- token_options = dict(g.token_defs)[dotted_path[-1]]
- assert isinstance(token_options, tuple) and len(token_options)==2
- token_defs.append([name.value, token_options])
- else:
- assert False, stmt
-
-
- # Verify correctness 1
- for name, _ in token_defs:
- if name.startswith('__'):
- raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
-
- # Handle ignore tokens
- # XXX A slightly hacky solution. %ignore TOKEN is recognized as a separate case because the lexer
- # can't handle duplicate tokens (two names, one value)
- ignore_names = []
- for t in ignore:
- if t.data=='expansions' and len(t.children) == 1:
- t2 ,= t.children
- if t2.data=='expansion' and len(t2.children) == 1:
- item ,= t2.children
- if isinstance(item, Token) and item.type == 'TOKEN':
- ignore_names.append(item.value)
- continue
-
- name = '__IGNORE_%d'% len(ignore_names)
- ignore_names.append(name)
- token_defs.append((name, (t, 0)))
-
-
- # Verify correctness 2
- token_names = set()
- for name, _ in token_defs:
- if name in token_names:
- raise GrammarError("Token '%s' defined more than once" % name)
- token_names.add(name)
-
- # Resolve token references
- resolve_token_references(token_defs)
-
- rules = [RuleOptions.from_rule(*x) for x in rule_defs]
-
- rule_names = set()
- for name, _x, _o in rules:
- if name.startswith('__'):
- raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
- if name in rule_names:
- raise GrammarError("Rule '%s' defined more than once" % name)
- rule_names.add(name)
-
- for name, expansions, _o in rules:
- used_symbols = {t for x in expansions.find_data('expansion')
- for t in x.scan_values(lambda t: t.type in ('RULE', 'TOKEN'))}
- for sym in used_symbols:
- if is_terminal(sym):
- if sym not in token_names:
- raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
- else:
- if sym not in rule_names:
- raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))
-
- # TODO don't include unused tokens, they can only cause trouble!
-
- return Grammar(rules, token_defs, ignore_names)
-
-
-
- load_grammar = GrammarLoader().load_grammar