- import re
- import codecs
-
- from .lexer import Lexer, Token, UnexpectedInput, TokenDef__Str, TokenDef__Regexp
-
- from .parse_tree_builder import ParseTreeBuilder
- from .parser_frontends import LALR
- from .parsers.lalr_parser import UnexpectedToken
- from .common import is_terminal, GrammarError, LexerConf, ParserConf
-
- from .tree import Tree as T, Transformer, InlineTransformer, Visitor
-
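- # Decoder used to interpret \u escape sequences inside token definitions.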
- unicode_escape = codecs.getdecoder('unicode_escape')
-
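- # Human-readable names for punctuation characters, used when naming anonymous string tokens.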
- _TOKEN_NAMES = {
- '.' : 'DOT',
- ',' : 'COMMA',
- ':' : 'COLON',
- ';' : 'SEMICOLON',
- '+' : 'PLUS',
- '-' : 'MINUS',
- '*' : 'STAR',
- '/' : 'SLASH',
- '\\' : 'BACKSLASH',
- '|' : 'VBAR',
- '?' : 'QMARK',
- '!' : 'BANG',
- '@' : 'AT',
- '#' : 'HASH',
- '$' : 'DOLLAR',
- '%' : 'PERCENT',
- '^' : 'CIRCUMFLEX',
- '&' : 'AMPERSAND',
- '_' : 'UNDERSCORE',
- '<' : 'LESSTHAN',
- '>' : 'MORETHAN',
- '=' : 'EQUAL',
- '"' : 'DBLQUOTE',
- '\'' : 'QUOTE',
- '`' : 'BACKQUOTE',
- '~' : 'TILDE',
- '(' : 'LPAR',
- ')' : 'RPAR',
- '{' : 'LBRACE',
- '}' : 'RBRACE',
- '[' : 'LSQB',
- ']' : 'RSQB',
- '\n' : 'NEWLINE',
- '\r\n' : 'CRLF',
- '\t' : 'TAB',
- ' ' : 'SPACE',
- }
-
- # Grammar Parser: the meta-grammar (tokens and rules) used to parse grammar definitions
- TOKENS = {
- '_LPAR': r'\(',
- '_RPAR': r'\)',
- '_LBRA': r'\[',
- '_RBRA': r'\]',
- 'OP': '[+*?](?![a-z])',
- '_COLON': ':',
- '_OR': r'\|',
- '_DOT': r'\.',
- 'RULE': '[_?*]?[a-z][_a-z0-9]*',
- 'TOKEN': '_?[A-Z][_A-Z0-9]*',
- 'STRING': r'".*?[^\\]"',
- 'REGEXP': r"/(?!/).*?[^\\]/",
- '_NL': r'(\r?\n)+\s*',
- 'WS': r'[ \t]+',
- 'COMMENT': r'//[^\n]*\n',
- '_TO': '->'
- }
-
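- # Meta-grammar rules: each rule name maps to a list of space-separated expansions.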
- RULES = {
- 'start': ['list'],
- 'list': ['item', 'list item'],
- 'item': ['rule', 'token', '_NL'],
-
- 'rule': ['RULE _COLON expansions _NL'],
- 'expansions': ['alias',
- 'expansions _OR alias',
- 'expansions _NL _OR alias'],
-
- '?alias': ['expansion _TO RULE', 'expansion'],
- 'expansion': ['_expansion'],
-
- '_expansion': ['', '_expansion expr'],
-
- '?expr': ['atom',
- 'atom OP'],
-
- '?atom': ['_LPAR expansions _RPAR',
- 'maybe',
- 'RULE',
- 'TOKEN',
- 'anontoken'],
-
- 'anontoken': ['tokenvalue'],
-
- 'maybe': ['_LBRA expansions _RBRA'],
-
- 'token': ['TOKEN _COLON tokenvalue _NL',
- 'TOKEN tokenmods _COLON tokenvalue _NL'],
-
- '?tokenvalue': ['REGEXP', 'STRING'],
- 'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
- }
-
-
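- # Converts EBNF operators (?, +, *) into plain BNF, generating helper recursion rules as needed.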
- class EBNF_to_BNF(InlineTransformer):
- def __init__(self):
- self.new_rules = {}
- self.rules_by_expr = {}
- self.prefix = 'anon'
- self.i = 0
-
- def _add_recurse_rule(self, type_, expr):
- if expr in self.rules_by_expr:
- return self.rules_by_expr[expr]
-
- new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
- self.i += 1
- t = Token('RULE', new_name, -1)
- self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
- self.rules_by_expr[expr] = t
- return t
-
- def expr(self, rule, op):
- if op.value == '?':
- return T('expansions', [rule, T('expansion', [])])
- elif op.value == '+':
- # a : b c+ d
- # -->
- # a : b _c d
- # _c : _c c | c;
- return self._add_recurse_rule('plus', rule)
- elif op.value == '*':
- # a : b c* d
- # -->
- # a : b _c? d
- # _c : _c c | c;
- new_name = self._add_recurse_rule('star', rule)
- return T('expansions', [new_name, T('expansion', [])])
- assert False, op
-
-
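- # Normalizes rule trees in-place: flattens nested expansions and distributes in-rule alternatives.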
- class SimplifyRule_Visitor(Visitor):
-
- @staticmethod
- def _flatten(tree):
- while True:
- to_expand = [i for i, child in enumerate(tree.children)
- if isinstance(child, T) and child.data == tree.data]
- if not to_expand:
- break
- tree.expand_kids_by_index(*to_expand)
-
-
- def expansion(self, tree):
- # rules_list unpacking
- # a : b (c|d) e
- # -->
- # a : b c e | b d e
- #
- # In AST terms:
- # expansion(b, expansions(c, d), e)
- # -->
- # expansions( expansion(b, c, e), expansion(b, d, e) )
-
- while True:
- self._flatten(tree)
-
- for i, child in enumerate(tree.children):
- if isinstance(child, T) and child.data == 'expansions':
- tree.data = 'expansions'
- tree.children = [self.visit(T('expansion', [option if i==j else other
- for j, other in enumerate(tree.children)]))
- for option in child.children]
- break
- else:
- break
-
- def alias(self, tree):
- rule, alias_name = tree.children
- if rule.data == 'expansions':
- aliases = []
- for child in tree.children[0].children:
- aliases.append(T('alias', [child, alias_name]))
- tree.data = 'expansions'
- tree.children = aliases
-
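- # 'expansions' nodes only need flattening; alternative distribution applies only to 'expansion'.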
- expansions = _flatten
-
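- # Merges d2 into d1, asserting that no key is defined in both.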
- def dict_update_safe(d1, d2):
- for k, v in d2.items():
- assert k not in d1
- d1[k] = v
-
-
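- # Converts a simplified rule tree into a list of (expansion, alias) pairs of plain symbol names.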
- class RuleTreeToText(Transformer):
- def expansions(self, x):
- return x
- def expansion(self, symbols):
- return [sym.value for sym in symbols], None
- def alias(self, x):
- (expansion, _alias), alias = x
- assert _alias is None, (alias, expansion, '-', _alias)
- return expansion, alias.value
-
-
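- # Post-processes the raw grammar parse tree: rewrites [x] as x? and collects token modifiers into a list.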
- class SimplifyTree(InlineTransformer):
- def maybe(self, expr):
- return T('expr', [expr, Token('OP', '?', -1)])
-
- def tokenmods(self, *args):
- if len(args) == 1:
- return list(args)
- tokenmods, value = args
- return tokenmods + [value]
-
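- # Yields (name, value, flags) for every token definition found in the grammar tree.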
- def get_tokens(tree, token_set):
- for t in tree.find_data('token'):
- x = t.children
- name = x[0].value
- assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
- if name in token_set:
- raise ValueError("Token '%s' defined more than once" % name)
- token_set.add(name)
-
- if len(x) == 2:
- yield name, x[1], []
- else:
- assert len(x) == 3
- yield name, x[2], x[1]
-
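- # Replaces anonymous tokens (inline strings and regexps) with named token definitions, reusing existing names where possible.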
- class ExtractAnonTokens(InlineTransformer):
- def __init__(self, tokens, token_set):
- self.tokens = tokens
- self.token_set = token_set
- self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
- self.i = 0
-
- def anontoken(self, token):
- if token.type == 'STRING':
- value = token.value[1:-1]
- try:
- # If already defined, use the user-defined token name
- token_name = self.token_reverse[value]
- except KeyError:
- # Try to give the anonymous token a meaningful name; otherwise fall back to a numbered one
- try:
- token_name = _TOKEN_NAMES[value]
- except KeyError:
- if value.isalnum() and value[0].isalpha() and ('__'+value.upper()) not in self.token_set:
- token_name = value.upper() # Note: distinct token values may still map to the same upper-cased name
- else:
- token_name = 'ANONSTR_%d' % self.i
- self.i += 1
- token_name = '__' + token_name
-
- elif token.type == 'REGEXP':
- token_name = 'ANONRE_%d' % self.i
- value = token.value
- self.i += 1
- else:
- assert False, token
-
- if token_name not in self.token_set:
- self.token_set.add(token_name)
- self.tokens.append((token_name, token, []))
- assert value not in self.token_reverse
- self.token_reverse[value] = token_name
-
- return Token('TOKEN', token_name, -1)
-
-
-
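- # Parses grammar definitions with an LALR parser built from the meta-grammar above,
- # then turns the resulting tree into token definitions and BNF rules.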
- class GrammarLoader:
- def __init__(self):
- tokens = [TokenDef__Regexp(name, value) for name, value in TOKENS.items()]
-
- d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
- rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
- lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'], None)
- parser_conf = ParserConf(rules, callback, 'start')
- self.parser = LALR(lexer_conf, parser_conf)
-
- self.simplify_tree = SimplifyTree()
- self.simplify_rule = SimplifyRule_Visitor()
- self.rule_tree_to_text = RuleTreeToText()
-
- def load_grammar(self, grammar_text):
- try:
- tree = self.simplify_tree.transform( self.parser.parse(grammar_text+'\n') )
- except UnexpectedInput as e:
- raise GrammarError("Unexpected input %r at line %d column %d" % (e.context, e.line, e.column))
- except UnexpectedToken as e:
- if '_COLON' in e.expected:
- raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
- elif 'tokenvalue' in e.expected:
- raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
- raise
-
- # =================
- # Process Tokens
- # =================
-
- token_set = set()
- tokens = list(get_tokens(tree, token_set))
- extract_anon = ExtractAnonTokens(tokens, token_set)
- tree = extract_anon.transform(tree) # Adds to tokens
-
- token_ref = {}
- tokendefs = []
- for name, token, flags in tokens:
- value = token.value[1:-1]
- if r'\u' in value:
- # XXX for now, unicode escapes and literal unicode characters can't be mixed in the same token
- value = unicode_escape(value)[0]
-
- if token.type == 'REGEXP':
- sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
- if sp:
- value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
- for x in sp)
-
- token_ref[name] = value
- tokendef = TokenDef__Regexp(name, value)
- else:
- assert token.type == 'STRING'
- tokendef = TokenDef__Str(name, value)
-
- tokendefs.append((tokendef, flags))
-
- # =================
- # Process Rules
- # =================
-
- ebnf_to_bnf = EBNF_to_BNF()
-
- rules = {}
- for rule in tree.find_data('rule'):
- name, ebnf_tree = rule.children
- name = name.value
- if name in rules:
- raise ValueError("Rule '%s' defined more than once" % name)
-
- rules[name] = ebnf_to_bnf.transform(ebnf_tree)
-
- dict_update_safe(rules, ebnf_to_bnf.new_rules)
-
- for r in rules.values():
- self.simplify_rule.visit(r)
-
- rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}
-
- # ====================
- # Verify correctness
- # ====================
- used_symbols = {symbol for expansions in rules.values()
- for expansion, _alias in expansions
- for symbol in expansion}
- rule_set = {r.lstrip('?') for r in rules}
- for sym in used_symbols:
- if is_terminal(sym):
- if sym not in token_set:
- raise GrammarError("Token '%s' used but not defined" % sym)
- else:
- if sym not in rule_set:
- raise GrammarError("Rule '%s' used but not defined" % sym)
-
- return tokendefs, rules
-
- load_grammar = GrammarLoader().load_grammar
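- 
- # Illustrative usage (the sample grammar below is made up; any grammar written in the
- # syntax defined by the meta-grammar above works):
- #
- #     tokendefs, rules = load_grammar(r'''
- #         start: NUMBER ("," NUMBER)*
- #         NUMBER: /[0-9]+/
- #     ''')
- #
- # 'tokendefs' is a list of (TokenDef, flags) pairs, and 'rules' maps each rule name
- # to a list of (expansion, alias) pairs.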